From 97da92c0ff92f33a7c33533e5fdd3e870f01cc6a Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:51 +0200 Subject: KVM: s390: selftests: Use TAP interface in the memop test The memop test currently does not have any output (unless one of the TEST_ASSERT statement fails), so it's hard to say for a user whether a certain new sub-test has been included in the binary or not. Let's make this a little bit more user-friendly and include some TAP output via the kselftests.h interface. Reviewed-by: Janosch Frank Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-2-thuth@redhat.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/memop.c | 95 +++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 49f26f544127..e704c6fa5758 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -14,6 +14,7 @@ #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" enum mop_target { LOGICAL, @@ -691,34 +692,92 @@ static void test_errors(void) kvm_vm_free(t.kvm_vm); } +struct testdef { + const char *name; + void (*test)(void); + int extension; +} testlist[] = { + { + .name = "simple copy", + .test = test_copy, + }, + { + .name = "generic error checks", + .test = test_errors, + }, + { + .name = "copy with storage keys", + .test = test_copy_key, + .extension = 1, + }, + { + .name = "copy with key storage protection override", + .test = test_copy_key_storage_prot_override, + .extension = 1, + }, + { + .name = "copy with key fetch protection", + .test = test_copy_key_fetch_prot, + .extension = 1, + }, + { + .name = "copy with key fetch protection override", + .test = test_copy_key_fetch_prot_override, + .extension = 1, + }, + { + .name = "error checks with key", + .test = test_errors_key, + .extension = 1, + }, + { + .name = "termination", + .test = test_termination, + .extension = 1, + }, + { + .name = "error checks with key storage protection override", + .test = test_errors_key_storage_prot_override, + .extension = 1, + }, + { + .name = "error checks without key fetch prot override", + .test = test_errors_key_fetch_prot_override_not_enabled, + .extension = 1, + }, + { + .name = "error checks with key fetch prot override", + .test = test_errors_key_fetch_prot_override_enabled, + .extension = 1, + }, +}; + int main(int argc, char *argv[]) { - int memop_cap, extension_cap; + int memop_cap, extension_cap, idx; setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ + ksft_print_header(); + memop_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP); extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); if (!memop_cap) { - print_skip("CAP_S390_MEM_OP not supported"); - exit(KSFT_SKIP); + ksft_exit_skip("CAP_S390_MEM_OP not supported.\n"); } - test_copy(); - if (extension_cap > 0) { - test_copy_key(); - test_copy_key_storage_prot_override(); - test_copy_key_fetch_prot(); - test_copy_key_fetch_prot_override(); - test_errors_key(); - test_termination(); - test_errors_key_storage_prot_override(); - test_errors_key_fetch_prot_override_not_enabled(); - test_errors_key_fetch_prot_override_enabled(); - } else { - print_skip("storage key memop extension not supported"); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (testlist[idx].extension >= extension_cap) { + testlist[idx].test(); + 
ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - extension level %d not supported\n", + testlist[idx].name, + testlist[idx].extension); + } } - test_errors(); - return 0; + ksft_finished(); /* Print results and exit() accordingly */ } -- cgit v1.2.3 From 17e48d8a1ef0ab070b11e7368e14ac45c335de57 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:52 +0200 Subject: KVM: s390: selftests: Use TAP interface in the sync_regs test The sync_regs test currently does not have any output (unless one of the TEST_ASSERT statement fails), so it's hard to say for a user whether a certain new sub-test has been included in the binary or not. Let's make this a little bit more user-friendly and include some TAP output via the kselftests.h interface. To be able to distinguish the different sub-tests more easily, we also break up the huge main() function here in more fine grained parts. Acked-by: Janosch Frank Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-3-thuth@redhat.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/sync_regs_test.c | 87 ++++++++++++++++------ 1 file changed, 66 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index caf7b8859a94..9510739e226d 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -21,6 +21,7 @@ #include "test_util.h" #include "kvm_util.h" #include "diag318_test_handler.h" +#include "kselftest.h" #define VCPU_ID 5 @@ -74,27 +75,9 @@ static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) #define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS|KVM_SYNC_DIAG318) #define INVALID_SYNC_FIELD 0x80000000 -int main(int argc, char *argv[]) +void test_read_invalid(struct kvm_vm *vm, struct kvm_run *run) { - struct kvm_vm *vm; - struct kvm_run *run; - struct kvm_regs regs; - struct kvm_sregs sregs; - int rv, cap; - - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - - cap = kvm_check_cap(KVM_CAP_SYNC_REGS); - if (!cap) { - print_skip("CAP_SYNC_REGS not supported"); - exit(KSFT_SKIP); - } - - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - run = vcpu_state(vm, VCPU_ID); + int rv; /* Request reading invalid register set from VCPU. */ run->kvm_valid_regs = INVALID_SYNC_FIELD; @@ -110,6 +93,11 @@ int main(int argc, char *argv[]) "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; +} + +void test_set_invalid(struct kvm_vm *vm, struct kvm_run *run) +{ + int rv; /* Request setting invalid register set into VCPU. */ run->kvm_dirty_regs = INVALID_SYNC_FIELD; @@ -125,6 +113,13 @@ int main(int argc, char *argv[]) "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; +} + +void test_req_and_verify_all_valid_regs(struct kvm_vm *vm, struct kvm_run *run) +{ + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; /* Request and verify all valid register sets. 
*/ run->kvm_valid_regs = TEST_SYNC_FIELDS; @@ -146,6 +141,13 @@ int main(int argc, char *argv[]) vcpu_sregs_get(vm, VCPU_ID, &sregs); compare_sregs(&sregs, &run->s.regs); +} + +void test_set_and_verify_various_reg_values(struct kvm_vm *vm, struct kvm_run *run) +{ + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; /* Set and verify various register values */ run->s.regs.gprs[11] = 0xBAD1DEA; @@ -180,6 +182,11 @@ int main(int argc, char *argv[]) vcpu_sregs_get(vm, VCPU_ID, &sregs); compare_sregs(&sregs, &run->s.regs); +} + +void test_clear_kvm_dirty_regs_bits(struct kvm_vm *vm, struct kvm_run *run) +{ + int rv; /* Clear kvm_dirty_regs bits, verify new s.regs values are * overwritten with existing guest values. @@ -200,8 +207,46 @@ int main(int argc, char *argv[]) TEST_ASSERT(run->s.regs.diag318 != 0x4B1D, "diag318 sync regs value incorrect 0x%llx.", run->s.regs.diag318); +} + +struct testdef { + const char *name; + void (*test)(struct kvm_vm *vm, struct kvm_run *run); +} testlist[] = { + { "read invalid", test_read_invalid }, + { "set invalid", test_set_invalid }, + { "request+verify all valid regs", test_req_and_verify_all_valid_regs }, + { "set+verify various regs", test_set_and_verify_various_reg_values }, + { "clear kvm_dirty_regs bits", test_clear_kvm_dirty_regs_bits }, +}; + +int main(int argc, char *argv[]) +{ + static struct kvm_run *run; + static struct kvm_vm *vm; + int idx; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + ksft_print_header(); + + if (!kvm_check_cap(KVM_CAP_SYNC_REGS)) + ksft_exit_skip("CAP_SYNC_REGS not supported"); + + ksft_set_plan(ARRAY_SIZE(testlist)); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + run = vcpu_state(vm, VCPU_ID); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(vm, run); + ksft_test_result_pass("%s\n", testlist[idx].name); + } kvm_vm_free(vm); - return 0; + ksft_finished(); /* Print results and exit() accordingly */ } -- cgit v1.2.3 From 0c073227df5055714a545cbe536e3bd9ea39c74b Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:53 +0200 Subject: KVM: s390: selftests: Use TAP interface in the tprot test The tprot test currently does not have any output (unless one of the TEST_ASSERT statement fails), so it's hard to say for a user whether a certain new sub-test has been included in the binary or not. Let's make this a little bit more user-friendly and include some TAP output via the kselftests.h interface. 
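With the conversion applied, the test reports one TAP line per stage. On a system where the page at guest address 0 can be allocated, a passing run produces output roughly like this (a sketch; the exact wording follows from the HOST_SYNC macro and the stage enum below):

  TAP version 13
  1..5
  ok 1 STAGE_INIT_SIMPLE
  ok 2 TEST_SIMPLE
  ok 3 STAGE_INIT_FETCH_PROT_OVERRIDE
  ok 4 TEST_FETCH_PROT_OVERRIDE
  ok 5 TEST_STORAGE_PROT_OVERRIDE
  # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0
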
Reviewed-by: Janosch Frank Reviewed-by: Janis Schoetterl-Glausch Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-4-thuth@redhat.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/tprot.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index c097b9db495e..14d74a9e7b3d 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -8,6 +8,7 @@ #include #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" #define PAGE_SHIFT 12 #define PAGE_SIZE (1 << PAGE_SHIFT) @@ -63,12 +64,12 @@ static enum permission test_protection(void *addr, uint8_t key) } enum stage { - STAGE_END, STAGE_INIT_SIMPLE, TEST_SIMPLE, STAGE_INIT_FETCH_PROT_OVERRIDE, TEST_FETCH_PROT_OVERRIDE, TEST_STORAGE_PROT_OVERRIDE, + STAGE_END /* must be the last entry (it's the amount of tests) */ }; struct test { @@ -182,7 +183,7 @@ static void guest_code(void) GUEST_SYNC(perform_next_stage(&i, mapped_0)); } -#define HOST_SYNC(vmp, stage) \ +#define HOST_SYNC_NO_TAP(vmp, stage) \ ({ \ struct kvm_vm *__vm = (vmp); \ struct ucall uc; \ @@ -198,12 +199,21 @@ static void guest_code(void) ASSERT_EQ(uc.args[1], __stage); \ }) +#define HOST_SYNC(vmp, stage) \ +({ \ + HOST_SYNC_NO_TAP(vmp, stage); \ + ksft_test_result_pass("" #stage "\n"); \ +}) + int main(int argc, char *argv[]) { struct kvm_vm *vm; struct kvm_run *run; vm_vaddr_t guest_0_page; + ksft_print_header(); + ksft_set_plan(STAGE_END); + vm = vm_create_default(VCPU_ID, 0, guest_code); run = vcpu_state(vm, VCPU_ID); @@ -212,9 +222,14 @@ int main(int argc, char *argv[]) HOST_SYNC(vm, TEST_SIMPLE); guest_0_page = vm_vaddr_alloc(vm, PAGE_SIZE, 0); - if (guest_0_page != 0) - print_skip("Did not allocate page at 0 for fetch protection override tests"); - HOST_SYNC(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + if (guest_0_page != 0) { + /* Use NO_TAP so we don't get a PASS print */ + HOST_SYNC_NO_TAP(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + ksft_test_result_skip("STAGE_INIT_FETCH_PROT_OVERRIDE - " + "Did not allocate page at 0\n"); + } else { + HOST_SYNC(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + } if (guest_0_page == 0) mprotect(addr_gva2hva(vm, (vm_vaddr_t)0), PAGE_SIZE, PROT_READ); run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; @@ -224,4 +239,8 @@ int main(int argc, char *argv[]) run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; run->kvm_dirty_regs = KVM_SYNC_CRS; HOST_SYNC(vm, TEST_STORAGE_PROT_OVERRIDE); + + kvm_vm_free(vm); + + ksft_finished(); /* Print results and exit() accordingly */ } -- cgit v1.2.3 From b1edf7f159a6d532757b004a70f31a6425d5043f Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:54 +0200 Subject: KVM: s390: selftests: Use TAP interface in the reset test Let's standardize the s390x KVM selftest output to the TAP output generated via the kselftests.h interface. 
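Converting to TAP also makes skips visible: when KVM_CAP_S390_VCPU_RESETS is unavailable, the two dependent sub-tests are now reported individually instead of being silently omitted. A sketch of the expected output in that case (exact formatting comes from kselftest.h):

  TAP version 13
  1..3
  # Testing initial reset
  ok 1 initial
  ok 2 # SKIP normal - no VCPU_RESETS capability
  ok 3 # SKIP clear - no VCPU_RESETS capability
  # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:2 error:0
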
Reviewed-by: Janosch Frank Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-5-thuth@redhat.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/resets.c | 38 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index b143db6d8693..889449a22e7a 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -12,6 +12,7 @@ #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" #define VCPU_ID 3 #define LOCAL_IRQS 32 @@ -202,7 +203,7 @@ static void inject_irq(int cpu_id) static void test_normal(void) { - pr_info("Testing normal reset\n"); + ksft_print_msg("Testing normal reset\n"); /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); @@ -225,7 +226,7 @@ static void test_normal(void) static void test_initial(void) { - pr_info("Testing initial reset\n"); + ksft_print_msg("Testing initial reset\n"); vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); sync_regs = &run->s.regs; @@ -247,7 +248,7 @@ static void test_initial(void) static void test_clear(void) { - pr_info("Testing clear reset\n"); + ksft_print_msg("Testing clear reset\n"); vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); sync_regs = &run->s.regs; @@ -266,14 +267,35 @@ static void test_clear(void) kvm_vm_free(vm); } +struct testdef { + const char *name; + void (*test)(void); + bool needs_cap; +} testlist[] = { + { "initial", test_initial, false }, + { "normal", test_normal, true }, + { "clear", test_clear, true }, +}; + int main(int argc, char *argv[]) { + bool has_s390_vcpu_resets = kvm_check_cap(KVM_CAP_S390_VCPU_RESETS); + int idx; + setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ - test_initial(); - if (kvm_check_cap(KVM_CAP_S390_VCPU_RESETS)) { - test_normal(); - test_clear(); + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (!testlist[idx].needs_cap || has_s390_vcpu_resets) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - no VCPU_RESETS capability\n", + testlist[idx].name); + } } - return 0; + + ksft_finished(); /* Print results and exit() accordingly */ } -- cgit v1.2.3 From d18616e7aa940414312d63de918ab2204d82a20e Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:17 +0000 Subject: libbpf: Introduce libbpf_bpf_prog_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces a new function, libbpf_bpf_prog_type_str, to the public libbpf API. The function allows users to get a string representation for a bpf_prog_type variant. 
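A minimal usage sketch (illustrative only, not part of this patch; link against libbpf):

  #include <stdio.h>
  #include <bpf/libbpf.h>

  int main(void)
  {
          /* Known variants map to static strings ("sched_cls" here);
           * out-of-range values yield NULL, so callers must check.
           */
          const char *s = libbpf_bpf_prog_type_str(BPF_PROG_TYPE_SCHED_CLS);

          printf("%s\n", s ? s : "(unknown)");
          return 0;
  }
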
Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-2-deso@posteo.net --- tools/lib/bpf/libbpf.c | 43 +++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 9 +++++++++ tools/lib/bpf/libbpf.map | 3 +++ 3 files changed, 55 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e89cc9c885b3..01421aff0210 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -72,6 +72,41 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); +static const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + [BPF_PROG_TYPE_SCHED_CLS] = "sched_cls", + [BPF_PROG_TYPE_SCHED_ACT] = "sched_act", + [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint", + [BPF_PROG_TYPE_XDP] = "xdp", + [BPF_PROG_TYPE_PERF_EVENT] = "perf_event", + [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup_skb", + [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup_sock", + [BPF_PROG_TYPE_LWT_IN] = "lwt_in", + [BPF_PROG_TYPE_LWT_OUT] = "lwt_out", + [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", + [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", + [BPF_PROG_TYPE_SK_SKB] = "sk_skb", + [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", + [BPF_PROG_TYPE_SK_MSG] = "sk_msg", + [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", + [BPF_PROG_TYPE_LWT_SEG6LOCAL] = "lwt_seg6local", + [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", + [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", + [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", + [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", + [BPF_PROG_TYPE_TRACING] = "tracing", + [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_PROG_TYPE_EXT] = "ext", + [BPF_PROG_TYPE_LSM] = "lsm", + [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SYSCALL] = "syscall", +}; + static int __base_pr(enum libbpf_print_level level, const char *format, va_list args) { @@ -9300,6 +9335,14 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, return libbpf_err(-ESRCH); } +const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(prog_type_name)) + return NULL; + + return prog_type_name[t]; +} + static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, size_t offset) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 9e9a3fd3edd8..11b5f8ce8031 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -51,6 +51,15 @@ enum libbpf_errno { LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); +/** + * @brief **libbpf_bpf_prog_type_str()** converts the provided program type + * value into a textual representation. + * @param t The program type. + * @return Pointer to a static string identifying the program type. NULL is + * returned for unknown **bpf_prog_type** values. 
+ */ +LIBBPF_API const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t); + enum libbpf_print_level { LIBBPF_WARN, LIBBPF_INFO, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 52973cffc20c..b12d290f8e4c 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -461,5 +461,8 @@ LIBBPF_0.8.0 { } LIBBPF_0.7.0; LIBBPF_1.0.0 { + global: + libbpf_bpf_prog_type_str; + local: *; }; -- cgit v1.2.3 From 8c5d71d96379e80c7c0d0fa7186c04f4deb04f16 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:18 +0000 Subject: selftests/bpf: Add test for libbpf_bpf_prog_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a test for libbpf_bpf_prog_type_str. The test retrieves all variants of the bpf_prog_type enumeration using BTF and makes sure that the function under test works as expected for them. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-3-deso@posteo.net --- .../testing/selftests/bpf/prog_tests/libbpf_str.c | 57 ++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/libbpf_str.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c new file mode 100644 index 000000000000..42696aaebf3e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include + +/* + * Utility function uppercasing an entire string. + */ +static void uppercase(char *s) +{ + for (; *s != '\0'; s++) + *s = toupper(*s); +} + +/* + * Test case to check that all bpf_prog_type variants are covered by + * libbpf_bpf_prog_type_str. + */ +void test_libbpf_bpf_prog_type_str(void) +{ + struct btf *btf; + const struct btf_type *t; + const struct btf_enum *e; + int i, n, id; + + btf = btf__parse("/sys/kernel/btf/vmlinux", NULL); + if (!ASSERT_OK_PTR(btf, "btf_parse")) + return; + + /* find enum bpf_prog_type and enumerate each value */ + id = btf__find_by_name_kind(btf, "bpf_prog_type", BTF_KIND_ENUM); + if (!ASSERT_GT(id, 0, "bpf_prog_type_id")) + goto cleanup; + t = btf__type_by_id(btf, id); + e = btf_enum(t); + n = btf_vlen(t); + for (i = 0; i < n; e++, i++) { + enum bpf_prog_type prog_type = (enum bpf_prog_type)e->val; + const char *prog_type_name; + const char *prog_type_str; + char buf[256]; + + prog_type_name = btf__str_by_offset(btf, e->name_off); + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + ASSERT_OK_PTR(prog_type_str, prog_type_name); + + snprintf(buf, sizeof(buf), "BPF_PROG_TYPE_%s", prog_type_str); + uppercase(buf); + + ASSERT_STREQ(buf, prog_type_name, "exp_str_value"); + } + +cleanup: + btf__free(btf); +} -- cgit v1.2.3 From b700eeb406a6c1f4d955242e06151f11f13d3e29 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:19 +0000 Subject: bpftool: Use libbpf_bpf_prog_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change switches bpftool over to using the recently introduced libbpf_bpf_prog_type_str function instead of maintaining its own string representation for the bpf_prog_type enum. 
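Because bpftool no longer owns a local name array it can index, the feature-probing code enumerates program types by incrementing the type value until libbpf reports an unknown variant. The idiom, as a standalone sketch (it assumes only that libbpf's name table has no holes):

  #include <stdio.h>
  #include <bpf/libbpf.h>

  int main(void)
  {
          const char *s;
          int t;

          /* libbpf_bpf_prog_type_str() returns NULL for the first
           * value past the range known to libbpf, ending the walk.
           */
          for (t = BPF_PROG_TYPE_UNSPEC + 1;
               (s = libbpf_bpf_prog_type_str(t)) != NULL; t++)
                  printf("%s\n", s);
          return 0;
  }
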
Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220523230428.3077108-4-deso@posteo.net --- tools/bpf/bpftool/feature.c | 57 ++++++++++++++-------- tools/bpf/bpftool/link.c | 19 +++++--- tools/bpf/bpftool/main.h | 3 -- tools/bpf/bpftool/map.c | 13 +++-- tools/bpf/bpftool/prog.c | 51 ++++--------------- .../selftests/bpf/test_bpftool_synctypes.py | 14 +----- 6 files changed, 65 insertions(+), 92 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index d12f46051aac..02753f934ed3 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -548,8 +548,8 @@ static bool probe_prog_type_ifindex(enum bpf_prog_type prog_type, __u32 ifindex) } static void -probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, - const char *define_prefix, __u32 ifindex) +probe_prog_type(enum bpf_prog_type prog_type, const char *prog_type_str, + bool *supported_types, const char *define_prefix, __u32 ifindex) { char feat_name[128], plain_desc[128], define_name[128]; const char *plain_comment = "eBPF program_type "; @@ -580,20 +580,16 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, supported_types[prog_type] |= res; - if (!prog_type_name[prog_type]) { - p_info("program type name not found (type %d)", prog_type); - return; - } maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; - if (strlen(prog_type_name[prog_type]) > maxlen) { + if (strlen(prog_type_str) > maxlen) { p_info("program type name too long"); return; } - sprintf(feat_name, "have_%s_prog_type", prog_type_name[prog_type]); - sprintf(define_name, "%s_prog_type", prog_type_name[prog_type]); + sprintf(feat_name, "have_%s_prog_type", prog_type_str); + sprintf(define_name, "%s_prog_type", prog_type_str); uppercase(define_name, sizeof(define_name)); - sprintf(plain_desc, "%s%s", plain_comment, prog_type_name[prog_type]); + sprintf(plain_desc, "%s%s", plain_comment, prog_type_str); print_bool_feature(feat_name, plain_desc, define_name, res, define_prefix); } @@ -728,10 +724,10 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, } static void -probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, +probe_helpers_for_progtype(enum bpf_prog_type prog_type, + const char *prog_type_str, bool supported_type, const char *define_prefix, __u32 ifindex) { - const char *ptype_name = prog_type_name[prog_type]; char feat_name[128]; unsigned int id; bool probe_res = false; @@ -747,12 +743,12 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, } if (json_output) { - sprintf(feat_name, "%s_available_helpers", ptype_name); + sprintf(feat_name, "%s_available_helpers", prog_type_str); jsonw_name(json_wtr, feat_name); jsonw_start_array(json_wtr); } else if (!define_prefix) { printf("eBPF helpers supported for program type %s:", - ptype_name); + prog_type_str); } for (id = 1; id < ARRAY_SIZE(helper_name); id++) { @@ -768,7 +764,7 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, /* fallthrough */ default: probe_res |= probe_helper_for_progtype(prog_type, supported_type, - define_prefix, id, ptype_name, + define_prefix, id, prog_type_str, ifindex); } } @@ -943,15 +939,24 @@ static void section_program_types(bool *supported_types, const char *define_prefix, __u32 ifindex) { - unsigned int i; + unsigned int prog_type = BPF_PROG_TYPE_UNSPEC; + const char *prog_type_str; 
print_start_section("program_types", "Scanning eBPF program types...", "/*** eBPF program types ***/", define_prefix); - for (i = BPF_PROG_TYPE_UNSPEC + 1; i < prog_type_name_size; i++) - probe_prog_type(i, supported_types, define_prefix, ifindex); + while (true) { + prog_type++; + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + /* libbpf will return NULL for variants unknown to it. */ + if (!prog_type_str) + break; + + probe_prog_type(prog_type, prog_type_str, supported_types, define_prefix, + ifindex); + } print_end_section(); } @@ -974,7 +979,8 @@ static void section_map_types(const char *define_prefix, __u32 ifindex) static void section_helpers(bool *supported_types, const char *define_prefix, __u32 ifindex) { - unsigned int i; + unsigned int prog_type = BPF_PROG_TYPE_UNSPEC; + const char *prog_type_str; print_start_section("helpers", "Scanning eBPF helper functions...", @@ -996,9 +1002,18 @@ section_helpers(bool *supported_types, const char *define_prefix, __u32 ifindex) " %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n", define_prefix, define_prefix, define_prefix, define_prefix); - for (i = BPF_PROG_TYPE_UNSPEC + 1; i < prog_type_name_size; i++) - probe_helpers_for_progtype(i, supported_types[i], define_prefix, + while (true) { + prog_type++; + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + /* libbpf will return NULL for variants unknown to it. */ + if (!prog_type_str) + break; + + probe_helpers_for_progtype(prog_type, prog_type_str, + supported_types[prog_type], + define_prefix, ifindex); + } print_end_section(); } diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 6353a789322b..e27108489604 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -121,6 +121,7 @@ static int get_prog_info(int prog_id, struct bpf_prog_info *info) static int show_link_close_json(int fd, struct bpf_link_info *info) { struct bpf_prog_info prog_info; + const char *prog_type_str; int err; jsonw_start_object(json_wtr); @@ -137,12 +138,12 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) if (err) return err; - if (prog_info.type < prog_type_name_size) - jsonw_string_field(json_wtr, "prog_type", - prog_type_name[prog_info.type]); + prog_type_str = libbpf_bpf_prog_type_str(prog_info.type); + /* libbpf will return NULL for variants unknown to it. */ + if (prog_type_str) + jsonw_string_field(json_wtr, "prog_type", prog_type_str); else - jsonw_uint_field(json_wtr, "prog_type", - prog_info.type); + jsonw_uint_field(json_wtr, "prog_type", prog_info.type); show_link_attach_type_json(info->tracing.attach_type, json_wtr); @@ -214,6 +215,7 @@ static void show_iter_plain(struct bpf_link_info *info) static int show_link_close_plain(int fd, struct bpf_link_info *info) { struct bpf_prog_info prog_info; + const char *prog_type_str; int err; show_link_header_plain(info); @@ -228,9 +230,10 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) if (err) return err; - if (prog_info.type < prog_type_name_size) - printf("\n\tprog_type %s ", - prog_type_name[prog_info.type]); + prog_type_str = libbpf_bpf_prog_type_str(prog_info.type); + /* libbpf will return NULL for variants unknown to it. 
*/ + if (prog_type_str) + printf("\n\tprog_type %s ", prog_type_str); else printf("\n\tprog_type %u ", prog_info.type); diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index aa99ffab451a..74204d0e33cf 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -63,9 +63,6 @@ static inline void *u64_to_ptr(__u64 ptr) #define HELP_SPEC_LINK \ "LINK := { id LINK_ID | pinned FILE }" -extern const char * const prog_type_name[]; -extern const size_t prog_type_name_size; - extern const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE]; extern const char * const map_type_name[]; diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 877387ef79c7..70a1fd5253da 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -513,10 +513,12 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) if (owner_prog_type) { unsigned int prog_type = atoi(owner_prog_type); + const char *prog_type_str; - if (prog_type < prog_type_name_size) + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + if (prog_type_str) jsonw_string_field(json_wtr, "owner_prog_type", - prog_type_name[prog_type]); + prog_type_str); else jsonw_uint_field(json_wtr, "owner_prog_type", prog_type); @@ -597,10 +599,11 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info) printf("\n\t"); if (owner_prog_type) { unsigned int prog_type = atoi(owner_prog_type); + const char *prog_type_str; - if (prog_type < prog_type_name_size) - printf("owner_prog_type %s ", - prog_type_name[prog_type]); + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + if (prog_type_str) + printf("owner_prog_type %s ", prog_type_str); else printf("owner_prog_type %d ", prog_type); } diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 5c2c63df92e8..39e1e7149f62 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -36,43 +36,6 @@ #define BPF_METADATA_PREFIX "bpf_metadata_" #define BPF_METADATA_PREFIX_LEN (sizeof(BPF_METADATA_PREFIX) - 1) -const char * const prog_type_name[] = { - [BPF_PROG_TYPE_UNSPEC] = "unspec", - [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", - [BPF_PROG_TYPE_KPROBE] = "kprobe", - [BPF_PROG_TYPE_SCHED_CLS] = "sched_cls", - [BPF_PROG_TYPE_SCHED_ACT] = "sched_act", - [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint", - [BPF_PROG_TYPE_XDP] = "xdp", - [BPF_PROG_TYPE_PERF_EVENT] = "perf_event", - [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup_skb", - [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup_sock", - [BPF_PROG_TYPE_LWT_IN] = "lwt_in", - [BPF_PROG_TYPE_LWT_OUT] = "lwt_out", - [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", - [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", - [BPF_PROG_TYPE_SK_SKB] = "sk_skb", - [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", - [BPF_PROG_TYPE_SK_MSG] = "sk_msg", - [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", - [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", - [BPF_PROG_TYPE_LWT_SEG6LOCAL] = "lwt_seg6local", - [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", - [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", - [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", - [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", - [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", - [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", - [BPF_PROG_TYPE_TRACING] = "tracing", - [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", - [BPF_PROG_TYPE_EXT] = "ext", - [BPF_PROG_TYPE_LSM] = "lsm", - [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", - [BPF_PROG_TYPE_SYSCALL] = "syscall", -}; - -const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name); - enum 
dump_mode { DUMP_JITED, DUMP_XLATED, @@ -428,12 +391,14 @@ out_free: static void print_prog_header_json(struct bpf_prog_info *info, int fd) { + const char *prog_type_str; char prog_name[MAX_PROG_FULL_NAME]; jsonw_uint_field(json_wtr, "id", info->id); - if (info->type < ARRAY_SIZE(prog_type_name)) - jsonw_string_field(json_wtr, "type", - prog_type_name[info->type]); + prog_type_str = libbpf_bpf_prog_type_str(info->type); + + if (prog_type_str) + jsonw_string_field(json_wtr, "type", prog_type_str); else jsonw_uint_field(json_wtr, "type", info->type); @@ -515,11 +480,13 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) static void print_prog_header_plain(struct bpf_prog_info *info, int fd) { + const char *prog_type_str; char prog_name[MAX_PROG_FULL_NAME]; printf("%u: ", info->id); - if (info->type < ARRAY_SIZE(prog_type_name)) - printf("%s ", prog_type_name[info->type]); + prog_type_str = libbpf_bpf_prog_type_str(info->type); + if (prog_type_str) + printf("%s ", prog_type_str); else printf("type %u ", info->type); diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index c0e7acd698ed..1f0ff783f22d 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -333,9 +333,6 @@ class ProgFileExtractor(SourceFileExtractor): """ filename = os.path.join(BPFTOOL_DIR, 'prog.c') - def get_prog_types(self): - return self.get_types_from_array('prog_type_name') - def get_attach_types(self): return self.get_types_from_array('attach_type_strings') @@ -533,16 +530,6 @@ def main(): verify(source_map_types, bashcomp_map_types, f'Comparing {MapFileExtractor.filename} (map_type_name) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') - # Program types (enum) - - ref = bpf_info.get_prog_types() - - prog_info = ProgFileExtractor() - prog_types = set(prog_info.get_prog_types().keys()) - - verify(ref, prog_types, - f'Comparing BPF header (enum bpf_prog_type) and {ProgFileExtractor.filename} (prog_type_name):') - # Attach types (enum) ref = bpf_info.get_attach_types() @@ -556,6 +543,7 @@ def main(): # Attach types (names) + prog_info = ProgFileExtractor() source_prog_attach_types = set(prog_info.get_attach_types().values()) help_prog_attach_types = prog_info.get_prog_attach_help() -- cgit v1.2.3 From 3e6dc0207b33edf90be4d5cbbaf06223aa38e50b Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:20 +0000 Subject: libbpf: Introduce libbpf_bpf_map_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces a new function, libbpf_bpf_map_type_str, to the public libbpf API. The function allows users to get a string representation for a bpf_map_type enum variant. 
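The semantics mirror libbpf_bpf_prog_type_str: known variants map to a static string, anything else to NULL. For example (sketch, not part of the patch):

  const char *s = libbpf_bpf_map_type_str(BPF_MAP_TYPE_RINGBUF); /* "ringbuf" */
  const char *u = libbpf_bpf_map_type_str(0x7fffffff);           /* NULL */
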
Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-5-deso@posteo.net --- tools/lib/bpf/libbpf.c | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 9 +++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 52 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 01421aff0210..a345bc1a42ee 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -72,6 +72,40 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); +static const char * const map_type_name[] = { + [BPF_MAP_TYPE_UNSPEC] = "unspec", + [BPF_MAP_TYPE_HASH] = "hash", + [BPF_MAP_TYPE_ARRAY] = "array", + [BPF_MAP_TYPE_PROG_ARRAY] = "prog_array", + [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array", + [BPF_MAP_TYPE_PERCPU_HASH] = "percpu_hash", + [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array", + [BPF_MAP_TYPE_STACK_TRACE] = "stack_trace", + [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array", + [BPF_MAP_TYPE_LRU_HASH] = "lru_hash", + [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru_percpu_hash", + [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie", + [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", + [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", + [BPF_MAP_TYPE_DEVMAP] = "devmap", + [BPF_MAP_TYPE_DEVMAP_HASH] = "devmap_hash", + [BPF_MAP_TYPE_SOCKMAP] = "sockmap", + [BPF_MAP_TYPE_CPUMAP] = "cpumap", + [BPF_MAP_TYPE_XSKMAP] = "xskmap", + [BPF_MAP_TYPE_SOCKHASH] = "sockhash", + [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", + [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", + [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", + [BPF_MAP_TYPE_QUEUE] = "queue", + [BPF_MAP_TYPE_STACK] = "stack", + [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", + [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", + [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", +}; + static const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", @@ -9335,6 +9369,14 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, return libbpf_err(-ESRCH); } +const char *libbpf_bpf_map_type_str(enum bpf_map_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(map_type_name)) + return NULL; + + return map_type_name[t]; +} + const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t) { if (t < 0 || t >= ARRAY_SIZE(prog_type_name)) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 11b5f8ce8031..6c5b077351c6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -51,6 +51,15 @@ enum libbpf_errno { LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); +/** + * @brief **libbpf_bpf_map_type_str()** converts the provided map type value + * into a textual representation. + * @param t The map type. + * @return Pointer to a static string identifying the map type. NULL is + * returned for unknown **bpf_map_type** values. + */ +LIBBPF_API const char *libbpf_bpf_map_type_str(enum bpf_map_type t); + /** * @brief **libbpf_bpf_prog_type_str()** converts the provided program type * value into a textual representation. 
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index b12d290f8e4c..16065ac7792d 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -462,6 +462,7 @@ LIBBPF_0.8.0 { LIBBPF_1.0.0 { global: + libbpf_bpf_map_type_str; libbpf_bpf_prog_type_str; local: *; -- cgit v1.2.3 From c3a2574011a313707570d35b7e6e6536eda69dbb Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:21 +0000 Subject: selftests/bpf: Add test for libbpf_bpf_map_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a test for libbpf_bpf_map_type_str. The test retrieves all variants of the bpf_map_type enumeration using BTF and makes sure that the function under test works as expected for them. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-6-deso@posteo.net --- .../testing/selftests/bpf/prog_tests/libbpf_str.c | 56 +++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c index 42696aaebf3e..f5185a46ee00 100644 --- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c @@ -14,11 +14,53 @@ static void uppercase(char *s) *s = toupper(*s); } +/* + * Test case to check that all bpf_map_type variants are covered by + * libbpf_bpf_map_type_str. + */ +static void test_libbpf_bpf_map_type_str(void) +{ + struct btf *btf; + const struct btf_type *t; + const struct btf_enum *e; + int i, n, id; + + btf = btf__parse("/sys/kernel/btf/vmlinux", NULL); + if (!ASSERT_OK_PTR(btf, "btf_parse")) + return; + + /* find enum bpf_map_type and enumerate each value */ + id = btf__find_by_name_kind(btf, "bpf_map_type", BTF_KIND_ENUM); + if (!ASSERT_GT(id, 0, "bpf_map_type_id")) + goto cleanup; + t = btf__type_by_id(btf, id); + e = btf_enum(t); + n = btf_vlen(t); + for (i = 0; i < n; e++, i++) { + enum bpf_map_type map_type = (enum bpf_map_type)e->val; + const char *map_type_name; + const char *map_type_str; + char buf[256]; + + map_type_name = btf__str_by_offset(btf, e->name_off); + map_type_str = libbpf_bpf_map_type_str(map_type); + ASSERT_OK_PTR(map_type_str, map_type_name); + + snprintf(buf, sizeof(buf), "BPF_MAP_TYPE_%s", map_type_str); + uppercase(buf); + + ASSERT_STREQ(buf, map_type_name, "exp_str_value"); + } + +cleanup: + btf__free(btf); +} + /* * Test case to check that all bpf_prog_type variants are covered by * libbpf_bpf_prog_type_str. */ -void test_libbpf_bpf_prog_type_str(void) +static void test_libbpf_bpf_prog_type_str(void) { struct btf *btf; const struct btf_type *t; @@ -55,3 +97,15 @@ void test_libbpf_bpf_prog_type_str(void) cleanup: btf__free(btf); } + +/* + * Run all libbpf str conversion tests. 
+ */ +void test_libbpf_str(void) +{ + if (test__start_subtest("bpf_map_type_str")) + test_libbpf_bpf_map_type_str(); + + if (test__start_subtest("bpf_prog_type_str")) + test_libbpf_bpf_prog_type_str(); +} -- cgit v1.2.3 From 2e98964bd6e283568730b1a4da3b1e4da3306a8e Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:22 +0000 Subject: bpftool: Use libbpf_bpf_map_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change switches bpftool over to using the recently introduced libbpf_bpf_map_type_str function instead of maintaining its own string representation for the bpf_map_type enum. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220523230428.3077108-7-deso@posteo.net --- tools/bpf/bpftool/feature.c | 30 ++++++---- tools/bpf/bpftool/main.h | 3 - tools/bpf/bpftool/map.c | 69 ++++++++-------------- .../selftests/bpf/test_bpftool_synctypes.py | 48 +++++++++------ 4 files changed, 71 insertions(+), 79 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 02753f934ed3..cc9e4df8c58e 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -615,8 +615,8 @@ static bool probe_map_type_ifindex(enum bpf_map_type map_type, __u32 ifindex) } static void -probe_map_type(enum bpf_map_type map_type, const char *define_prefix, - __u32 ifindex) +probe_map_type(enum bpf_map_type map_type, char const *map_type_str, + const char *define_prefix, __u32 ifindex) { char feat_name[128], plain_desc[128], define_name[128]; const char *plain_comment = "eBPF map_type "; @@ -641,20 +641,16 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix, * check required for unprivileged users */ - if (!map_type_name[map_type]) { - p_info("map type name not found (type %d)", map_type); - return; - } maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; - if (strlen(map_type_name[map_type]) > maxlen) { + if (strlen(map_type_str) > maxlen) { p_info("map type name too long"); return; } - sprintf(feat_name, "have_%s_map_type", map_type_name[map_type]); - sprintf(define_name, "%s_map_type", map_type_name[map_type]); + sprintf(feat_name, "have_%s_map_type", map_type_str); + sprintf(define_name, "%s_map_type", map_type_str); uppercase(define_name, sizeof(define_name)); - sprintf(plain_desc, "%s%s", plain_comment, map_type_name[map_type]); + sprintf(plain_desc, "%s%s", plain_comment, map_type_str); print_bool_feature(feat_name, plain_desc, define_name, res, define_prefix); } @@ -963,15 +959,23 @@ section_program_types(bool *supported_types, const char *define_prefix, static void section_map_types(const char *define_prefix, __u32 ifindex) { - unsigned int i; + unsigned int map_type = BPF_MAP_TYPE_UNSPEC; + const char *map_type_str; print_start_section("map_types", "Scanning eBPF map types...", "/*** eBPF map types ***/", define_prefix); - for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++) - probe_map_type(i, define_prefix, ifindex); + while (true) { + map_type++; + map_type_str = libbpf_bpf_map_type_str(map_type); + /* libbpf will return NULL for variants unknown to it. 
*/ + if (!map_type_str) + break; + + probe_map_type(map_type, map_type_str, define_prefix, ifindex); + } print_end_section(); } diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 74204d0e33cf..e4fdaa0740b3 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -65,9 +65,6 @@ static inline void *u64_to_ptr(__u64 ptr) extern const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE]; -extern const char * const map_type_name[]; -extern const size_t map_type_name_size; - /* keep in sync with the definition in skeleton/pid_iter.bpf.c */ enum bpf_obj_type { BPF_OBJ_UNKNOWN, diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 70a1fd5253da..800834be1bcb 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -22,42 +22,6 @@ #include "json_writer.h" #include "main.h" -const char * const map_type_name[] = { - [BPF_MAP_TYPE_UNSPEC] = "unspec", - [BPF_MAP_TYPE_HASH] = "hash", - [BPF_MAP_TYPE_ARRAY] = "array", - [BPF_MAP_TYPE_PROG_ARRAY] = "prog_array", - [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array", - [BPF_MAP_TYPE_PERCPU_HASH] = "percpu_hash", - [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array", - [BPF_MAP_TYPE_STACK_TRACE] = "stack_trace", - [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array", - [BPF_MAP_TYPE_LRU_HASH] = "lru_hash", - [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru_percpu_hash", - [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie", - [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", - [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", - [BPF_MAP_TYPE_DEVMAP] = "devmap", - [BPF_MAP_TYPE_DEVMAP_HASH] = "devmap_hash", - [BPF_MAP_TYPE_SOCKMAP] = "sockmap", - [BPF_MAP_TYPE_CPUMAP] = "cpumap", - [BPF_MAP_TYPE_XSKMAP] = "xskmap", - [BPF_MAP_TYPE_SOCKHASH] = "sockhash", - [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", - [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", - [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", - [BPF_MAP_TYPE_QUEUE] = "queue", - [BPF_MAP_TYPE_STACK] = "stack", - [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", - [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", - [BPF_MAP_TYPE_RINGBUF] = "ringbuf", - [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", - [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", - [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", -}; - -const size_t map_type_name_size = ARRAY_SIZE(map_type_name); - static struct hashmap *map_table; static bool map_is_per_cpu(__u32 type) @@ -81,12 +45,18 @@ static bool map_is_map_of_progs(__u32 type) static int map_type_from_str(const char *type) { + const char *map_type_str; unsigned int i; - for (i = 0; i < ARRAY_SIZE(map_type_name); i++) + for (i = 0; ; i++) { + map_type_str = libbpf_bpf_map_type_str(i); + if (!map_type_str) + break; + /* Don't allow prefixing in case of possible future shadowing */ - if (map_type_name[i] && !strcmp(map_type_name[i], type)) + if (!strcmp(map_type_str, type)) return i; + } return -1; } @@ -472,9 +442,12 @@ static int parse_elem(char **argv, struct bpf_map_info *info, static void show_map_header_json(struct bpf_map_info *info, json_writer_t *wtr) { + const char *map_type_str; + jsonw_uint_field(wtr, "id", info->id); - if (info->type < ARRAY_SIZE(map_type_name)) - jsonw_string_field(wtr, "type", map_type_name[info->type]); + map_type_str = libbpf_bpf_map_type_str(info->type); + if (map_type_str) + jsonw_string_field(wtr, "type", map_type_str); else jsonw_uint_field(wtr, "type", info->type); @@ -561,9 +534,13 @@ static int show_map_close_json(int fd, struct bpf_map_info *info) static void show_map_header_plain(struct bpf_map_info *info) { + 
const char *map_type_str; + printf("%u: ", info->id); - if (info->type < ARRAY_SIZE(map_type_name)) - printf("%s ", map_type_name[info->type]); + + map_type_str = libbpf_bpf_map_type_str(info->type); + if (map_type_str) + printf("%s ", map_type_str); else printf("type %u ", info->type); @@ -879,9 +856,13 @@ map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, } if (info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && - info->value_size != 8) + info->value_size != 8) { + const char *map_type_str; + + map_type_str = libbpf_bpf_map_type_str(info->type); p_info("Warning: cannot read values from %s map with value_size != 8", - map_type_name[info->type]); + map_type_str); + } while (true) { err = bpf_map_get_next_key(fd, prev_key, key); if (err) { diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index 1f0ff783f22d..0a08c074a8fe 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -186,6 +186,27 @@ class FileExtractor(object): parser.search_block(start_marker) return parser.parse(pattern, end_marker) + def make_enum_map(self, names, enum_prefix): + """ + Search for and parse an enum containing BPF_* members, just as get_enum + does. However, instead of just returning a set of the variant names, + also generate a textual representation from them by (assuming and) + removing a provided prefix and lowercasing the remainder. Then return a + dict mapping from name to textual representation. + + @enum_values: a set of enum values; e.g., as retrieved by get_enum + @enum_prefix: the prefix to remove from each of the variants to infer + textual representation + """ + mapping = {} + for name in names: + if not name.startswith(enum_prefix): + raise Exception(f"enum variant {name} does not start with {enum_prefix}") + text = name[len(enum_prefix):].lower() + mapping[name] = text + + return mapping + def __get_description_list(self, start_marker, pattern, end_marker): parser = InlineListParser(self.reader) parser.search_block(start_marker) @@ -345,9 +366,6 @@ class MapFileExtractor(SourceFileExtractor): """ filename = os.path.join(BPFTOOL_DIR, 'map.c') - def get_map_types(self): - return self.get_types_from_array('map_type_name') - def get_map_help(self): return self.get_help_list('TYPE') @@ -403,8 +421,9 @@ class BpfHeaderExtractor(FileExtractor): def get_prog_types(self): return self.get_enum('bpf_prog_type') - def get_map_types(self): - return self.get_enum('bpf_map_type') + def get_map_type_map(self): + names = self.get_enum('bpf_map_type') + return self.make_enum_map(names, 'BPF_MAP_TYPE_') def get_attach_types(self): return self.get_enum('bpf_attach_type') @@ -492,21 +511,12 @@ def main(): """) args = argParser.parse_args() - # Map types (enum) - bpf_info = BpfHeaderExtractor() - ref = bpf_info.get_map_types() - - map_info = MapFileExtractor() - source_map_items = map_info.get_map_types() - map_types_enum = set(source_map_items.keys()) - - verify(ref, map_types_enum, - f'Comparing BPF header (enum bpf_map_type) and {MapFileExtractor.filename} (map_type_name):') # Map types (names) - source_map_types = set(source_map_items.values()) + map_info = MapFileExtractor() + source_map_types = set(bpf_info.get_map_type_map().values()) source_map_types.discard('unspec') help_map_types = map_info.get_map_help() @@ -522,13 +532,13 @@ def main(): bashcomp_map_types = bashcomp_info.get_map_types() verify(source_map_types, help_map_types, - f'Comparing 
{MapFileExtractor.filename} (map_type_name) and {MapFileExtractor.filename} (do_help() TYPE):') + f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {MapFileExtractor.filename} (do_help() TYPE):') verify(source_map_types, man_map_types, - f'Comparing {MapFileExtractor.filename} (map_type_name) and {ManMapExtractor.filename} (TYPE):') + f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {ManMapExtractor.filename} (TYPE):') verify(help_map_options, man_map_options, f'Comparing {MapFileExtractor.filename} (do_help() OPTIONS) and {ManMapExtractor.filename} (OPTIONS):') verify(source_map_types, bashcomp_map_types, - f'Comparing {MapFileExtractor.filename} (map_type_name) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') + f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') # Attach types (enum) -- cgit v1.2.3 From ccde5760bac1b83cfb2bfed97748b71829cbe5d8 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:23 +0000 Subject: libbpf: Introduce libbpf_bpf_attach_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces a new function, libbpf_bpf_attach_type_str, to the public libbpf API. The function allows users to get a string representation for a bpf_attach_type variant. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-8-deso@posteo.net --- tools/lib/bpf/libbpf.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 9 ++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 64 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a345bc1a42ee..fa3987bc9172 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -72,6 +72,52 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); +static const char * const attach_type_name[] = { + [BPF_CGROUP_INET_INGRESS] = "cgroup_inet_ingress", + [BPF_CGROUP_INET_EGRESS] = "cgroup_inet_egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "cgroup_inet_sock_create", + [BPF_CGROUP_INET_SOCK_RELEASE] = "cgroup_inet_sock_release", + [BPF_CGROUP_SOCK_OPS] = "cgroup_sock_ops", + [BPF_CGROUP_DEVICE] = "cgroup_device", + [BPF_CGROUP_INET4_BIND] = "cgroup_inet4_bind", + [BPF_CGROUP_INET6_BIND] = "cgroup_inet6_bind", + [BPF_CGROUP_INET4_CONNECT] = "cgroup_inet4_connect", + [BPF_CGROUP_INET6_CONNECT] = "cgroup_inet6_connect", + [BPF_CGROUP_INET4_POST_BIND] = "cgroup_inet4_post_bind", + [BPF_CGROUP_INET6_POST_BIND] = "cgroup_inet6_post_bind", + [BPF_CGROUP_INET4_GETPEERNAME] = "cgroup_inet4_getpeername", + [BPF_CGROUP_INET6_GETPEERNAME] = "cgroup_inet6_getpeername", + [BPF_CGROUP_INET4_GETSOCKNAME] = "cgroup_inet4_getsockname", + [BPF_CGROUP_INET6_GETSOCKNAME] = "cgroup_inet6_getsockname", + [BPF_CGROUP_UDP4_SENDMSG] = "cgroup_udp4_sendmsg", + [BPF_CGROUP_UDP6_SENDMSG] = "cgroup_udp6_sendmsg", + [BPF_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "cgroup_udp4_recvmsg", + [BPF_CGROUP_UDP6_RECVMSG] = "cgroup_udp6_recvmsg", + [BPF_CGROUP_GETSOCKOPT] = "cgroup_getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "cgroup_setsockopt", + [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", + 
[BPF_SK_MSG_VERDICT] = "sk_msg_verdict", + [BPF_LIRC_MODE2] = "lirc_mode2", + [BPF_FLOW_DISSECTOR] = "flow_dissector", + [BPF_TRACE_RAW_TP] = "trace_raw_tp", + [BPF_TRACE_FENTRY] = "trace_fentry", + [BPF_TRACE_FEXIT] = "trace_fexit", + [BPF_MODIFY_RETURN] = "modify_return", + [BPF_LSM_MAC] = "lsm_mac", + [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_TRACE_ITER] = "trace_iter", + [BPF_XDP_DEVMAP] = "xdp_devmap", + [BPF_XDP_CPUMAP] = "xdp_cpumap", + [BPF_XDP] = "xdp", + [BPF_SK_REUSEPORT_SELECT] = "sk_reuseport_select", + [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", + [BPF_PERF_EVENT] = "perf_event", + [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", +}; + static const char * const map_type_name[] = { [BPF_MAP_TYPE_UNSPEC] = "unspec", [BPF_MAP_TYPE_HASH] = "hash", @@ -9369,6 +9415,14 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, return libbpf_err(-ESRCH); } +const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(attach_type_name)) + return NULL; + + return attach_type_name[t]; +} + const char *libbpf_bpf_map_type_str(enum bpf_map_type t) { if (t < 0 || t >= ARRAY_SIZE(map_type_name)) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6c5b077351c6..37a234b141c6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -51,6 +51,15 @@ enum libbpf_errno { LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); +/** + * @brief **libbpf_bpf_attach_type_str()** converts the provided attach type + * value into a textual representation. + * @param t The attach type. + * @return Pointer to a static string identifying the attach type. NULL is + * returned for unknown **bpf_attach_type** values. + */ +LIBBPF_API const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t); + /** * @brief **libbpf_bpf_map_type_str()** converts the provided map type value * into a textual representation. diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 16065ac7792d..44c53694c22b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -462,6 +462,7 @@ LIBBPF_0.8.0 { LIBBPF_1.0.0 { global: + libbpf_bpf_attach_type_str; libbpf_bpf_map_type_str; libbpf_bpf_prog_type_str; -- cgit v1.2.3 From 0b27b3d9fdf88c132b095a30ee2c61cd6e56e6cc Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:24 +0000 Subject: selftests/bpf: Add test for libbpf_bpf_attach_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a test for libbpf_bpf_attach_type_str. The test retrieves all variants of the bpf_attach_type enumeration using BTF and makes sure that the function under test works as expected for them. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-9-deso@posteo.net --- .../testing/selftests/bpf/prog_tests/libbpf_str.c | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c index f5185a46ee00..c88df92868c1 100644 --- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c @@ -14,6 +14,51 @@ static void uppercase(char *s) *s = toupper(*s); } +/* + * Test case to check that all bpf_attach_type variants are covered by + * libbpf_bpf_attach_type_str. 
+ */
+static void test_libbpf_bpf_attach_type_str(void)
+{
+	struct btf *btf;
+	const struct btf_type *t;
+	const struct btf_enum *e;
+	int i, n, id;
+
+	btf = btf__parse("/sys/kernel/btf/vmlinux", NULL);
+	if (!ASSERT_OK_PTR(btf, "btf_parse"))
+		return;
+
+	/* find enum bpf_attach_type and enumerate each value */
+	id = btf__find_by_name_kind(btf, "bpf_attach_type", BTF_KIND_ENUM);
+	if (!ASSERT_GT(id, 0, "bpf_attach_type_id"))
+		goto cleanup;
+	t = btf__type_by_id(btf, id);
+	e = btf_enum(t);
+	n = btf_vlen(t);
+	for (i = 0; i < n; e++, i++) {
+		enum bpf_attach_type attach_type = (enum bpf_attach_type)e->val;
+		const char *attach_type_name;
+		const char *attach_type_str;
+		char buf[256];
+
+		if (attach_type == __MAX_BPF_ATTACH_TYPE)
+			continue;
+
+		attach_type_name = btf__str_by_offset(btf, e->name_off);
+		attach_type_str = libbpf_bpf_attach_type_str(attach_type);
+		ASSERT_OK_PTR(attach_type_str, attach_type_name);
+
+		snprintf(buf, sizeof(buf), "BPF_%s", attach_type_str);
+		uppercase(buf);
+
+		ASSERT_STREQ(buf, attach_type_name, "exp_str_value");
+	}
+
+cleanup:
+	btf__free(btf);
+}
+
 /*
  * Test case to check that all bpf_map_type variants are covered by
  * libbpf_bpf_map_type_str.
@@ -103,6 +148,9 @@ cleanup:
  */
 void test_libbpf_str(void)
 {
+	if (test__start_subtest("bpf_attach_type_str"))
+		test_libbpf_bpf_attach_type_str();
+
 	if (test__start_subtest("bpf_map_type_str"))
 		test_libbpf_bpf_map_type_str();
-- cgit v1.2.3

From 1ba5ad36e00f46e3f7676f5de6b87f5a2f57f1f1 Mon Sep 17 00:00:00 2001
From: Daniel Müller
Date: Mon, 23 May 2022 23:04:25 +0000
Subject: bpftool: Use libbpf_bpf_attach_type_str
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change switches bpftool over to using the recently introduced
libbpf_bpf_attach_type_str function instead of maintaining its own string
representation for the bpf_attach_type enum.

Note that contrary to other enum types, the variant names that bpftool
maps bpf_attach_type to do not adhere to a simple, easy-to-follow rule.
With bpf_prog_type, for example, the textual representation can easily be
inferred by stripping the BPF_PROG_TYPE_ prefix and lowercasing the
remaining string. bpf_attach_type violates this rule for various
variants.

We decided to fix up this deficiency with this change, meaning that
bpftool uses the same textual representations as libbpf. Supporting
tests, completion scripts, and man pages have been adjusted accordingly.
However, we did add support for accepting (the now undocumented) original
attach type names when they are provided by users.

For the test (test_bpftool_synctypes.py), I have removed the enum
representation checks, because we no longer mirror the various enum
variant names in bpftool source code. For the man page, help text, and
completion script checks we are now using enum definitions from
uapi/linux/bpf.h as the source of truth directly.
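For illustration, a minimal sketch (not part of this patch) of how a
caller can resolve a user-supplied name back to an enum value with the
new helper; it mirrors what bpftool's parse_attach_type() below does and
relies on the helper returning NULL once past the last known type:

	#include <string.h>
	#include <bpf/libbpf.h>

	/* Return the attach type named by str, or -1 if unknown. */
	static int attach_type_from_str(const char *str)
	{
		const char *name;
		int type;

		for (type = 0; ; type++) {
			name = libbpf_bpf_attach_type_str(type);
			if (!name)
				return -1;	/* walked past the last known type */
			if (!strcmp(str, name))
				return type;
		}
	}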
Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220523230428.3077108-10-deso@posteo.net --- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 16 +++- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 5 +- tools/bpf/bpftool/bash-completion/bpftool | 18 ++-- tools/bpf/bpftool/cgroup.c | 53 +++++++---- tools/bpf/bpftool/common.c | 82 +++++++--------- tools/bpf/bpftool/link.c | 15 ++- tools/bpf/bpftool/main.h | 14 +++ tools/bpf/bpftool/prog.c | 25 ++++- .../selftests/bpf/test_bpftool_synctypes.py | 104 ++++++++------------- 9 files changed, 182 insertions(+), 150 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index a17e9aa314fd..bd015ec9847b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -31,11 +31,17 @@ CGROUP COMMANDS | **bpftool** **cgroup help** | | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** | -| **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** | -| **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** | -| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** | -| **sock_release** } +| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | +| **cgroup_inet_sock_create** | **cgroup_sock_ops** | +| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | +| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | +| **cgroup_inet4_connect** | **cgroup_inet6_connect** | +| **cgroup_inet4_getpeername** | **cgroup_inet6_getpeername** | +| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | +| **cgroup_udp4_sendmsg** | **cgroup_udp6_sendmsg** | +| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | +| **cgroup_sysctl** | **cgroup_getsockopt** | **cgroup_setsockopt** | +| **cgroup_inet_sock_release** } | *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index a2e9359e554c..eb1b2a254eb1 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -53,8 +53,9 @@ PROG COMMANDS | **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | | **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** | } -| *ATTACH_TYPE* := { -| **msg_verdict** | **skb_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** +| *ATTACH_TYPE* := { +| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | +| **sk_skb_stream_parser** | **flow_dissector** | } | *METRICs* := { | **cycles** | **instructions** | **l1d_loads** | **llc_misses** | diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 5df8d72c5179..91f89a9a5b36 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -407,8 +407,8 @@ _bpftool() return 0 ;; 5) - local BPFTOOL_PROG_ATTACH_TYPES='msg_verdict \ - skb_verdict stream_verdict stream_parser \ + local BPFTOOL_PROG_ATTACH_TYPES='sk_msg_verdict \ + sk_skb_verdict sk_skb_stream_verdict sk_skb_stream_parser \ flow_dissector' COMPREPLY=( $( compgen -W 
"$BPFTOOL_PROG_ATTACH_TYPES" -- "$cur" ) ) return 0 @@ -1039,12 +1039,14 @@ _bpftool() return 0 ;; attach|detach) - local BPFTOOL_CGROUP_ATTACH_TYPES='ingress egress \ - sock_create sock_ops device \ - bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ - getpeername4 getpeername6 getsockname4 getsockname6 \ - sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \ - setsockopt sock_release' + local BPFTOOL_CGROUP_ATTACH_TYPES='cgroup_inet_ingress cgroup_inet_egress \ + cgroup_inet_sock_create cgroup_sock_ops cgroup_device cgroup_inet4_bind \ + cgroup_inet6_bind cgroup_inet4_post_bind cgroup_inet6_post_bind \ + cgroup_inet4_connect cgroup_inet6_connect cgroup_inet4_getpeername \ + cgroup_inet6_getpeername cgroup_inet4_getsockname cgroup_inet6_getsockname \ + cgroup_udp4_sendmsg cgroup_udp6_sendmsg cgroup_udp4_recvmsg \ + cgroup_udp6_recvmsg cgroup_sysctl cgroup_getsockopt cgroup_setsockopt \ + cgroup_inet_sock_release' local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' # Check for $prev = $command first diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index effe136119d7..42421fe47a58 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -21,25 +21,43 @@ #define HELP_SPEC_ATTACH_FLAGS \ "ATTACH_FLAGS := { multi | override }" -#define HELP_SPEC_ATTACH_TYPES \ - " ATTACH_TYPE := { ingress | egress | sock_create |\n" \ - " sock_ops | device | bind4 | bind6 |\n" \ - " post_bind4 | post_bind6 | connect4 |\n" \ - " connect6 | getpeername4 | getpeername6 |\n" \ - " getsockname4 | getsockname6 | sendmsg4 |\n" \ - " sendmsg6 | recvmsg4 | recvmsg6 |\n" \ - " sysctl | getsockopt | setsockopt |\n" \ - " sock_release }" +#define HELP_SPEC_ATTACH_TYPES \ + " ATTACH_TYPE := { cgroup_inet_ingress | cgroup_inet_egress |\n" \ + " cgroup_inet_sock_create | cgroup_sock_ops |\n" \ + " cgroup_device | cgroup_inet4_bind |\n" \ + " cgroup_inet6_bind | cgroup_inet4_post_bind |\n" \ + " cgroup_inet6_post_bind | cgroup_inet4_connect |\n" \ + " cgroup_inet6_connect | cgroup_inet4_getpeername |\n" \ + " cgroup_inet6_getpeername | cgroup_inet4_getsockname |\n" \ + " cgroup_inet6_getsockname | cgroup_udp4_sendmsg |\n" \ + " cgroup_udp6_sendmsg | cgroup_udp4_recvmsg |\n" \ + " cgroup_udp6_recvmsg | cgroup_sysctl |\n" \ + " cgroup_getsockopt | cgroup_setsockopt |\n" \ + " cgroup_inet_sock_release }" static unsigned int query_flags; static enum bpf_attach_type parse_attach_type(const char *str) { + const char *attach_type_str; enum bpf_attach_type type; - for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { - if (attach_type_name[type] && - is_prefix(str, attach_type_name[type])) + for (type = 0; ; type++) { + attach_type_str = libbpf_bpf_attach_type_str(type); + if (!attach_type_str) + break; + if (!strcmp(str, attach_type_str)) + return type; + } + + /* Also check traditionally used attach type strings. For these we keep + * allowing prefixed usage. 
+ */ + for (type = 0; ; type++) { + attach_type_str = bpf_attach_type_input_str(type); + if (!attach_type_str) + break; + if (is_prefix(str, attach_type_str)) return type; } @@ -52,6 +70,7 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, { char prog_name[MAX_PROG_FULL_NAME]; struct bpf_prog_info info = {}; + const char *attach_type_str; __u32 info_len = sizeof(info); int prog_fd; @@ -64,13 +83,13 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, return -1; } + attach_type_str = libbpf_bpf_attach_type_str(attach_type); get_prog_full_name(&info, prog_fd, prog_name, sizeof(prog_name)); if (json_output) { jsonw_start_object(json_wtr); jsonw_uint_field(json_wtr, "id", info.id); - if (attach_type < ARRAY_SIZE(attach_type_name)) - jsonw_string_field(json_wtr, "attach_type", - attach_type_name[attach_type]); + if (attach_type_str) + jsonw_string_field(json_wtr, "attach_type", attach_type_str); else jsonw_uint_field(json_wtr, "attach_type", attach_type); jsonw_string_field(json_wtr, "attach_flags", @@ -79,8 +98,8 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, jsonw_end_object(json_wtr); } else { printf("%s%-8u ", level ? " " : "", info.id); - if (attach_type < ARRAY_SIZE(attach_type_name)) - printf("%-15s", attach_type_name[attach_type]); + if (attach_type_str) + printf("%-15s", attach_type_str); else printf("type %-10u", attach_type); printf(" %-15s %-15s\n", attach_flags_str, prog_name); diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index c740142c24d8..a45b42ee8ab0 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -31,52 +31,6 @@ #define BPF_FS_MAGIC 0xcafe4a11 #endif -const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { - [BPF_CGROUP_INET_INGRESS] = "ingress", - [BPF_CGROUP_INET_EGRESS] = "egress", - [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", - [BPF_CGROUP_INET_SOCK_RELEASE] = "sock_release", - [BPF_CGROUP_SOCK_OPS] = "sock_ops", - [BPF_CGROUP_DEVICE] = "device", - [BPF_CGROUP_INET4_BIND] = "bind4", - [BPF_CGROUP_INET6_BIND] = "bind6", - [BPF_CGROUP_INET4_CONNECT] = "connect4", - [BPF_CGROUP_INET6_CONNECT] = "connect6", - [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", - [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", - [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4", - [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6", - [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4", - [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6", - [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", - [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", - [BPF_CGROUP_SYSCTL] = "sysctl", - [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", - [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", - [BPF_CGROUP_GETSOCKOPT] = "getsockopt", - [BPF_CGROUP_SETSOCKOPT] = "setsockopt", - [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", - [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", - [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", - [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", - [BPF_LIRC_MODE2] = "lirc_mode2", - [BPF_FLOW_DISSECTOR] = "flow_dissector", - [BPF_TRACE_RAW_TP] = "raw_tp", - [BPF_TRACE_FENTRY] = "fentry", - [BPF_TRACE_FEXIT] = "fexit", - [BPF_MODIFY_RETURN] = "mod_ret", - [BPF_LSM_MAC] = "lsm_mac", - [BPF_SK_LOOKUP] = "sk_lookup", - [BPF_TRACE_ITER] = "trace_iter", - [BPF_XDP_DEVMAP] = "xdp_devmap", - [BPF_XDP_CPUMAP] = "xdp_cpumap", - [BPF_XDP] = "xdp", - [BPF_SK_REUSEPORT_SELECT] = "sk_skb_reuseport_select", - [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_skb_reuseport_select_or_migrate", - [BPF_PERF_EVENT] = "perf_event", - [BPF_TRACE_KPROBE_MULTI] 
= "trace_kprobe_multi", -}; - void p_err(const char *fmt, ...) { va_list ap; @@ -1009,3 +963,39 @@ bool equal_fn_for_key_as_id(const void *k1, const void *k2, void *ctx) { return k1 == k2; } + +const char *bpf_attach_type_input_str(enum bpf_attach_type t) +{ + switch (t) { + case BPF_CGROUP_INET_INGRESS: return "ingress"; + case BPF_CGROUP_INET_EGRESS: return "egress"; + case BPF_CGROUP_INET_SOCK_CREATE: return "sock_create"; + case BPF_CGROUP_INET_SOCK_RELEASE: return "sock_release"; + case BPF_CGROUP_SOCK_OPS: return "sock_ops"; + case BPF_CGROUP_DEVICE: return "device"; + case BPF_CGROUP_INET4_BIND: return "bind4"; + case BPF_CGROUP_INET6_BIND: return "bind6"; + case BPF_CGROUP_INET4_CONNECT: return "connect4"; + case BPF_CGROUP_INET6_CONNECT: return "connect6"; + case BPF_CGROUP_INET4_POST_BIND: return "post_bind4"; + case BPF_CGROUP_INET6_POST_BIND: return "post_bind6"; + case BPF_CGROUP_INET4_GETPEERNAME: return "getpeername4"; + case BPF_CGROUP_INET6_GETPEERNAME: return "getpeername6"; + case BPF_CGROUP_INET4_GETSOCKNAME: return "getsockname4"; + case BPF_CGROUP_INET6_GETSOCKNAME: return "getsockname6"; + case BPF_CGROUP_UDP4_SENDMSG: return "sendmsg4"; + case BPF_CGROUP_UDP6_SENDMSG: return "sendmsg6"; + case BPF_CGROUP_SYSCTL: return "sysctl"; + case BPF_CGROUP_UDP4_RECVMSG: return "recvmsg4"; + case BPF_CGROUP_UDP6_RECVMSG: return "recvmsg6"; + case BPF_CGROUP_GETSOCKOPT: return "getsockopt"; + case BPF_CGROUP_SETSOCKOPT: return "setsockopt"; + case BPF_TRACE_RAW_TP: return "raw_tp"; + case BPF_TRACE_FENTRY: return "fentry"; + case BPF_TRACE_FEXIT: return "fexit"; + case BPF_MODIFY_RETURN: return "mod_ret"; + case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select"; + case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate"; + default: return libbpf_bpf_attach_type_str(t); + } +} diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index e27108489604..66a25450b598 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -78,9 +78,11 @@ show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr) static void show_link_attach_type_json(__u32 attach_type, json_writer_t *wtr) { - if (attach_type < ARRAY_SIZE(attach_type_name)) - jsonw_string_field(wtr, "attach_type", - attach_type_name[attach_type]); + const char *attach_type_str; + + attach_type_str = libbpf_bpf_attach_type_str(attach_type); + if (attach_type_str) + jsonw_string_field(wtr, "attach_type", attach_type_str); else jsonw_uint_field(wtr, "attach_type", attach_type); } @@ -196,8 +198,11 @@ static void show_link_header_plain(struct bpf_link_info *info) static void show_link_attach_type_plain(__u32 attach_type) { - if (attach_type < ARRAY_SIZE(attach_type_name)) - printf("attach_type %s ", attach_type_name[attach_type]); + const char *attach_type_str; + + attach_type_str = libbpf_bpf_attach_type_str(attach_type); + if (attach_type_str) + printf("attach_type %s ", attach_type_str); else printf("attach_type %u ", attach_type); } diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index e4fdaa0740b3..6c311f47147e 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -243,6 +243,20 @@ int print_all_levels(__maybe_unused enum libbpf_print_level level, size_t hash_fn_for_key_as_id(const void *key, void *ctx); bool equal_fn_for_key_as_id(const void *k1, const void *k2, void *ctx); +/* bpf_attach_type_input_str - convert the provided attach type value into a + * textual representation that we accept for input purposes. 
+ * + * This function is similar in nature to libbpf_bpf_attach_type_str, but + * recognizes some attach type names that have been used by the program in the + * past and which do not follow the string inference scheme that libbpf uses. + * These textual representations should only be used for user input. + * + * @t: The attach type + * Returns a pointer to a static string identifying the attach type. NULL is + * returned for unknown bpf_attach_type values. + */ +const char *bpf_attach_type_input_str(enum bpf_attach_type t); + static inline void *u32_as_hash_field(__u32 x) { return (void *)(uintptr_t)x; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 39e1e7149f62..e71f0b2da50b 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -41,12 +41,23 @@ enum dump_mode { DUMP_XLATED, }; +static const bool attach_types[] = { + [BPF_SK_SKB_STREAM_PARSER] = true, + [BPF_SK_SKB_STREAM_VERDICT] = true, + [BPF_SK_SKB_VERDICT] = true, + [BPF_SK_MSG_VERDICT] = true, + [BPF_FLOW_DISSECTOR] = true, + [__MAX_BPF_ATTACH_TYPE] = false, +}; + +/* Textual representations traditionally used by the program and kept around + * for the sake of backwards compatibility. + */ static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", - [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, }; @@ -57,6 +68,14 @@ static enum bpf_attach_type parse_attach_type(const char *str) enum bpf_attach_type type; for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + if (attach_types[type]) { + const char *attach_type_str; + + attach_type_str = libbpf_bpf_attach_type_str(type); + if (!strcmp(str, attach_type_str)) + return type; + } + if (attach_type_strings[type] && is_prefix(str, attach_type_strings[type])) return type; @@ -2341,8 +2360,8 @@ static int do_help(int argc, char **argv) " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" " cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n" " struct_ops | fentry | fexit | freplace | sk_lookup }\n" - " ATTACH_TYPE := { msg_verdict | skb_verdict | stream_verdict |\n" - " stream_parser | flow_dissector }\n" + " ATTACH_TYPE := { sk_msg_verdict | sk_skb_verdict | sk_skb_stream_verdict |\n" + " sk_skb_stream_parser | flow_dissector }\n" " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} |\n" diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index 0a08c074a8fe..e443e6542cb9 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -58,7 +58,7 @@ class BlockParser(object): class ArrayParser(BlockParser): """ - A parser for extracting dicionaries of values from some BPF-related arrays. + A parser for extracting a set of values from some BPF-related arrays. 
@reader: a pointer to the open file to parse @array_name: name of the array to parse """ @@ -66,7 +66,7 @@ class ArrayParser(BlockParser): def __init__(self, reader, array_name): self.array_name = array_name - self.start_marker = re.compile(f'(static )?const char \* const {self.array_name}\[.*\] = {{\n') + self.start_marker = re.compile(f'(static )?const bool {self.array_name}\[.*\] = {{\n') super().__init__(reader) def search_block(self): @@ -80,15 +80,15 @@ class ArrayParser(BlockParser): Parse a block and return data as a dictionary. Items to extract must be on separate lines in the file. """ - pattern = re.compile('\[(BPF_\w*)\]\s*= "(.*)",?$') - entries = {} + pattern = re.compile('\[(BPF_\w*)\]\s*= (true|false),?$') + entries = set() while True: line = self.reader.readline() if line == '' or re.match(self.end_marker, line): break capture = pattern.search(line) if capture: - entries[capture.group(1)] = capture.group(2) + entries |= {capture.group(1)} return entries class InlineListParser(BlockParser): @@ -115,7 +115,7 @@ class InlineListParser(BlockParser): class FileExtractor(object): """ A generic reader for extracting data from a given file. This class contains - several helper methods that wrap arround parser objects to extract values + several helper methods that wrap around parser objects to extract values from different structures. This class does not offer a way to set a filename, which is expected to be defined in children classes. @@ -139,21 +139,19 @@ class FileExtractor(object): def get_types_from_array(self, array_name): """ - Search for and parse an array associating names to BPF_* enum members, - for example: + Search for and parse a list of allowed BPF_* enum members, for example: - const char * const prog_type_name[] = { - [BPF_PROG_TYPE_UNSPEC] = "unspec", - [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", - [BPF_PROG_TYPE_KPROBE] = "kprobe", + const bool prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = true, + [BPF_PROG_TYPE_SOCKET_FILTER] = true, + [BPF_PROG_TYPE_KPROBE] = true, }; - Return a dictionary with the enum member names as keys and the - associated names as values, for example: + Return a set of the enum members, for example: - {'BPF_PROG_TYPE_UNSPEC': 'unspec', - 'BPF_PROG_TYPE_SOCKET_FILTER': 'socket_filter', - 'BPF_PROG_TYPE_KPROBE': 'kprobe'} + {'BPF_PROG_TYPE_UNSPEC', + 'BPF_PROG_TYPE_SOCKET_FILTER', + 'BPF_PROG_TYPE_KPROBE'} @array_name: name of the array to parse """ @@ -355,7 +353,8 @@ class ProgFileExtractor(SourceFileExtractor): filename = os.path.join(BPFTOOL_DIR, 'prog.c') def get_attach_types(self): - return self.get_types_from_array('attach_type_strings') + types = self.get_types_from_array('attach_types') + return self.make_enum_map(types, 'BPF_') def get_prog_attach_help(self): return self.get_help_list('ATTACH_TYPE') @@ -378,30 +377,6 @@ class CgroupFileExtractor(SourceFileExtractor): def get_prog_attach_help(self): return self.get_help_list('ATTACH_TYPE') -class CommonFileExtractor(SourceFileExtractor): - """ - An extractor for bpftool's common.c. 
- """ - filename = os.path.join(BPFTOOL_DIR, 'common.c') - - def __init__(self): - super().__init__() - self.attach_types = {} - - def get_attach_types(self): - if not self.attach_types: - self.attach_types = self.get_types_from_array('attach_type_name') - return self.attach_types - - def get_cgroup_attach_types(self): - if not self.attach_types: - self.get_attach_types() - cgroup_types = {} - for (key, value) in self.attach_types.items(): - if key.find('BPF_CGROUP') != -1: - cgroup_types[key] = value - return cgroup_types - class GenericSourceExtractor(SourceFileExtractor): """ An extractor for generic source code files. @@ -418,6 +393,10 @@ class BpfHeaderExtractor(FileExtractor): """ filename = os.path.join(INCLUDE_DIR, 'uapi/linux/bpf.h') + def __init__(self): + super().__init__() + self.attach_types = {} + def get_prog_types(self): return self.get_enum('bpf_prog_type') @@ -425,8 +404,17 @@ class BpfHeaderExtractor(FileExtractor): names = self.get_enum('bpf_map_type') return self.make_enum_map(names, 'BPF_MAP_TYPE_') - def get_attach_types(self): - return self.get_enum('bpf_attach_type') + def get_attach_type_map(self): + if not self.attach_types: + names = self.get_enum('bpf_attach_type') + self.attach_types = self.make_enum_map(names, 'BPF_') + return self.attach_types + + def get_cgroup_attach_type_map(self): + if not self.attach_types: + self.get_attach_type_map() + return {name: text for name, text in self.attach_types.items() + if name.startswith('BPF_CGROUP')} class ManPageExtractor(FileExtractor): """ @@ -540,17 +528,6 @@ def main(): verify(source_map_types, bashcomp_map_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') - # Attach types (enum) - - ref = bpf_info.get_attach_types() - bpf_info.close() - - common_info = CommonFileExtractor() - attach_types = common_info.get_attach_types() - - verify(ref, attach_types, - f'Comparing BPF header (enum bpf_attach_type) and {CommonFileExtractor.filename} (attach_type_name):') - # Attach types (names) prog_info = ProgFileExtractor() @@ -569,18 +546,17 @@ def main(): bashcomp_prog_attach_types = bashcomp_info.get_prog_attach_types() verify(source_prog_attach_types, help_prog_attach_types, - f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') + f'Comparing {ProgFileExtractor.filename} (bpf_attach_type) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') verify(source_prog_attach_types, man_prog_attach_types, - f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ManProgExtractor.filename} (ATTACH_TYPE):') + f'Comparing {ProgFileExtractor.filename} (bpf_attach_type) and {ManProgExtractor.filename} (ATTACH_TYPE):') verify(help_prog_options, man_prog_options, f'Comparing {ProgFileExtractor.filename} (do_help() OPTIONS) and {ManProgExtractor.filename} (OPTIONS):') verify(source_prog_attach_types, bashcomp_prog_attach_types, - f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_PROG_ATTACH_TYPES):') + f'Comparing {ProgFileExtractor.filename} (bpf_attach_type) and {BashcompExtractor.filename} (BPFTOOL_PROG_ATTACH_TYPES):') # Cgroup attach types - - source_cgroup_attach_types = set(common_info.get_cgroup_attach_types().values()) - common_info.close() + source_cgroup_attach_types = set(bpf_info.get_cgroup_attach_type_map().values()) + bpf_info.close() cgroup_info = CgroupFileExtractor() help_cgroup_attach_types = 
cgroup_info.get_prog_attach_help() @@ -596,13 +572,13 @@ def main(): bashcomp_info.close() verify(source_cgroup_attach_types, help_cgroup_attach_types, - f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') + f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') verify(source_cgroup_attach_types, man_cgroup_attach_types, - f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') + f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') verify(help_cgroup_options, man_cgroup_options, f'Comparing {CgroupFileExtractor.filename} (do_help() OPTIONS) and {ManCgroupExtractor.filename} (OPTIONS):') verify(source_cgroup_attach_types, bashcomp_cgroup_attach_types, - f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') + f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') # Options for remaining commands -- cgit v1.2.3 From ba5d1b5802d4a732cb563e81ff30b7f514257a99 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:26 +0000 Subject: libbpf: Introduce libbpf_bpf_link_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces a new function, libbpf_bpf_link_type_str, to the public libbpf API. The function allows users to get a string representation for a bpf_link_type enum variant. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-11-deso@posteo.net --- tools/lib/bpf/libbpf.c | 21 +++++++++++++++++++++ tools/lib/bpf/libbpf.h | 9 +++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fa3987bc9172..5afe4cbd684f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -118,6 +118,19 @@ static const char * const attach_type_name[] = { [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", }; +static const char * const link_type_name[] = { + [BPF_LINK_TYPE_UNSPEC] = "unspec", + [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_LINK_TYPE_TRACING] = "tracing", + [BPF_LINK_TYPE_CGROUP] = "cgroup", + [BPF_LINK_TYPE_ITER] = "iter", + [BPF_LINK_TYPE_NETNS] = "netns", + [BPF_LINK_TYPE_XDP] = "xdp", + [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", + [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", + [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", +}; + static const char * const map_type_name[] = { [BPF_MAP_TYPE_UNSPEC] = "unspec", [BPF_MAP_TYPE_HASH] = "hash", @@ -9423,6 +9436,14 @@ const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t) return attach_type_name[t]; } +const char *libbpf_bpf_link_type_str(enum bpf_link_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(link_type_name)) + return NULL; + + return link_type_name[t]; +} + const char *libbpf_bpf_map_type_str(enum bpf_map_type t) { if (t < 0 || t >= ARRAY_SIZE(map_type_name)) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 37a234b141c6..5b34ca5bed42 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -60,6 +60,15 @@ LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); */ LIBBPF_API const char 
*libbpf_bpf_attach_type_str(enum bpf_attach_type t); +/** + * @brief **libbpf_bpf_link_type_str()** converts the provided link type value + * into a textual representation. + * @param t The link type. + * @return Pointer to a static string identifying the link type. NULL is + * returned for unknown **bpf_link_type** values. + */ +LIBBPF_API const char *libbpf_bpf_link_type_str(enum bpf_link_type t); + /** * @brief **libbpf_bpf_map_type_str()** converts the provided map type value * into a textual representation. diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 44c53694c22b..38e284ff057d 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -463,6 +463,7 @@ LIBBPF_0.8.0 { LIBBPF_1.0.0 { global: libbpf_bpf_attach_type_str; + libbpf_bpf_link_type_str; libbpf_bpf_map_type_str; libbpf_bpf_prog_type_str; -- cgit v1.2.3 From dea73da2213a722d952840edd598f2f3a40ddc68 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:27 +0000 Subject: selftests/bpf: Add test for libbpf_bpf_link_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a test for libbpf_bpf_link_type_str. The test retrieves all variants of the bpf_link_type enumeration using BTF and makes sure that the function under test works as expected for them. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-12-deso@posteo.net --- .../testing/selftests/bpf/prog_tests/libbpf_str.c | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c index c88df92868c1..93e9cddaadcf 100644 --- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c @@ -59,6 +59,51 @@ cleanup: btf__free(btf); } +/* + * Test case to check that all bpf_link_type variants are covered by + * libbpf_bpf_link_type_str. + */ +static void test_libbpf_bpf_link_type_str(void) +{ + struct btf *btf; + const struct btf_type *t; + const struct btf_enum *e; + int i, n, id; + + btf = btf__parse("/sys/kernel/btf/vmlinux", NULL); + if (!ASSERT_OK_PTR(btf, "btf_parse")) + return; + + /* find enum bpf_link_type and enumerate each value */ + id = btf__find_by_name_kind(btf, "bpf_link_type", BTF_KIND_ENUM); + if (!ASSERT_GT(id, 0, "bpf_link_type_id")) + goto cleanup; + t = btf__type_by_id(btf, id); + e = btf_enum(t); + n = btf_vlen(t); + for (i = 0; i < n; e++, i++) { + enum bpf_link_type link_type = (enum bpf_link_type)e->val; + const char *link_type_name; + const char *link_type_str; + char buf[256]; + + if (link_type == MAX_BPF_LINK_TYPE) + continue; + + link_type_name = btf__str_by_offset(btf, e->name_off); + link_type_str = libbpf_bpf_link_type_str(link_type); + ASSERT_OK_PTR(link_type_str, link_type_name); + + snprintf(buf, sizeof(buf), "BPF_LINK_TYPE_%s", link_type_str); + uppercase(buf); + + ASSERT_STREQ(buf, link_type_name, "exp_str_value"); + } + +cleanup: + btf__free(btf); +} + /* * Test case to check that all bpf_map_type variants are covered by * libbpf_bpf_map_type_str. 
@@ -151,6 +196,9 @@ void test_libbpf_str(void) if (test__start_subtest("bpf_attach_type_str")) test_libbpf_bpf_attach_type_str(); + if (test__start_subtest("bpf_link_type_str")) + test_libbpf_bpf_link_type_str(); + if (test__start_subtest("bpf_map_type_str")) test_libbpf_bpf_map_type_str(); -- cgit v1.2.3 From c7e7e279dc8367920bddbe96115beef4ec2519a3 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Mon, 23 May 2022 23:04:28 +0000 Subject: bpftool: Use libbpf_bpf_link_type_str MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change switches bpftool over to using the recently introduced libbpf_bpf_link_type_str function instead of maintaining its own string representation for the bpf_link_type enum. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523230428.3077108-13-deso@posteo.net --- tools/bpf/bpftool/link.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 66a25450b598..7a20931c3250 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -13,19 +13,6 @@ #include "json_writer.h" #include "main.h" -static const char * const link_type_name[] = { - [BPF_LINK_TYPE_UNSPEC] = "unspec", - [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", - [BPF_LINK_TYPE_TRACING] = "tracing", - [BPF_LINK_TYPE_CGROUP] = "cgroup", - [BPF_LINK_TYPE_ITER] = "iter", - [BPF_LINK_TYPE_NETNS] = "netns", - [BPF_LINK_TYPE_XDP] = "xdp", - [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", - [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", - [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", -}; - static struct hashmap *link_table; static int link_parse_fd(int *argc, char ***argv) @@ -67,9 +54,12 @@ static int link_parse_fd(int *argc, char ***argv) static void show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr) { + const char *link_type_str; + jsonw_uint_field(wtr, "id", info->id); - if (info->type < ARRAY_SIZE(link_type_name)) - jsonw_string_field(wtr, "type", link_type_name[info->type]); + link_type_str = libbpf_bpf_link_type_str(info->type); + if (link_type_str) + jsonw_string_field(wtr, "type", link_type_str); else jsonw_uint_field(wtr, "type", info->type); @@ -187,9 +177,12 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) static void show_link_header_plain(struct bpf_link_info *info) { + const char *link_type_str; + printf("%u: ", info->id); - if (info->type < ARRAY_SIZE(link_type_name)) - printf("%s ", link_type_name[info->type]); + link_type_str = libbpf_bpf_link_type_str(info->type); + if (link_type_str) + printf("%s ", link_type_str); else printf("type %u ", info->type); -- cgit v1.2.3 From eb7b36ce47f830a01ad9405e673b563cc3638d5d Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sat, 21 May 2022 23:13:29 +0800 Subject: selftests/bpf: Fix test_run logic in fexit_stress.c In the commit da00d2f117a0 ("bpf: Add test ops for BPF_PROG_TYPE_TRACING"), the bpf_fentry_test1 function was moved into bpf_prog_test_run_tracing(), which is the test_run function of the tracing BPF programs. Thus calling 'bpf_prog_test_run_opts(filter_fd, &topts)' will not trigger bpf_fentry_test1 function as filter_fd is a sk_filter BPF program. Fix it by replacing filter_fd with fexit_fd in the bpf_prog_test_run_opts() function. 
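Condensed to its essence (using the names from the diff below), the fix
replaces the ineffective test_run call with one on a fexit prog:

	LIBBPF_OPTS(bpf_test_run_opts, topts);

	/* Before: runs the sk_filter prog; its test_run path never calls
	 * bpf_fentry_test1, so the fexit progs are never exercised.
	 */
	err = bpf_prog_test_run_opts(filter_fd, &topts);

	/* After: runs a fexit prog; the tracing test_run calls
	 * bpf_fentry_test1, the function the fexit progs attach to.
	 */
	err = bpf_prog_test_run_opts(fexit_fd[0], &topts);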
Fixes: da00d2f117a0 ("bpf: Add test ops for BPF_PROG_TYPE_TRACING")
Signed-off-by: Yuntao Wang
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20220521151329.648013-1-ytcoode@gmail.com
---
 .../selftests/bpf/prog_tests/fexit_stress.c | 32 +++-------------------
 1 file changed, 4 insertions(+), 28 deletions(-)
(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
index a7e74297f15f..5a7e6011f6bf 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
@@ -7,11 +7,9 @@
 void serial_test_fexit_stress(void)
 {
-	char test_skb[128] = {};
 	int fexit_fd[CNT] = {};
 	int link_fd[CNT] = {};
-	char error[4096];
-	int err, i, filter_fd;
+	int err, i;
 
 	const struct bpf_insn trace_program[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -20,25 +18,9 @@ void serial_test_fexit_stress(void)
 	LIBBPF_OPTS(bpf_prog_load_opts, trace_opts,
 		.expected_attach_type = BPF_TRACE_FEXIT,
-		.log_buf = error,
-		.log_size = sizeof(error),
 	);
 
-	const struct bpf_insn skb_program[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-
-	LIBBPF_OPTS(bpf_prog_load_opts, skb_opts,
-		.log_buf = error,
-		.log_size = sizeof(error),
-	);
-
-	LIBBPF_OPTS(bpf_test_run_opts, topts,
-		.data_in = test_skb,
-		.data_size_in = sizeof(test_skb),
-		.repeat = 1,
-	);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
 
 	err = libbpf_find_vmlinux_btf_id("bpf_fentry_test1",
 					 trace_opts.expected_attach_type);
@@ -58,15 +40,9 @@ void serial_test_fexit_stress(void)
 		goto out;
 	}
 
-	filter_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
-				  skb_program, sizeof(skb_program) / sizeof(struct bpf_insn),
-				  &skb_opts);
-	if (!ASSERT_GE(filter_fd, 0, "test_program_loaded"))
-		goto out;
+	err = bpf_prog_test_run_opts(fexit_fd[0], &topts);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
 
-	err = bpf_prog_test_run_opts(filter_fd, &topts);
-	close(filter_fd);
-	CHECK_FAIL(err);
 out:
 	for (i = 0; i < CNT; i++) {
 		if (link_fd[i])
-- cgit v1.2.3

From de4b4b94fad90f876ab12e87999109e31a1871b4 Mon Sep 17 00:00:00 2001
From: Michael Mullin
Date: Mon, 23 May 2022 15:49:17 -0400
Subject: bpftool: Check for NULL ptr of btf in codegen_asserts

bpf_object__btf() can return a NULL value. If bpf_object__btf returns
null, do not progress through codegen_asserts(). This avoids a null ptr
dereference at the btf__type_cnt() call in find_type_for_map().

Signed-off-by: Michael Mullin
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20220523194917.igkgorco42537arb@jup
---
 tools/bpf/bpftool/gen.c | 3 +++
 1 file changed, 3 insertions(+)
(limited to 'tools')

diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 4c9477ff748d..f158dc1c2149 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -474,6 +474,9 @@ static void codegen_asserts(struct bpf_object *obj, const char *obj_name)
 	const struct btf_type *sec;
 	char map_ident[256], var_ident[256];
 
+	if (!btf)
+		return;
+
 	codegen("\
 		\n\
 		__attribute__((unused)) static void \n\
-- cgit v1.2.3

From 610cd93b44ce1e7323878ef5f7b23dc10c7d45b7 Mon Sep 17 00:00:00 2001
From: Douglas Raillard
Date: Tue, 24 May 2022 10:44:47 +0100
Subject: libbpf: Fix determine_ptr_size() guessing

One strategy employed by libbpf to guess the pointer size is to find the
size of the "unsigned long" type. This is achieved by looking for a type
with the expected name and checking its size.
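For context, the check being replaced matched only two spellings
(condensed from the diff below):

	if (strcmp(name, "long int") == 0 ||
	    strcmp(name, "long unsigned int") == 0) {
		if (t->size != 4 && t->size != 8)
			continue;
		return t->size;
	}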
Unfortunately, the C syntax is friendlier to humans than to computers
as there is some variety in how such a type can be named. Specifically,
gcc and clang do not use the same names for integer types in debug info:

  - clang uses "unsigned long"
  - gcc uses "long unsigned int"

Look up all the names for such a type so that libbpf can hope to find
the information it wants.

Signed-off-by: Douglas Raillard
Signed-off-by: Andrii Nakryiko
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20220524094447.332186-1-douglas.raillard@arm.com
---
 tools/lib/bpf/btf.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)
(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index bb1e06eb1eca..3d6c30d9a575 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -472,9 +472,22 @@ const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id)
 
 static int determine_ptr_size(const struct btf *btf)
 {
+	static const char * const long_aliases[] = {
+		"long",
+		"long int",
+		"int long",
+		"unsigned long",
+		"long unsigned",
+		"unsigned long int",
+		"unsigned int long",
+		"long unsigned int",
+		"long int unsigned",
+		"int unsigned long",
+		"int long unsigned",
+	};
 	const struct btf_type *t;
 	const char *name;
-	int i, n;
+	int i, j, n;
 
 	if (btf->base_btf && btf->base_btf->ptr_sz > 0)
 		return btf->base_btf->ptr_sz;
@@ -485,15 +498,16 @@ static int determine_ptr_size(const struct btf *btf)
 		if (!btf_is_int(t))
 			continue;
 
+		if (t->size != 4 && t->size != 8)
+			continue;
+
 		name = btf__name_by_offset(btf, t->name_off);
 		if (!name)
 			continue;
 
-		if (strcmp(name, "long int") == 0 ||
-		    strcmp(name, "long unsigned int") == 0) {
-			if (t->size != 4 && t->size != 8)
-				continue;
-			return t->size;
+		for (j = 0; j < ARRAY_SIZE(long_aliases); j++) {
+			if (strcmp(name, long_aliases[j]) == 0)
+				return t->size;
 		}
 	}
 
-- cgit v1.2.3

From 9bbdfad8a5192cfa698994dd8db3a52b137e7478 Mon Sep 17 00:00:00 2001
From: Daniel Müller
Date: Wed, 1 Jun 2022 15:40:25 +0000
Subject: libbpf: Fix a couple of typos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change fixes a couple of typos that were encountered while studying
the source code.

Signed-off-by: Daniel Müller
Signed-off-by: Daniel Borkmann
Signed-off-by: Andrii Nakryiko
Acked-by: Song Liu
Link: https://lore.kernel.org/bpf/20220601154025.3295035-1-deso@posteo.net
---
 tools/lib/bpf/btf.c       | 2 +-
 tools/lib/bpf/libbpf.h    | 2 +-
 tools/lib/bpf/relo_core.c | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)
(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 3d6c30d9a575..2e9c23bcdc67 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -130,7 +130,7 @@ static inline __u64 ptr_to_u64(const void *ptr)
 
 /* Ensure given dynamically allocated memory region pointed to by *data* with
  * capacity of *cap_cnt* elements each taking *elem_sz* bytes has enough
- * memory to accomodate *add_cnt* new elements, assuming *cur_cnt* elements
+ * memory to accommodate *add_cnt* new elements, assuming *cur_cnt* elements
 * are already used. At most *max_cnt* elements can be ever allocated.
 * If necessary, memory is reallocated and all existing data is copied over,
 * new pointer to the memory region is stored at *data, new memory region
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 5b34ca5bed42..fa27969da0da 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -107,7 +107,7 @@ struct bpf_object_open_attr {
 };
 
 struct bpf_object_open_opts {
-	/* size of this struct, for forward/backward compatiblity */
+	/* size of this struct, for forward/backward compatibility */
 	size_t sz;
 	/* object name override, if provided:
 	 * - for object open from file, this will override setting object
diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index ba4453dfd1ed..d8ab4c61cb61 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -167,7 +167,7 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind)
 * just a parsed access string representation): [0, 1, 2, 3].
 *
 * High-level spec will capture only 3 points:
- * - intial zero-index access by pointer (&s->... is the same as &s[0]...);
+ * - initial zero-index access by pointer (&s->... is the same as &s[0]...);
 * - field 'a' access (corresponds to '2' in low-level spec);
 * - array element #3 access (corresponds to '3' in low-level spec).
 *
@@ -1148,11 +1148,11 @@ int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *s
 * 3. It is supported and expected that there might be multiple flavors
 * matching the spec. As long as all the specs resolve to the same set of
 * offsets across all candidates, there is no error. If there is any
- * ambiguity, CO-RE relocation will fail. This is necessary to accomodate
- * imprefection of BTF deduplication, which can cause slight duplication of
+ * ambiguity, CO-RE relocation will fail. This is necessary to accommodate
+ * imperfection of BTF deduplication, which can cause slight duplication of
 * the same BTF type, if some directly or indirectly referenced (by
 * pointer) type gets resolved to different actual types in different
- * object files. If such situation occurs, deduplicated BTF will end up
+ * object files. If such a situation occurs, deduplicated BTF will end up
 * with two (or more) structurally identical types, which differ only in
 * types they refer to through pointer. This should be OK in most cases and
 * is not an error.
-- cgit v1.2.3

From e6ff92f41b65fce07365f1066fb13b5e42aca08d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 1 Jun 2022 16:40:50 -0700
Subject: selftests/bpf: Fix tc_redirect_dtime

tc_redirect_dtime was reported flaky from time to time. It always fails
at the udp test and complains that the bpf@tc-ingress prog got a
skb->tstamp when handling a udp packet. This is unexpected because the
skb->tstamp should have been cleared when crossing different netns.

The most likely cause is that the skb is actually a tcp packet from the
earlier tcp test. It could be the final TCP_FIN handling.

This patch tightens the skb->tstamp check in the bpf prog. It ensures
the skb is the current testing traffic. First, it checks that the skb
matches the IPPROTO of the running test (i.e. tcp vs udp). Second, it
checks the server port (dst_ns_port). The server port is unique for
each test (50000 + test_enum).
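Condensed from the BPF prog change below, the tightened match amounts
to:

	__u16 dst_ns_port = __bpf_htons(50000 + test);

	if (inet_proto != get_proto())
		return 0;	/* traffic left over from another test */
	if (!((ns == SRC_NS && dport == dst_ns_port) ||
	      (ns == DST_NS && sport == dst_ns_port)))
		return 0;	/* not the flow under test */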
Also fixed a typo in test_udp_dtime(): s/P100/P101/ Fixes: c803475fd8dd ("bpf: selftests: test skb->tstamp in redirect_neigh") Reported-by: Andrii Nakryiko Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220601234050.2572671-1-kafai@fb.com --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 8 ++-- tools/testing/selftests/bpf/progs/test_tc_dtime.c | 53 +++++++++++++++++++++- 2 files changed, 55 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 958dae769c52..cb6a53b3e023 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -646,7 +646,7 @@ static void test_tcp_clear_dtime(struct test_tc_dtime *skel) __u32 *errs = skel->bss->errs[t]; skel->bss->test = t; - test_inet_dtime(AF_INET6, SOCK_STREAM, IP6_DST, 0); + test_inet_dtime(AF_INET6, SOCK_STREAM, IP6_DST, 50000 + t); ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0, dtime_cnt_str(t, INGRESS_FWDNS_P100)); @@ -683,7 +683,7 @@ static void test_tcp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd) errs = skel->bss->errs[t]; skel->bss->test = t; - test_inet_dtime(family, SOCK_STREAM, addr, 0); + test_inet_dtime(family, SOCK_STREAM, addr, 50000 + t); /* fwdns_prio100 prog does not read delivery_time_type, so * kernel puts the (rcv) timetamp in __sk_buff->tstamp @@ -715,13 +715,13 @@ static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd) errs = skel->bss->errs[t]; skel->bss->test = t; - test_inet_dtime(family, SOCK_DGRAM, addr, 0); + test_inet_dtime(family, SOCK_DGRAM, addr, 50000 + t); ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0, dtime_cnt_str(t, INGRESS_FWDNS_P100)); /* non mono delivery time is not forwarded */ ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0, - dtime_cnt_str(t, INGRESS_FWDNS_P100)); + dtime_cnt_str(t, INGRESS_FWDNS_P101)); for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++) ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i)); diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c index 06f300d06dbd..b596479a9ebe 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -115,6 +117,19 @@ static bool bpf_fwd(void) return test < TCP_IP4_RT_FWD; } +static __u8 get_proto(void) +{ + switch (test) { + case UDP_IP4: + case UDP_IP6: + case UDP_IP4_RT_FWD: + case UDP_IP6_RT_FWD: + return IPPROTO_UDP; + default: + return IPPROTO_TCP; + } +} + /* -1: parse error: TC_ACT_SHOT * 0: not testing traffic: TC_ACT_OK * >0: first byte is the inet_proto, second byte has the netns @@ -122,11 +137,16 @@ static bool bpf_fwd(void) */ static int skb_get_type(struct __sk_buff *skb) { + __u16 dst_ns_port = __bpf_htons(50000 + test); void *data_end = ctx_ptr(skb->data_end); void *data = ctx_ptr(skb->data); __u8 inet_proto = 0, ns = 0; struct ipv6hdr *ip6h; + __u16 sport, dport; struct iphdr *iph; + struct tcphdr *th; + struct udphdr *uh; + void *trans; switch (skb->protocol) { case __bpf_htons(ETH_P_IP): @@ -138,6 +158,7 @@ static int skb_get_type(struct __sk_buff *skb) else if (iph->saddr == ip4_dst) ns = DST_NS; inet_proto = iph->protocol; + trans = iph + 1; break; case __bpf_htons(ETH_P_IPV6): ip6h = data + sizeof(struct ethhdr); @@ -148,15 +169,43 @@ static int 
skb_get_type(struct __sk_buff *skb) else if (v6_equal(ip6h->saddr, (struct in6_addr)ip6_dst)) ns = DST_NS; inet_proto = ip6h->nexthdr; + trans = ip6h + 1; break; default: return 0; } - if ((inet_proto != IPPROTO_TCP && inet_proto != IPPROTO_UDP) || !ns) + /* skb is not from src_ns or dst_ns. + * skb is not the testing IPPROTO. + */ + if (!ns || inet_proto != get_proto()) return 0; - return (ns << 8 | inet_proto); + switch (inet_proto) { + case IPPROTO_TCP: + th = trans; + if (th + 1 > data_end) + return -1; + sport = th->source; + dport = th->dest; + break; + case IPPROTO_UDP: + uh = trans; + if (uh + 1 > data_end) + return -1; + sport = uh->source; + dport = uh->dest; + break; + default: + return 0; + } + + /* The skb is the testing traffic */ + if ((ns == SRC_NS && dport == dst_ns_port) || + (ns == DST_NS && sport == dst_ns_port)) + return (ns << 8 | inet_proto); + + return 0; } /* format: direction@iface@netns -- cgit v1.2.3 From 611edf1bacc51355ccb497915695db7f869cb382 Mon Sep 17 00:00:00 2001 From: Yuze Chi Date: Thu, 2 Jun 2022 22:51:56 -0700 Subject: libbpf: Fix is_pow_of_2 Move the correct definition from linker.c into libbpf_internal.h. Fixes: 0087a681fa8c ("libbpf: Automatically fix up BPF_MAP_TYPE_RINGBUF size, if necessary") Reported-by: Yuze Chi Signed-off-by: Yuze Chi Signed-off-by: Ian Rogers Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220603055156.2830463-1-irogers@google.com --- tools/lib/bpf/libbpf.c | 5 ----- tools/lib/bpf/libbpf_internal.h | 5 +++++ tools/lib/bpf/linker.c | 5 ----- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5afe4cbd684f..b03165687936 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5071,11 +5071,6 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) static void bpf_map__destroy(struct bpf_map *map); -static bool is_pow_of_2(size_t x) -{ - return x && (x & (x - 1)); -} - static size_t adjust_ringbuf_sz(size_t sz) { __u32 page_sz = sysconf(_SC_PAGE_SIZE); diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 4abdbe2fea9d..ef5d975078e5 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -580,4 +580,9 @@ struct bpf_link * usdt_manager_attach_usdt(struct usdt_manager *man, const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie); +static inline bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 9aa016fb55aa..85c0fddf55d1 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -697,11 +697,6 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, return err; } -static bool is_pow_of_2(size_t x) -{ - return x && (x & (x - 1)) == 0; -} - static int linker_sanity_check_elf(struct src_obj *obj) { struct src_sec *sec; -- cgit v1.2.3 From 02f4afebf8a54ba16f99f4f6ca10df3efeac6229 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 2 Jun 2022 11:25:07 +0800 Subject: selftests/bpf: Add drv mode testing for xdping As subject, we only test SKB mode for xdping at present. Now add DRV mode for xdping. 
Signed-off-by: Hangbin Liu
Signed-off-by: Andrii Nakryiko
Acked-by: Song Liu
Link: https://lore.kernel.org/bpf/20220602032507.464453-1-liuhangbin@gmail.com
---
 tools/testing/selftests/bpf/test_xdping.sh | 4 ++++
 1 file changed, 4 insertions(+)
(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_xdping.sh b/tools/testing/selftests/bpf/test_xdping.sh
index c2f0ddb45531..c3d82e0a7378 100755
--- a/tools/testing/selftests/bpf/test_xdping.sh
+++ b/tools/testing/selftests/bpf/test_xdping.sh
@@ -95,5 +95,9 @@ for server_args in "" "-I veth0 -s -S" ; do
 	test "$client_args" "$server_args"
 done
 
+# Test drv mode
+test "-I veth1 -N" "-I veth0 -s -N"
+test "-I veth1 -N -c 10" "-I veth0 -s -N"
+
 echo "OK. All tests passed"
 exit 0
-- cgit v1.2.3

From 6089fb325cf737eeb2c4d236c94697112ca860da Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Mon, 6 Jun 2022 23:26:00 -0700
Subject: bpf: Add btf enum64 support

Currently, BTF only supports up to 32-bit enum values with
BTF_KIND_ENUM. But in the kernel, some enums indeed have 64-bit values,
e.g., in uapi bpf.h, we have

  enum {
	BPF_F_INDEX_MASK	= 0xffffffffULL,
	BPF_F_CURRENT_CPU	= BPF_F_INDEX_MASK,
	BPF_F_CTXLEN_MASK	= (0xfffffULL << 32),
  };

In this case, BTF_KIND_ENUM will encode the value of BPF_F_CTXLEN_MASK
as 0, which certainly is incorrect.

This patch adds a new btf kind, BTF_KIND_ENUM64, which permits 64-bit
values to cover the above use case. BTF_KIND_ENUM64 carries the
following three fields after the common type:

  struct btf_enum64 {
	__u32 name_off;
	__u32 val_lo32;
	__u32 val_hi32;
  };

Currently, the btf type section has an alignment of 4, as all element
types are u32. Representing the value with __u64 would introduce a pad
for btf_enum64 and may also introduce misalignment for the 64-bit value.
Hence, the two members val_hi32 and val_lo32 are chosen to avoid these
issues.

The kflag is also introduced for BTF_KIND_ENUM and BTF_KIND_ENUM64 to
indicate whether the value is signed or unsigned. The kflag intends to
keep the output of the BTF C format consistent with the original source
code. For example, suppose the original BTF_KIND_ENUM value is
0xffffffff. The C format has two choices, printing 0xffffffff or -1, and
current libbpf prints it as an unsigned value. But if the signedness is
preserved in btf, the value can be printed the same way as in the
original source code. A kflag value of 0 means unsigned, which is
consistent with libbpf's default and should cover most cases as well.

The new BTF_KIND_ENUM64 is intended to support enum values that need a
64-bit representation, but it can represent all BTF_KIND_ENUM values as
well. The compiler ([1]) and pahole will generate BTF_KIND_ENUM64 only
if the value has to be represented with 64 bits.

In addition, a static inline function btf_kind_core_compat() is
introduced, which will be used later when libbpf's relo_core.c is
changed. Here the kernel shares the same relo_core.c with libbpf.
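As an illustration (not part of the patch text), BPF_F_CTXLEN_MASK from
the example above splits into the two 32-bit halves like this, and the
new btf_enum64_value() helper in the diff reassembles it the same way:

	/* BPF_F_CTXLEN_MASK == 0xfffffULL << 32 == 0x000fffff00000000 */
	struct btf_enum64 e = {
		.val_lo32 = 0x00000000,	/* low 32 bits */
		.val_hi32 = 0x000fffff,	/* high 32 bits */
	};
	__u64 val = ((__u64)e.val_hi32 << 32) | e.val_lo32;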
[1] https://reviews.llvm.org/D124641 Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062600.3716578-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 28 ++++++++ include/uapi/linux/btf.h | 17 ++++- kernel/bpf/btf.c | 142 +++++++++++++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 2 +- tools/include/uapi/linux/btf.h | 17 ++++- 5 files changed, 185 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/include/linux/btf.h b/include/linux/btf.h index 2611cea2c2b6..1bfed7fa0428 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -177,6 +177,19 @@ static inline bool btf_type_is_enum(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } +static inline bool btf_is_any_enum(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM || + BTF_INFO_KIND(t->info) == BTF_KIND_ENUM64; +} + +static inline bool btf_kind_core_compat(const struct btf_type *t1, + const struct btf_type *t2) +{ + return BTF_INFO_KIND(t1->info) == BTF_INFO_KIND(t2->info) || + (btf_is_any_enum(t1) && btf_is_any_enum(t2)); +} + static inline bool str_is_empty(const char *s) { return !s || !s[0]; @@ -192,6 +205,16 @@ static inline bool btf_is_enum(const struct btf_type *t) return btf_kind(t) == BTF_KIND_ENUM; } +static inline bool btf_is_enum64(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM64; +} + +static inline u64 btf_enum64_value(const struct btf_enum64 *e) +{ + return ((u64)e->val_hi32 << 32) | e->val_lo32; +} + static inline bool btf_is_composite(const struct btf_type *t) { u16 kind = btf_kind(t); @@ -332,6 +355,11 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t) return (struct btf_enum *)(t + 1); } +static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) +{ + return (struct btf_enum64 *)(t + 1); +} + static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index a9162a6c0284..ec1798b6d3ff 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -36,10 +36,10 @@ struct btf_type { * bits 24-28: kind (e.g. int, ptr, array...etc) * bits 29-30: unused * bit 31: kind_flag, currently used by - * struct, union and fwd + * struct, union, enum, fwd and enum64 */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, @@ -63,7 +63,7 @@ enum { BTF_KIND_ARRAY = 3, /* Array */ BTF_KIND_STRUCT = 4, /* Struct */ BTF_KIND_UNION = 5, /* Union */ - BTF_KIND_ENUM = 6, /* Enumeration */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ BTF_KIND_FWD = 7, /* Forward */ BTF_KIND_TYPEDEF = 8, /* Typedef */ BTF_KIND_VOLATILE = 9, /* Volatile */ @@ -76,6 +76,7 @@ enum { BTF_KIND_FLOAT = 16, /* Floating point */ BTF_KIND_DECL_TAG = 17, /* Decl Tag */ BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, @@ -186,4 +187,14 @@ struct btf_decl_tag { __s32 component_idx; }; +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). 
+ */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7bccaa4646e5..6c0d8480e15c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) @@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: return true; } @@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) return (const struct btf_decl_tag *)(t + 1); } +static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) +{ + return (const struct btf_enum64 *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -1019,6 +1026,7 @@ static const char *btf_show_name(struct btf_show *show) parens = "{"; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: prefix = "enum"; break; default: @@ -1834,6 +1842,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: size = type->size; goto resolved; @@ -3670,6 +3679,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; + const char *fmt_str; u16 i, nr_enums; u32 meta_needed; @@ -3683,11 +3693,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; @@ -3718,7 +3723,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, if (env->log.level == BPF_LOG_KERNEL) continue; - btf_verifier_log(env, "\t%s val=%d\n", + fmt_str = btf_type_kflag(t) ? 
"\t%s val=%d\n" : "\t%s val=%u\n"; + btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -3759,7 +3765,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, return; } - btf_show_type_value(show, "%d", v); + if (btf_type_kflag(t)) + btf_show_type_value(show, "%d", v); + else + btf_show_type_value(show, "%u", v); btf_show_end_type(show); } @@ -3772,6 +3781,109 @@ static struct btf_kind_operations enum_ops = { .show = btf_enum_show, }; +static s32 btf_enum64_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + struct btf *btf = env->btf; + const char *fmt_str; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); + return -EINVAL; + } + + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (env->log.level == BPF_LOG_KERNEL) + continue; + + fmt_str = btf_type_kflag(t) ? 
"\t%s val=%lld\n" : "\t%s val=%llu\n"; + btf_verifier_log(env, fmt_str, + __btf_name_by_offset(btf, enums[i].name_off), + btf_enum64_value(enums + i)); + } + + return meta_needed; +} + +static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + u32 i, nr_enums = btf_type_vlen(t); + void *safe_data; + s64 v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(u64 *)safe_data; + + for (i = 0; i < nr_enums; i++) { + if (v != btf_enum64_value(enums + i)) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; + } + + if (btf_type_kflag(t)) + btf_show_type_value(show, "%lld", v); + else + btf_show_type_value(show, "%llu", v); + btf_show_end_type(show); +} + +static struct btf_kind_operations enum64_ops = { + .check_meta = btf_enum64_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, + .log_details = btf_enum_log, + .show = btf_enum64_show, +}; + static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -4438,6 +4550,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, + [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -5299,7 +5412,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5763,7 +5876,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t)) return t->size; *bad_type = t; return -EINVAL; @@ -5911,7 +6024,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, * to context only. And only global functions can be replaced. * Hence type check only those types. */ - if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, @@ -6408,7 +6521,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { bpf_log(log, "Global function %s() doesn't return scalar. 
Only those are supported.\n", tname); @@ -6423,7 +6536,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (btf_type_is_int(t) || btf_is_any_enum(t)) { reg->type = SCALAR_VALUE; continue; } @@ -7335,6 +7448,7 @@ recur: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_ENUM64: return 1; case BTF_KIND_INT: /* just reject deprecated bitfield-like integers; all other @@ -7387,10 +7501,10 @@ recur: * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: - * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); - * - for ENUMs, the size is ignored; + * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aedac2ac02b9..2d2872682278 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10901,7 +10901,7 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = - btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); goto err_free; diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index a9162a6c0284..ec1798b6d3ff 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -36,10 +36,10 @@ struct btf_type { * bits 24-28: kind (e.g. int, ptr, array...etc) * bits 29-30: unused * bit 31: kind_flag, currently used by - * struct, union and fwd + * struct, union, enum, fwd and enum64 */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, @@ -63,7 +63,7 @@ enum { BTF_KIND_ARRAY = 3, /* Array */ BTF_KIND_STRUCT = 4, /* Struct */ BTF_KIND_UNION = 5, /* Union */ - BTF_KIND_ENUM = 6, /* Enumeration */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ BTF_KIND_FWD = 7, /* Forward */ BTF_KIND_TYPEDEF = 8, /* Typedef */ BTF_KIND_VOLATILE = 9, /* Volatile */ @@ -76,6 +76,7 @@ enum { BTF_KIND_FLOAT = 16, /* Floating point */ BTF_KIND_DECL_TAG = 17, /* Decl Tag */ BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, @@ -186,4 +187,14 @@ struct btf_decl_tag { __s32 component_idx; }; +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). 
+ */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ -- cgit v1.2.3 From 776281652ddcd98bb292335ea3634da341c849b4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:05 -0700 Subject: libbpf: Permit 64bit relocation value Currently, the libbpf limits the relocation value to be 32bit since all current relocations have such a limit. But with BTF_KIND_ENUM64 support, the enum value could be 64bit. So let us permit 64bit relocation value in libbpf. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062605.3716779-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/relo_core.c | 49 ++++++++++++++++++++++++++--------------------- tools/lib/bpf/relo_core.h | 4 ++-- 2 files changed, 29 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index d8ab4c61cb61..0dce5644877b 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -583,7 +583,7 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, static int bpf_core_calc_field_relo(const char *prog_name, const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val, __u32 *field_sz, __u32 *type_id, + __u64 *val, __u32 *field_sz, __u32 *type_id, bool *validate) { const struct bpf_core_accessor *acc; @@ -708,7 +708,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val, bool *validate) + __u64 *val, bool *validate) { __s64 sz; @@ -751,7 +751,7 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val) + __u64 *val) { const struct btf_type *t; const struct btf_enum *e; @@ -929,7 +929,7 @@ int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, int insn_idx, const struct bpf_core_relo *relo, int relo_idx, const struct bpf_core_relo_res *res) { - __u32 orig_val, new_val; + __u64 orig_val, new_val; __u8 class; class = BPF_CLASS(insn->code); @@ -954,28 +954,30 @@ poison: if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; if (res->validate && insn->imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", + pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %llu -> %llu\n", prog_name, relo_idx, - insn_idx, insn->imm, orig_val, new_val); + insn_idx, insn->imm, (unsigned long long)orig_val, + (unsigned long long)new_val); return -EINVAL; } orig_val = insn->imm; insn->imm = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", + pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %llu -> %llu\n", prog_name, relo_idx, insn_idx, - orig_val, new_val); + (unsigned long long)orig_val, (unsigned long long)new_val); break; case BPF_LDX: case BPF_ST: case BPF_STX: if (res->validate && insn->off != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", - prog_name, relo_idx, insn_idx, insn->off, orig_val, new_val); + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, insn_idx, insn->off, (unsigned long long)orig_val, + (unsigned long long)new_val); return -EINVAL; } if (new_val > SHRT_MAX) { - 
pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", - prog_name, relo_idx, insn_idx, new_val); + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)new_val); return -ERANGE; } if (res->fail_memsz_adjust) { @@ -987,8 +989,9 @@ poison: orig_val = insn->off; insn->off = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", - prog_name, relo_idx, insn_idx, orig_val, new_val); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %llu -> %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)orig_val, + (unsigned long long)new_val); if (res->new_sz != res->orig_sz) { int insn_bytes_sz, insn_bpf_sz; @@ -1026,18 +1029,18 @@ poison: imm = insn[0].imm + ((__u64)insn[1].imm << 32); if (res->validate && imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %llu -> %llu\n", prog_name, relo_idx, insn_idx, (unsigned long long)imm, - orig_val, new_val); + (unsigned long long)orig_val, (unsigned long long)new_val); return -EINVAL; } insn[0].imm = new_val; - insn[1].imm = 0; /* currently only 32-bit values are supported */ - pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", + insn[1].imm = new_val >> 32; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %llu\n", prog_name, relo_idx, insn_idx, - (unsigned long long)imm, new_val); + (unsigned long long)imm, (unsigned long long)new_val); break; } default: @@ -1261,10 +1264,12 @@ int bpf_core_calc_relo_insn(const char *prog_name, * decision and value, otherwise it's dangerous to * proceed due to ambiguity */ - pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", + pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %llu != %s %llu\n", prog_name, relo_idx, - cand_res.poison ? "failure" : "success", cand_res.new_val, - targ_res->poison ? "failure" : "success", targ_res->new_val); + cand_res.poison ? "failure" : "success", + (unsigned long long)cand_res.new_val, + targ_res->poison ? "failure" : "success", + (unsigned long long)targ_res->new_val); return -EINVAL; } diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 073039d8ca4f..7df0da082f2c 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -46,9 +46,9 @@ struct bpf_core_spec { struct bpf_core_relo_res { /* expected value in the instruction, unless validate == false */ - __u32 orig_val; + __u64 orig_val; /* new value that needs to be patched up to */ - __u32 new_val; + __u64 new_val; /* relocation unsuccessful, poison instruction, but don't fail load */ bool poison; /* some relocations can't be validated against orig_val */ -- cgit v1.2.3 From b58b2b3a31228bd9aaed9b96e9452dafd0d46024 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:10 -0700 Subject: libbpf: Fix an error in 64bit relocation value computation Currently, the 64bit relocation value in the instruction is computed as follows: __u64 imm = insn[0].imm + ((__u64)insn[1].imm << 32) Suppose insn[0].imm = -1 (0xffffffff) and insn[1].imm = 1. With the above computation, insn[0].imm will first sign-extend to 64bit -1 (0xffffffffFFFFFFFF) and then add 0x1FFFFFFFF, producing incorrect value 0xFFFFFFFF. The correct value should be 0x1FFFFFFFF. 
Changing insn[0].imm to __u32 first will prevent the 64-bit sign extension and fix the issue. Merging the high and low 32-bit values is also changed from '+' to '|' to be consistent with other similar occurrences in the kernel and libbpf. Acked-by: Andrii Nakryiko Acked-by: Dave Marchevsky Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062610.3717378-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/relo_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index 0dce5644877b..073a54ed7432 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -1027,7 +1027,7 @@ poison: return -EINVAL; } - imm = insn[0].imm + ((__u64)insn[1].imm << 32); + imm = (__u32)insn[0].imm | ((__u64)insn[1].imm << 32); if (res->validate && imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %llu -> %llu\n", prog_name, relo_idx, -- cgit v1.2.3 From 8479aa752226bea40b24db277f1f10ab2726f8bb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:15 -0700 Subject: libbpf: Refactor btf__add_enum() for future code sharing Refactor the btf__add_enum() function to create a separate function btf_add_enum_common() so that the common function can later be used to add the enum64 btf type. There is no functionality change in this patch. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062615.3718063-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 2e9c23bcdc67..1972d0c65e0c 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2129,20 +2129,8 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, return 0; } -/* - * Append new BTF_KIND_ENUM type with: - * - *name* - name of the enum, can be NULL or empty for anonymous enums; - * - *byte_sz* - size of the enum, in bytes. - * - * Enum initially has no enum values in it (and corresponds to enum forward - * declaration). Enumerator values can be added by btf__add_enum_value() - * immediately after btf__add_enum() succeeds. - * - * Returns: - * - >0, type ID of newly added BTF type; - * - <0, on error. - */ -int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) +static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed, __u8 kind) { struct btf_type *t; int sz, name_off = 0; @@ -2167,12 +2155,30 @@ int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) /* start out with vlen=0; it will be adjusted when adding enum values */ t->name_off = name_off; - t->info = btf_type_info(BTF_KIND_ENUM, 0, 0); + t->info = btf_type_info(kind, 0, is_signed); t->size = byte_sz; return btf_commit_type(btf, sz); } +/* + * Append new BTF_KIND_ENUM type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum_value() + * immediately after btf__add_enum() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error.
+ */ +int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) +{ + return btf_add_enum_common(btf, name, byte_sz, false, BTF_KIND_ENUM); +} + /* * Append new enum value for the current ENUM type with: * - *name* - name of the enumerator value, can't be NULL or empty; -- cgit v1.2.3 From dffbbdc2d9889670c30e07d05fc0dd712e8ad430 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:21 -0700 Subject: libbpf: Add enum64 parsing and new enum64 public API Add enum64 parsing support and two new enum64 public APIs: btf__add_enum64 btf__add_enum64_value Also add support of signedness for BTF_KIND_ENUM. The BTF_KIND_ENUM API signatures are not changed. The signedness will be changed from unsigned to signed if btf__add_enum_value() finds any negative values. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062621.3719391-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/btf.h | 12 ++++++ tools/lib/bpf/libbpf.map | 2 + 3 files changed, 117 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 1972d0c65e0c..8676efb2baba 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -305,6 +305,8 @@ static int btf_type_size(const struct btf_type *t) return base_size + sizeof(__u32); case BTF_KIND_ENUM: return base_size + vlen * sizeof(struct btf_enum); + case BTF_KIND_ENUM64: + return base_size + vlen * sizeof(struct btf_enum64); case BTF_KIND_ARRAY: return base_size + sizeof(struct btf_array); case BTF_KIND_STRUCT: @@ -334,6 +336,7 @@ static void btf_bswap_type_base(struct btf_type *t) static int btf_bswap_type_rest(struct btf_type *t) { struct btf_var_secinfo *v; + struct btf_enum64 *e64; struct btf_member *m; struct btf_array *a; struct btf_param *p; @@ -361,6 +364,13 @@ static int btf_bswap_type_rest(struct btf_type *t) e->val = bswap_32(e->val); } return 0; + case BTF_KIND_ENUM64: + for (i = 0, e64 = btf_enum64(t); i < vlen; i++, e64++) { + e64->name_off = bswap_32(e64->name_off); + e64->val_lo32 = bswap_32(e64->val_lo32); + e64->val_hi32 = bswap_32(e64->val_hi32); + } + return 0; case BTF_KIND_ARRAY: a = btf_array(t); a->type = bswap_32(a->type); @@ -611,6 +621,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: size = t->size; @@ -658,6 +669,7 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FLOAT: return min(btf_ptr_sz(btf), (size_t)t->size); case BTF_KIND_PTR: @@ -2176,6 +2188,10 @@ static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, */ int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) { + /* + * set the signedness to be unsigned, it will change to signed + * if any later enumerator is negative. 
+ */ return btf_add_enum_common(btf, name, byte_sz, false, BTF_KIND_ENUM); } @@ -2226,6 +2242,82 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) t = btf_last_type(btf); btf_type_inc_vlen(t); + /* if negative value, set signedness to signed */ + if (value < 0) + t->info = btf_type_info(btf_kind(t), btf_vlen(t), true); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_ENUM64 type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * - *is_signed* - whether the enum values are signed or not; + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum64_value() + * immediately after btf__add_enum64() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum64(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed) +{ + return btf_add_enum_common(btf, name, byte_sz, is_signed, + BTF_KIND_ENUM64); +} + +/* + * Append new enum value for the current ENUM64 type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) +{ + struct btf_enum64 *v; + struct btf_type *t; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM64 */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum64(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum64); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val_lo32 = (__u32)value; + v->val_hi32 = value >> 32; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + btf->hdr->type_len += sz; btf->hdr->str_off += sz; return 0; @@ -4737,6 +4829,7 @@ int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ct case BTF_KIND_INT: case BTF_KIND_FLOAT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: return 0; case BTF_KIND_FWD: @@ -4831,6 +4924,16 @@ int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ct } break; } + case BTF_KIND_ENUM64: { + struct btf_enum64 *m = btf_enum64(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } case BTF_KIND_FUNC_PROTO: { struct btf_param *m = btf_params(t); diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 951ac7475794..a41463bf9060 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -215,6 +215,8 @@ LIBBPF_API int btf__add_field(struct btf *btf, const char *name, int field_type_ /* enum construction APIs */ LIBBPF_API int btf__add_enum(struct btf *btf, const char *name, __u32 bytes_sz); LIBBPF_API int btf__add_enum_value(struct btf *btf, const char *name, __s64 value); +LIBBPF_API int btf__add_enum64(struct btf *btf, const char *name, __u32 bytes_sz, bool is_signed); +LIBBPF_API int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value); enum btf_fwd_kind { 
BTF_FWD_STRUCT = 0, @@ -454,6 +456,11 @@ static inline bool btf_is_enum(const struct btf_type *t) return btf_kind(t) == BTF_KIND_ENUM; } +static inline bool btf_is_enum64(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM64; +} + static inline bool btf_is_fwd(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_FWD; @@ -549,6 +556,11 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t) return (struct btf_enum *)(t + 1); } +static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) +{ + return (struct btf_enum64 *)(t + 1); +} + static inline struct btf_member *btf_members(const struct btf_type *t) { return (struct btf_member *)(t + 1); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 38e284ff057d..116a2a8ee7c2 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -462,6 +462,8 @@ LIBBPF_0.8.0 { LIBBPF_1.0.0 { global: + btf__add_enum64; + btf__add_enum64_value; libbpf_bpf_attach_type_str; libbpf_bpf_link_type_str; libbpf_bpf_map_type_str; -- cgit v1.2.3 From 2ef2026349cf388b25601a094e99542ed536fde3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:26 -0700 Subject: libbpf: Add enum64 deduplication support Add enum64 deduplication support. BTF_KIND_ENUM64 handling is very similar to BTF_KIND_ENUM. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062626.3720166-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++-- tools/lib/bpf/btf.h | 5 +++++ 2 files changed, 65 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 8676efb2baba..ae1520f7e1b0 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -3582,7 +3582,7 @@ static bool btf_equal_int_tag(struct btf_type *t1, struct btf_type *t2) return info1 == info2; } -/* Calculate type signature hash of ENUM. */ +/* Calculate type signature hash of ENUM/ENUM64. 
*/ static long btf_hash_enum(struct btf_type *t) { long h; @@ -3616,9 +3616,31 @@ static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) return true; } +static bool btf_equal_enum64(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum64 *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = btf_vlen(t1); + m1 = btf_enum64(t1); + m2 = btf_enum64(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val_lo32 != m2->val_lo32 || + m1->val_hi32 != m2->val_hi32) + return false; + m1++; + m2++; + } + return true; +} + static inline bool btf_is_enum_fwd(struct btf_type *t) { - return btf_is_enum(t) && btf_vlen(t) == 0; + return btf_is_any_enum(t) && btf_vlen(t) == 0; } static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2) @@ -3631,6 +3653,17 @@ static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2) t1->size == t2->size; } +static bool btf_compat_enum64(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_is_enum_fwd(t1) && !btf_is_enum_fwd(t2)) + return btf_equal_enum64(t1, t2); + + /* ignore vlen when comparing */ + return t1->name_off == t2->name_off && + (t1->info & ~0xffff) == (t2->info & ~0xffff) && + t1->size == t2->size; +} + /* * Calculate type signature hash of STRUCT/UNION, ignoring referenced type IDs, * as referenced type IDs equivalence is established separately during type @@ -3843,6 +3876,7 @@ static int btf_dedup_prep(struct btf_dedup *d) h = btf_hash_int_decl_tag(t); break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: h = btf_hash_enum(t); break; case BTF_KIND_STRUCT: @@ -3932,6 +3966,27 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) } break; + case BTF_KIND_ENUM64: + h = btf_hash_enum(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = (__u32)(long)hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_enum64(t, cand)) { + new_id = cand_id; + break; + } + if (btf_compat_enum64(t, cand)) { + if (btf_is_enum_fwd(t)) { + /* resolve fwd to full enum */ + new_id = cand_id; + break; + } + /* resolve canonical enum fwd to full enum */ + d->map[cand_id] = type_id; + } + } + break; + case BTF_KIND_FWD: case BTF_KIND_FLOAT: h = btf_hash_common(t); @@ -4227,6 +4282,9 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, case BTF_KIND_ENUM: return btf_compat_enum(cand_type, canon_type); + case BTF_KIND_ENUM64: + return btf_compat_enum64(cand_type, canon_type); + case BTF_KIND_FWD: case BTF_KIND_FLOAT: return btf_equal_common(cand_type, canon_type); diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index a41463bf9060..c7e8b1fdfe24 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -531,6 +531,11 @@ static inline bool btf_is_type_tag(const struct btf_type *t) return btf_kind(t) == BTF_KIND_TYPE_TAG; } +static inline bool btf_is_any_enum(const struct btf_type *t) +{ + return btf_is_enum(t) || btf_is_enum64(t); +} + static inline __u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(__u32 *)(t + 1)); -- cgit v1.2.3 From d90ec262b35bb6d30ae621dc15889638ba4c02be Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:31 -0700 Subject: libbpf: Add enum64 support for btf_dump Add enum64 btf dumping support. For long long and unsigned long long dump, suffixes 'LL' and 'ULL' are added to avoid compilation errors in some cases. 
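For instance (an illustrative sketch only, mirroring the bpftool example later in this series), an unsigned enum with 64-bit enumerators would be emitted roughly as:

	enum A {
		BPF_F_INDEX_MASK = 4294967295ULL,
		BPF_F_CURRENT_CPU = 4294967295ULL,
		BPF_F_CTXLEN_MASK = 4503595332403200ULL,
	};

with the 'LL' suffix used instead when the enum64 type is marked signed.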
Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062631.3720526-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.h | 5 ++ tools/lib/bpf/btf_dump.c | 137 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 108 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index c7e8b1fdfe24..dcb3f575a281 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -566,6 +566,11 @@ static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) return (struct btf_enum64 *)(t + 1); } +static inline __u64 btf_enum64_value(const struct btf_enum64 *e) +{ + return ((__u64)e->val_hi32 << 32) | e->val_lo32; +} + static inline struct btf_member *btf_members(const struct btf_type *t) { return (struct btf_member *)(t + 1); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 6b1bc1f43728..f5275f819027 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -318,6 +318,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d) switch (btf_kind(t)) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: case BTF_KIND_FLOAT: break; @@ -538,6 +539,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) return 1; } case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: /* * non-anonymous or non-referenced enums are top-level @@ -739,6 +741,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) tstate->emit_state = EMITTED; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: if (top_level_def) { btf_dump_emit_enum_def(d, id, t, 0); btf_dump_printf(d, ";\n\n"); @@ -989,38 +992,81 @@ static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, btf_dump_printf(d, "enum %s", btf_dump_type_name(d, id)); } -static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, - const struct btf_type *t, - int lvl) +static void btf_dump_emit_enum32_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) { const struct btf_enum *v = btf_enum(t); - __u16 vlen = btf_vlen(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; const char *name; size_t dup_cnt; int i; + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + /* enumerators share namespace with typedef idents */ + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %d," : "\n%s%s___%zd = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, dup_cnt, v->val); + } else { + fmt_str = is_signed ? "\n%s%s = %d," : "\n%s%s = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, v->val); + } + } +} + +static void btf_dump_emit_enum64_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum64 *v = btf_enum64(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + __u64 val; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + val = btf_enum64_value(v); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %lldLL," + : "\n%s%s___%zd = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, dup_cnt, + (unsigned long long)val); + } else { + fmt_str = is_signed ? 
"\n%s%s = %lldLL," + : "\n%s%s = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, + (unsigned long long)val); + } + } +} +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, + int lvl) +{ + __u16 vlen = btf_vlen(t); + btf_dump_printf(d, "enum%s%s", t->name_off ? " " : "", btf_dump_type_name(d, id)); - if (vlen) { - btf_dump_printf(d, " {"); - for (i = 0; i < vlen; i++, v++) { - name = btf_name_of(d, v->name_off); - /* enumerators share namespace with typedef idents */ - dup_cnt = btf_dump_name_dups(d, d->ident_names, name); - if (dup_cnt > 1) { - btf_dump_printf(d, "\n%s%s___%zu = %u,", - pfx(lvl + 1), name, dup_cnt, - (__u32)v->val); - } else { - btf_dump_printf(d, "\n%s%s = %u,", - pfx(lvl + 1), name, - (__u32)v->val); - } - } - btf_dump_printf(d, "\n%s}", pfx(lvl)); - } + if (!vlen) + return; + + btf_dump_printf(d, " {"); + if (btf_is_enum(t)) + btf_dump_emit_enum32_val(d, t, lvl, vlen); + else + btf_dump_emit_enum64_val(d, t, lvl, vlen); + btf_dump_printf(d, "\n%s}", pfx(lvl)); } static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, @@ -1178,6 +1224,7 @@ skip_mod: break; case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: case BTF_KIND_STRUCT: case BTF_KIND_UNION: @@ -1312,6 +1359,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, btf_dump_emit_struct_fwd(d, id, t); break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: btf_dump_emit_mods(d, decls); /* inline anonymous enum */ if (t->name_off == 0 && !d->skip_anon_defs) @@ -1988,7 +2036,8 @@ static int btf_dump_get_enum_value(struct btf_dump *d, __u32 id, __s64 *value) { - /* handle unaligned enum value */ + bool is_signed = btf_kflag(t); + if (!ptr_is_aligned(d->btf, id, data)) { __u64 val; int err; @@ -2005,13 +2054,13 @@ static int btf_dump_get_enum_value(struct btf_dump *d, *value = *(__s64 *)data; return 0; case 4: - *value = *(__s32 *)data; + *value = is_signed ? *(__s32 *)data : *(__u32 *)data; return 0; case 2: - *value = *(__s16 *)data; + *value = is_signed ? *(__s16 *)data : *(__u16 *)data; return 0; case 1: - *value = *(__s8 *)data; + *value = is_signed ? *(__s8 *)data : *(__u8 *)data; return 0; default: pr_warn("unexpected size %d for enum, id:[%u]\n", t->size, id); @@ -2024,7 +2073,7 @@ static int btf_dump_enum_data(struct btf_dump *d, __u32 id, const void *data) { - const struct btf_enum *e; + bool is_signed; __s64 value; int i, err; @@ -2032,14 +2081,31 @@ static int btf_dump_enum_data(struct btf_dump *d, if (err) return err; - for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { - if (value != e->val) - continue; - btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); - return 0; - } + is_signed = btf_kflag(t); + if (btf_is_enum(t)) { + const struct btf_enum *e; + + for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { + if (value != e->val) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } - btf_dump_type_values(d, "%d", value); + btf_dump_type_values(d, is_signed ? "%d" : "%u", value); + } else { + const struct btf_enum64 *e; + + for (i = 0, e = btf_enum64(t); i < btf_vlen(t); i++, e++) { + if (value != btf_enum64_value(e)) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? 
"%lldLL" : "%lluULL", + (unsigned long long)value); + } return 0; } @@ -2099,6 +2165,7 @@ static int btf_dump_type_data_check_overflow(struct btf_dump *d, case BTF_KIND_FLOAT: case BTF_KIND_PTR: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: if (data + bits_offset / 8 + size > d->typed_dump->data_end) return -E2BIG; break; @@ -2203,6 +2270,7 @@ static int btf_dump_type_data_check_zero(struct btf_dump *d, return -ENODATA; } case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: err = btf_dump_get_enum_value(d, t, data, id, &value); if (err) return err; @@ -2275,6 +2343,7 @@ static int btf_dump_dump_type_data(struct btf_dump *d, err = btf_dump_struct_data(d, t, id, data); break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: /* handle bitfield and int enum values */ if (bit_sz) { __u64 print_num; -- cgit v1.2.3 From f2a625889bb8d153d277f557c929d43874561f10 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:36 -0700 Subject: libbpf: Add enum64 sanitization When old kernel does not support enum64 but user space btf contains non-zero enum kflag or enum64, libbpf needs to do proper sanitization so modified btf can be accepted by the kernel. Sanitization for enum kflag can be achieved by clearing the kflag bit. For enum64, the type is replaced with an union of integer member types and the integer member size must be smaller than enum64 size. If such an integer type cannot be found, a new type is created and used for union members. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062636.3721375-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.h | 3 ++- tools/lib/bpf/libbpf.c | 56 ++++++++++++++++++++++++++++++++++++++--- tools/lib/bpf/libbpf_internal.h | 2 ++ 3 files changed, 56 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index dcb3f575a281..83312c34007a 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -395,9 +395,10 @@ btf_dump__dump_type_data(struct btf_dump *d, __u32 id, #ifndef BTF_KIND_FLOAT #define BTF_KIND_FLOAT 16 /* Floating point */ #endif -/* The kernel header switched to enums, so these two were never #defined */ +/* The kernel header switched to enums, so the following were never #defined */ #define BTF_KIND_DECL_TAG 17 /* Decl Tag */ #define BTF_KIND_TYPE_TAG 18 /* Type Tag */ +#define BTF_KIND_ENUM64 19 /* Enum for up-to 64bit values */ static inline __u16 btf_kind(const struct btf_type *t) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b03165687936..a0f5aae8626b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2242,6 +2242,7 @@ static const char *__btf_kind_str(__u16 kind) case BTF_KIND_FLOAT: return "float"; case BTF_KIND_DECL_TAG: return "decl_tag"; case BTF_KIND_TYPE_TAG: return "type_tag"; + case BTF_KIND_ENUM64: return "enum64"; default: return "unknown"; } } @@ -2770,12 +2771,13 @@ static bool btf_needs_sanitization(struct bpf_object *obj) bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); return !has_func || !has_datasec || !has_func_global || !has_float || - !has_decl_tag || !has_type_tag; + !has_decl_tag || !has_type_tag || !has_enum64; } -static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) +static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) { bool 
has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); @@ -2783,6 +2785,8 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + __u32 enum64_placeholder_id = 0; struct btf_type *t; int i, j, vlen; @@ -2845,8 +2849,32 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) /* replace TYPE_TAG with a CONST */ t->name_off = 0; t->info = BTF_INFO_ENC(BTF_KIND_CONST, 0, 0); - } + } else if (!has_enum64 && btf_is_enum(t)) { + /* clear the kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t), false); + } else if (!has_enum64 && btf_is_enum64(t)) { + /* replace ENUM64 with a union */ + struct btf_member *m; + + if (enum64_placeholder_id == 0) { + enum64_placeholder_id = btf__add_int(btf, "enum64_placeholder", 1, 0); + if (enum64_placeholder_id < 0) + return enum64_placeholder_id; + + t = (struct btf_type *)btf__type_by_id(btf, i); + } + + m = btf_members(t); + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_UNION, 0, vlen); + for (j = 0; j < vlen; j++, m++) { + m->type = enum64_placeholder_id; + m->offset = 0; + } + } } + + return 0; } static bool libbpf_needs_btf(const struct bpf_object *obj) @@ -3184,7 +3212,9 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) /* enforce 8-byte pointers for BPF-targeted BTFs */ btf__set_pointer_size(obj->btf, 8); - bpf_object__sanitize_btf(obj, kern_btf); + err = bpf_object__sanitize_btf(obj, kern_btf); + if (err) + return err; } if (obj->gen_loader) { @@ -3691,6 +3721,10 @@ static enum kcfg_type find_kcfg_type(const struct btf *btf, int id, if (strcmp(name, "libbpf_tristate")) return KCFG_UNKNOWN; return KCFG_TRISTATE; + case BTF_KIND_ENUM64: + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; case BTF_KIND_ARRAY: if (btf_array(t)->nelems == 0) return KCFG_UNKNOWN; @@ -4874,6 +4908,17 @@ static int probe_kern_bpf_cookie(void) return probe_fd(ret); } +static int probe_kern_btf_enum64(void) +{ + static const char strs[] = "\0enum64"; + __u32 types[] = { + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -4939,6 +4984,9 @@ static struct kern_feature_desc { [FEAT_BPF_COOKIE] = { "BPF cookie support", probe_kern_bpf_cookie, }, + [FEAT_BTF_ENUM64] = { + "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, + }, }; bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index ef5d975078e5..a1ad145ffa74 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -351,6 +351,8 @@ enum kern_feature_id { FEAT_MEMCG_ACCOUNT, /* BPF cookie (bpf_get_attach_cookie() BPF helper) support */ FEAT_BPF_COOKIE, + /* BTF_KIND_ENUM64 support and BTF_KIND_ENUM kflag support */ + FEAT_BTF_ENUM64, __FEAT_CNT, }; -- cgit v1.2.3 From 6ec7d79be202db14bd0ceeb9ed32908bcaae27fa Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:42 -0700 Subject: libbpf: Add enum64 support for bpf linking Add BTF_KIND_ENUM64 support for bpf linking, which is very similar to 
BTF_KIND_ENUM. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062642.3721494-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/linker.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 85c0fddf55d1..4ac02c28e152 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1335,6 +1335,7 @@ recur: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: case BTF_KIND_FUNC: case BTF_KIND_VAR: @@ -1357,6 +1358,7 @@ recur: case BTF_KIND_INT: case BTF_KIND_FLOAT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: /* ignore encoding for int and enum values for enum */ if (t1->size != t2->size) { pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", -- cgit v1.2.3 From 23b2a3a8f63a9dc14ad0c7a63098e0b3575b1173 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:47 -0700 Subject: libbpf: Add enum64 relocation support The enum64 relocation support is added. The local BPF type can be either enum or enum64, and the remote type can be either enum or enum64 too. All combinations of local enum/enum64 and remote enum/enum64 are supported. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062647.3721719-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.h | 7 ++++++ tools/lib/bpf/libbpf.c | 7 +++--- tools/lib/bpf/relo_core.c | 54 ++++++++++++++++++++++++++++--------------- 3 files changed, 48 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 83312c34007a..9fb416eb5644 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -537,6 +537,13 @@ static inline bool btf_is_any_enum(const struct btf_type *t) return btf_is_enum(t) || btf_is_enum64(t); } +static inline bool btf_kind_core_compat(const struct btf_type *t1, + const struct btf_type *t2) +{ + return btf_kind(t1) == btf_kind(t2) || + (btf_is_any_enum(t1) && btf_is_any_enum(t2)); +} + static inline __u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(__u32 *)(t + 1)); } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a0f5aae8626b..ed3cf0911b98 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5524,7 +5524,7 @@ int bpf_core_add_cands(struct bpf_core_cand *local_cand, n = btf__type_cnt(targ_btf); for (i = targ_start_id; i < n; i++) { t = btf__type_by_id(targ_btf, i); - if (btf_kind(t) != btf_kind(local_t)) + if (!btf_kind_core_compat(t, local_t)) continue; targ_name = btf__name_by_offset(targ_btf, t->name_off); @@ -5738,7 +5738,7 @@ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, /* caller made sure that names match (ignoring flavor suffix) */ local_type = btf__type_by_id(local_btf, local_id); targ_type = btf__type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) + if (!btf_kind_core_compat(local_type, targ_type)) return 0; recur: @@ -5751,7 +5751,7 @@ recur: if (!local_type || !targ_type) return -EINVAL; - if (btf_kind(local_type) != btf_kind(targ_type)) + if (!btf_kind_core_compat(local_type, targ_type)) return 0; switch (btf_kind(local_type)) { @@ -5759,6 +5759,7 @@ recur: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: return 1; case BTF_KIND_INT: diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index
073a54ed7432..6ad3c3891a9a 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -186,7 +186,7 @@ int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, struct bpf_core_accessor *acc; const struct btf_type *t; const char *name, *spec_str; - __u32 id; + __u32 id, name_off; __s64 sz; spec_str = btf__name_by_offset(btf, relo->access_str_off); @@ -231,11 +231,13 @@ int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, spec->len++; if (core_relo_is_enumval_based(relo->kind)) { - if (!btf_is_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) + if (!btf_is_any_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) return -EINVAL; /* record enumerator name in a first accessor */ - acc->name = btf__name_by_offset(btf, btf_enum(t)[access_idx].name_off); + name_off = btf_is_enum(t) ? btf_enum(t)[access_idx].name_off + : btf_enum64(t)[access_idx].name_off; + acc->name = btf__name_by_offset(btf, name_off); return 0; } @@ -340,7 +342,7 @@ recur: if (btf_is_composite(local_type) && btf_is_composite(targ_type)) return 1; - if (btf_kind(local_type) != btf_kind(targ_type)) + if (!btf_kind_core_compat(local_type, targ_type)) return 0; switch (btf_kind(local_type)) { @@ -348,6 +350,7 @@ recur: case BTF_KIND_FLOAT: return 1; case BTF_KIND_FWD: + case BTF_KIND_ENUM64: case BTF_KIND_ENUM: { const char *local_name, *targ_name; size_t local_len, targ_len; @@ -477,6 +480,7 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, const struct bpf_core_accessor *local_acc; struct bpf_core_accessor *targ_acc; int i, sz, matched; + __u32 name_off; memset(targ_spec, 0, sizeof(*targ_spec)); targ_spec->btf = targ_btf; @@ -494,18 +498,22 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, if (core_relo_is_enumval_based(local_spec->relo_kind)) { size_t local_essent_len, targ_essent_len; - const struct btf_enum *e; const char *targ_name; /* has to resolve to an enum */ targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); - if (!btf_is_enum(targ_type)) + if (!btf_is_any_enum(targ_type)) return 0; local_essent_len = bpf_core_essential_name_len(local_acc->name); - for (i = 0, e = btf_enum(targ_type); i < btf_vlen(targ_type); i++, e++) { - targ_name = btf__name_by_offset(targ_spec->btf, e->name_off); + for (i = 0; i < btf_vlen(targ_type); i++) { + if (btf_is_enum(targ_type)) + name_off = btf_enum(targ_type)[i].name_off; + else + name_off = btf_enum64(targ_type)[i].name_off; + + targ_name = btf__name_by_offset(targ_spec->btf, name_off); targ_essent_len = bpf_core_essential_name_len(targ_name); if (targ_essent_len != local_essent_len) continue; @@ -680,8 +688,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, *val = byte_sz; break; case BPF_CORE_FIELD_SIGNED: - /* enums will be assumed unsigned */ - *val = btf_is_enum(mt) || + *val = (btf_is_any_enum(mt) && BTF_INFO_KFLAG(mt->info)) || (btf_int_encoding(mt) & BTF_INT_SIGNED); if (validate) *validate = true; /* signedness is never ambiguous */ @@ -754,7 +761,6 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, __u64 *val) { const struct btf_type *t; - const struct btf_enum *e; switch (relo->kind) { case BPF_CORE_ENUMVAL_EXISTS: @@ -764,8 +770,10 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, if (!spec) return -EUCLEAN; /* request instruction poisoning */ t = btf_type_by_id(spec->btf, spec->spec[0].type_id); - e = btf_enum(t) + spec->spec[0].idx; - *val = e->val; + if (btf_is_enum(t)) + *val = 
btf_enum(t)[spec->spec[0].idx].val; + else + *val = btf_enum64_value(btf_enum64(t) + spec->spec[0].idx); break; default: return -EOPNOTSUPP; @@ -1060,7 +1068,6 @@ poison: int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec) { const struct btf_type *t; - const struct btf_enum *e; const char *s; __u32 type_id; int i, len = 0; @@ -1089,10 +1096,23 @@ int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *s if (core_relo_is_enumval_based(spec->relo_kind)) { t = skip_mods_and_typedefs(spec->btf, type_id, NULL); - e = btf_enum(t) + spec->raw_spec[0]; - s = btf__name_by_offset(spec->btf, e->name_off); + if (btf_is_enum(t)) { + const struct btf_enum *e; + const char *fmt_str; + + e = btf_enum(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %d" : "::%s = %u"; + append_buf(fmt_str, s, e->val); + } else { + const struct btf_enum64 *e; + const char *fmt_str; - append_buf("::%s = %u", s, e->val); + e = btf_enum64(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %lld" : "::%s = %llu"; + append_buf(fmt_str, s, (unsigned long long)btf_enum64_value(e)); + } return len; } -- cgit v1.2.3 From 58a53978fdf65d12dae1798e44120efb992a3615 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:52 -0700 Subject: bpftool: Add btf enum64 support Add BTF_KIND_ENUM64 support. For example, the following enum is defined in uapi bpf.h. $ cat core.c enum A { BPF_F_INDEX_MASK = 0xffffffffULL, BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, BPF_F_CTXLEN_MASK = (0xfffffULL << 32), } g; Compiled with clang -target bpf -O2 -g -c core.c Using bpftool to dump types and generate format C file: $ bpftool btf dump file core.o ... [1] ENUM64 'A' encoding=UNSIGNED size=8 vlen=3 'BPF_F_INDEX_MASK' val=4294967295ULL 'BPF_F_CURRENT_CPU' val=4294967295ULL 'BPF_F_CTXLEN_MASK' val=4503595332403200ULL $ bpftool btf dump file core.o format c ... enum A { BPF_F_INDEX_MASK = 4294967295ULL, BPF_F_CURRENT_CPU = 4294967295ULL, BPF_F_CTXLEN_MASK = 4503595332403200ULL, }; ... Note that for raw btf output, the encoding (UNSIGNED or SIGNED) is printed out as well. The 64bit value is also represented properly in BTF and C dump. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062652.3722649-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf.c | 57 +++++++++++++++++++++++++++++++++++++++--- tools/bpf/bpftool/btf_dumper.c | 29 +++++++++++++++++++++ tools/bpf/bpftool/gen.c | 1 + 3 files changed, 84 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 7e6accb9d9f7..0744bd1150be 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -40,6 +40,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; struct btf_attach_point { @@ -212,26 +213,76 @@ static int dump_btf_type(const struct btf *btf, __u32 id, case BTF_KIND_ENUM: { const struct btf_enum *v = (const void *)(t + 1); __u16 vlen = BTF_INFO_VLEN(t->info); + const char *encoding; int i; + encoding = btf_kflag(t) ? 
"SIGNED" : "UNSIGNED"; if (json_output) { + jsonw_string_field(w, "encoding", encoding); jsonw_uint_field(w, "size", t->size); jsonw_uint_field(w, "vlen", vlen); jsonw_name(w, "values"); jsonw_start_array(w); } else { - printf(" size=%u vlen=%u", t->size, vlen); + printf(" encoding=%s size=%u vlen=%u", encoding, t->size, vlen); + } + for (i = 0; i < vlen; i++, v++) { + const char *name = btf_str(btf, v->name_off); + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + if (btf_kflag(t)) + jsonw_int_field(w, "val", v->val); + else + jsonw_uint_field(w, "val", v->val); + jsonw_end_object(w); + } else { + if (btf_kflag(t)) + printf("\n\t'%s' val=%d", name, v->val); + else + printf("\n\t'%s' val=%u", name, v->val); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_ENUM64: { + const struct btf_enum64 *v = btf_enum64(t); + __u16 vlen = btf_vlen(t); + const char *encoding; + int i; + + encoding = btf_kflag(t) ? "SIGNED" : "UNSIGNED"; + if (json_output) { + jsonw_string_field(w, "encoding", encoding); + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "values"); + jsonw_start_array(w); + } else { + printf(" encoding=%s size=%u vlen=%u", encoding, t->size, vlen); } for (i = 0; i < vlen; i++, v++) { const char *name = btf_str(btf, v->name_off); + __u64 val = ((__u64)v->val_hi32 << 32) | v->val_lo32; if (json_output) { jsonw_start_object(w); jsonw_string_field(w, "name", name); - jsonw_uint_field(w, "val", v->val); + if (btf_kflag(t)) + jsonw_int_field(w, "val", val); + else + jsonw_uint_field(w, "val", val); jsonw_end_object(w); } else { - printf("\n\t'%s' val=%u", name, v->val); + if (btf_kflag(t)) + printf("\n\t'%s' val=%lldLL", name, + (unsigned long long)val); + else + printf("\n\t'%s' val=%lluULL", name, + (unsigned long long)val); } } if (json_output) diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index f5dddf8ef404..125798b0bc5d 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -182,6 +182,32 @@ static int btf_dumper_enum(const struct btf_dumper *d, return 0; } +static int btf_dumper_enum64(const struct btf_dumper *d, + const struct btf_type *t, + const void *data) +{ + const struct btf_enum64 *enums = btf_enum64(t); + __u32 val_lo32, val_hi32; + __u64 value; + __u16 i; + + value = *(__u64 *)data; + val_lo32 = (__u32)value; + val_hi32 = value >> 32; + + for (i = 0; i < btf_vlen(t); i++) { + if (val_lo32 == enums[i].val_lo32 && val_hi32 == enums[i].val_hi32) { + jsonw_string(d->jw, + btf__name_by_offset(d->btf, + enums[i].name_off)); + return 0; + } + } + + jsonw_int(d->jw, value); + return 0; +} + static bool is_str_array(const struct btf *btf, const struct btf_array *arr, const char *s) { @@ -542,6 +568,8 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, return btf_dumper_array(d, type_id, data); case BTF_KIND_ENUM: return btf_dumper_enum(d, t, data); + case BTF_KIND_ENUM64: + return btf_dumper_enum64(d, t, data); case BTF_KIND_PTR: btf_dumper_ptr(d, t, data); return 0; @@ -618,6 +646,7 @@ static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id, btf__name_by_offset(btf, t->name_off)); break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: BTF_PRINT_ARG("enum %s ", btf__name_by_offset(btf, t->name_off)); break; diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index f158dc1c2149..480cbd859359 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1750,6 +1750,7 @@ 
btfgen_mark_type(struct btfgen_info *info, unsigned int type_id, bool follow_poi case BTF_KIND_INT: case BTF_KIND_FLOAT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_STRUCT: case BTF_KIND_UNION: break; -- cgit v1.2.3 From d932815a4394b6e8e861f75600666db40c706b8b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:57 -0700 Subject: selftests/bpf: Fix selftests failure The kflag is supported now for BTF_KIND_ENUM. So remove the test which tests verifier failure due to existence of kflag. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062657.3723737-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index ba5bde53d418..8e068e06b3e8 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -2896,26 +2896,6 @@ static struct btf_raw_test raw_tests[] = { .err_str = "Invalid btf_info kind_flag", }, -{ - .descr = "invalid enum kind_flag", - .raw_types = { - BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 1, 1), 4), /* [2] */ - BTF_ENUM_ENC(NAME_TBD, 0), - BTF_END_RAW, - }, - BTF_STR_SEC("\0A"), - .map_type = BPF_MAP_TYPE_ARRAY, - .map_name = "enum_type_check_btf", - .key_size = sizeof(int), - .value_size = sizeof(int), - .key_type_id = 1, - .value_type_id = 1, - .max_entries = 4, - .btf_load_err = true, - .err_str = "Invalid btf_info kind_flag", -}, - { .descr = "valid fwd kind_flag", .raw_types = { -- cgit v1.2.3 From 2b7301457ffe7be085744fa49f1244a71c1c6f1d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:27:03 -0700 Subject: selftests/bpf: Test new enum kflag and enum64 API functions Add tests to use the new enum kflag and enum64 API functions in selftest btf_write. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062703.3724287-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/btf_helpers.c | 25 +++- tools/testing/selftests/bpf/prog_tests/btf_write.c | 126 +++++++++++++++------ 2 files changed, 114 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c index b5941d514e17..1c1c2c26690a 100644 --- a/tools/testing/selftests/bpf/btf_helpers.c +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -26,11 +26,12 @@ static const char * const btf_kind_str_mapping[] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; static const char *btf_kind_str(__u16 kind) { - if (kind > BTF_KIND_TYPE_TAG) + if (kind > BTF_KIND_ENUM64) return "UNKNOWN"; return btf_kind_str_mapping[kind]; } @@ -139,14 +140,32 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id) } case BTF_KIND_ENUM: { const struct btf_enum *v = btf_enum(t); + const char *fmt_str; - fprintf(out, " size=%u vlen=%u", t->size, vlen); + fmt_str = btf_kflag(t) ? "\n\t'%s' val=%d" : "\n\t'%s' val=%u"; + fprintf(out, " encoding=%s size=%u vlen=%u", + btf_kflag(t) ? 
"SIGNED" : "UNSIGNED", t->size, vlen); for (i = 0; i < vlen; i++, v++) { - fprintf(out, "\n\t'%s' val=%u", + fprintf(out, fmt_str, btf_str(btf, v->name_off), v->val); } break; } + case BTF_KIND_ENUM64: { + const struct btf_enum64 *v = btf_enum64(t); + const char *fmt_str; + + fmt_str = btf_kflag(t) ? "\n\t'%s' val=%lld" : "\n\t'%s' val=%llu"; + + fprintf(out, " encoding=%s size=%u vlen=%u", + btf_kflag(t) ? "SIGNED" : "UNSIGNED", t->size, vlen); + for (i = 0; i < vlen; i++, v++) { + fprintf(out, fmt_str, + btf_str(btf, v->name_off), + ((__u64)v->val_hi32 << 32) | v->val_lo32); + } + break; + } case BTF_KIND_FWD: fprintf(out, " fwd_kind=%s", btf_kflag(t) ? "union" : "struct"); break; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_write.c b/tools/testing/selftests/bpf/prog_tests/btf_write.c index addf99c05896..6e36de1302fc 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_write.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_write.c @@ -9,6 +9,7 @@ static void gen_btf(struct btf *btf) const struct btf_var_secinfo *vi; const struct btf_type *t; const struct btf_member *m; + const struct btf_enum64 *v64; const struct btf_enum *v; const struct btf_param *p; int id, err, str_off; @@ -171,7 +172,7 @@ static void gen_btf(struct btf *btf) ASSERT_STREQ(btf__str_by_offset(btf, v->name_off), "v2", "v2_name"); ASSERT_EQ(v->val, 2, "v2_val"); ASSERT_STREQ(btf_type_raw_dump(btf, 9), - "[9] ENUM 'e1' size=4 vlen=2\n" + "[9] ENUM 'e1' encoding=UNSIGNED size=4 vlen=2\n" "\t'v1' val=1\n" "\t'v2' val=2", "raw_dump"); @@ -202,7 +203,7 @@ static void gen_btf(struct btf *btf) ASSERT_EQ(btf_vlen(t), 0, "enum_fwd_kind"); ASSERT_EQ(t->size, 4, "enum_fwd_sz"); ASSERT_STREQ(btf_type_raw_dump(btf, 12), - "[12] ENUM 'enum_fwd' size=4 vlen=0", "raw_dump"); + "[12] ENUM 'enum_fwd' encoding=UNSIGNED size=4 vlen=0", "raw_dump"); /* TYPEDEF */ id = btf__add_typedef(btf, "typedef1", 1); @@ -307,6 +308,48 @@ static void gen_btf(struct btf *btf) ASSERT_EQ(t->type, 1, "tag_type"); ASSERT_STREQ(btf_type_raw_dump(btf, 20), "[20] TYPE_TAG 'tag1' type_id=1", "raw_dump"); + + /* ENUM64 */ + id = btf__add_enum64(btf, "e1", 8, true); + ASSERT_EQ(id, 21, "enum64_id"); + err = btf__add_enum64_value(btf, "v1", -1); + ASSERT_OK(err, "v1_res"); + err = btf__add_enum64_value(btf, "v2", 0x123456789); /* 4886718345 */ + ASSERT_OK(err, "v2_res"); + t = btf__type_by_id(btf, 21); + ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "e1", "enum64_name"); + ASSERT_EQ(btf_kind(t), BTF_KIND_ENUM64, "enum64_kind"); + ASSERT_EQ(btf_vlen(t), 2, "enum64_vlen"); + ASSERT_EQ(t->size, 8, "enum64_sz"); + v64 = btf_enum64(t) + 0; + ASSERT_STREQ(btf__str_by_offset(btf, v64->name_off), "v1", "v1_name"); + ASSERT_EQ(v64->val_hi32, 0xffffffff, "v1_val"); + ASSERT_EQ(v64->val_lo32, 0xffffffff, "v1_val"); + v64 = btf_enum64(t) + 1; + ASSERT_STREQ(btf__str_by_offset(btf, v64->name_off), "v2", "v2_name"); + ASSERT_EQ(v64->val_hi32, 0x1, "v2_val"); + ASSERT_EQ(v64->val_lo32, 0x23456789, "v2_val"); + ASSERT_STREQ(btf_type_raw_dump(btf, 21), + "[21] ENUM64 'e1' encoding=SIGNED size=8 vlen=2\n" + "\t'v1' val=-1\n" + "\t'v2' val=4886718345", "raw_dump"); + + id = btf__add_enum64(btf, "e1", 8, false); + ASSERT_EQ(id, 22, "enum64_id"); + err = btf__add_enum64_value(btf, "v1", 0xffffffffFFFFFFFF); /* 18446744073709551615 */ + ASSERT_OK(err, "v1_res"); + t = btf__type_by_id(btf, 22); + ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "e1", "enum64_name"); + ASSERT_EQ(btf_kind(t), BTF_KIND_ENUM64, "enum64_kind"); + ASSERT_EQ(btf_vlen(t), 1, "enum64_vlen"); 
+ ASSERT_EQ(t->size, 8, "enum64_sz"); + v64 = btf_enum64(t) + 0; + ASSERT_STREQ(btf__str_by_offset(btf, v64->name_off), "v1", "v1_name"); + ASSERT_EQ(v64->val_hi32, 0xffffffff, "v1_val"); + ASSERT_EQ(v64->val_lo32, 0xffffffff, "v1_val"); + ASSERT_STREQ(btf_type_raw_dump(btf, 22), + "[22] ENUM64 'e1' encoding=UNSIGNED size=8 vlen=1\n" + "\t'v1' val=18446744073709551615", "raw_dump"); } static void test_btf_add() @@ -332,12 +375,12 @@ static void test_btf_add() "\t'f2' type_id=1 bits_offset=32 bitfield_size=16", "[8] UNION 'u1' size=8 vlen=1\n" "\t'f1' type_id=1 bits_offset=0 bitfield_size=16", - "[9] ENUM 'e1' size=4 vlen=2\n" + "[9] ENUM 'e1' encoding=UNSIGNED size=4 vlen=2\n" "\t'v1' val=1\n" "\t'v2' val=2", "[10] FWD 'struct_fwd' fwd_kind=struct", "[11] FWD 'union_fwd' fwd_kind=union", - "[12] ENUM 'enum_fwd' size=4 vlen=0", + "[12] ENUM 'enum_fwd' encoding=UNSIGNED size=4 vlen=0", "[13] TYPEDEF 'typedef1' type_id=1", "[14] FUNC 'func1' type_id=15 linkage=global", "[15] FUNC_PROTO '(anon)' ret_type_id=1 vlen=2\n" @@ -348,7 +391,12 @@ static void test_btf_add() "\ttype_id=1 offset=4 size=8", "[18] DECL_TAG 'tag1' type_id=16 component_idx=-1", "[19] DECL_TAG 'tag2' type_id=14 component_idx=1", - "[20] TYPE_TAG 'tag1' type_id=1"); + "[20] TYPE_TAG 'tag1' type_id=1", + "[21] ENUM64 'e1' encoding=SIGNED size=8 vlen=2\n" + "\t'v1' val=-1\n" + "\t'v2' val=4886718345", + "[22] ENUM64 'e1' encoding=UNSIGNED size=8 vlen=1\n" + "\t'v1' val=18446744073709551615"); btf__free(btf); } @@ -370,7 +418,7 @@ static void test_btf_add_btf() gen_btf(btf2); id = btf__add_btf(btf1, btf2); - if (!ASSERT_EQ(id, 21, "id")) + if (!ASSERT_EQ(id, 23, "id")) goto cleanup; VALIDATE_RAW_BTF( @@ -386,12 +434,12 @@ static void test_btf_add_btf() "\t'f2' type_id=1 bits_offset=32 bitfield_size=16", "[8] UNION 'u1' size=8 vlen=1\n" "\t'f1' type_id=1 bits_offset=0 bitfield_size=16", - "[9] ENUM 'e1' size=4 vlen=2\n" + "[9] ENUM 'e1' encoding=UNSIGNED size=4 vlen=2\n" "\t'v1' val=1\n" "\t'v2' val=2", "[10] FWD 'struct_fwd' fwd_kind=struct", "[11] FWD 'union_fwd' fwd_kind=union", - "[12] ENUM 'enum_fwd' size=4 vlen=0", + "[12] ENUM 'enum_fwd' encoding=UNSIGNED size=4 vlen=0", "[13] TYPEDEF 'typedef1' type_id=1", "[14] FUNC 'func1' type_id=15 linkage=global", "[15] FUNC_PROTO '(anon)' ret_type_id=1 vlen=2\n" @@ -403,36 +451,46 @@ static void test_btf_add_btf() "[18] DECL_TAG 'tag1' type_id=16 component_idx=-1", "[19] DECL_TAG 'tag2' type_id=14 component_idx=1", "[20] TYPE_TAG 'tag1' type_id=1", + "[21] ENUM64 'e1' encoding=SIGNED size=8 vlen=2\n" + "\t'v1' val=-1\n" + "\t'v2' val=4886718345", + "[22] ENUM64 'e1' encoding=UNSIGNED size=8 vlen=1\n" + "\t'v1' val=18446744073709551615", /* types appended from the second BTF */ - "[21] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", - "[22] PTR '(anon)' type_id=21", - "[23] CONST '(anon)' type_id=25", - "[24] VOLATILE '(anon)' type_id=23", - "[25] RESTRICT '(anon)' type_id=24", - "[26] ARRAY '(anon)' type_id=22 index_type_id=21 nr_elems=10", - "[27] STRUCT 's1' size=8 vlen=2\n" - "\t'f1' type_id=21 bits_offset=0\n" - "\t'f2' type_id=21 bits_offset=32 bitfield_size=16", - "[28] UNION 'u1' size=8 vlen=1\n" - "\t'f1' type_id=21 bits_offset=0 bitfield_size=16", - "[29] ENUM 'e1' size=4 vlen=2\n" + "[23] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[24] PTR '(anon)' type_id=23", + "[25] CONST '(anon)' type_id=27", + "[26] VOLATILE '(anon)' type_id=25", + "[27] RESTRICT '(anon)' type_id=26", + "[28] ARRAY '(anon)' type_id=24 index_type_id=23 nr_elems=10", + 
"[29] STRUCT 's1' size=8 vlen=2\n" + "\t'f1' type_id=23 bits_offset=0\n" + "\t'f2' type_id=23 bits_offset=32 bitfield_size=16", + "[30] UNION 'u1' size=8 vlen=1\n" + "\t'f1' type_id=23 bits_offset=0 bitfield_size=16", + "[31] ENUM 'e1' encoding=UNSIGNED size=4 vlen=2\n" "\t'v1' val=1\n" "\t'v2' val=2", - "[30] FWD 'struct_fwd' fwd_kind=struct", - "[31] FWD 'union_fwd' fwd_kind=union", - "[32] ENUM 'enum_fwd' size=4 vlen=0", - "[33] TYPEDEF 'typedef1' type_id=21", - "[34] FUNC 'func1' type_id=35 linkage=global", - "[35] FUNC_PROTO '(anon)' ret_type_id=21 vlen=2\n" - "\t'p1' type_id=21\n" - "\t'p2' type_id=22", - "[36] VAR 'var1' type_id=21, linkage=global-alloc", - "[37] DATASEC 'datasec1' size=12 vlen=1\n" - "\ttype_id=21 offset=4 size=8", - "[38] DECL_TAG 'tag1' type_id=36 component_idx=-1", - "[39] DECL_TAG 'tag2' type_id=34 component_idx=1", - "[40] TYPE_TAG 'tag1' type_id=21"); + "[32] FWD 'struct_fwd' fwd_kind=struct", + "[33] FWD 'union_fwd' fwd_kind=union", + "[34] ENUM 'enum_fwd' encoding=UNSIGNED size=4 vlen=0", + "[35] TYPEDEF 'typedef1' type_id=23", + "[36] FUNC 'func1' type_id=37 linkage=global", + "[37] FUNC_PROTO '(anon)' ret_type_id=23 vlen=2\n" + "\t'p1' type_id=23\n" + "\t'p2' type_id=24", + "[38] VAR 'var1' type_id=23, linkage=global-alloc", + "[39] DATASEC 'datasec1' size=12 vlen=1\n" + "\ttype_id=23 offset=4 size=8", + "[40] DECL_TAG 'tag1' type_id=38 component_idx=-1", + "[41] DECL_TAG 'tag2' type_id=36 component_idx=1", + "[42] TYPE_TAG 'tag1' type_id=23", + "[43] ENUM64 'e1' encoding=SIGNED size=8 vlen=2\n" + "\t'v1' val=-1\n" + "\t'v2' val=4886718345", + "[44] ENUM64 'e1' encoding=UNSIGNED size=8 vlen=1\n" + "\t'v1' val=18446744073709551615"); cleanup: btf__free(btf1); -- cgit v1.2.3 From 3b5325186dfad5ad2b2d8f7e8a79662de1b2749d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:27:08 -0700 Subject: selftests/bpf: Add BTF_KIND_ENUM64 unit tests Add unit tests for basic BTF_KIND_ENUM64 encoding. 
Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062708.3724845-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 36 ++++++++++++++++++++++++++++ tools/testing/selftests/bpf/test_btf.h | 1 + 2 files changed, 37 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 8e068e06b3e8..a986ee56c5f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4052,6 +4052,42 @@ static struct btf_raw_test raw_tests[] = { .btf_load_err = true, .err_str = "Type tags don't precede modifiers", }, +{ + .descr = "enum64 test #1, unsigned, size 8", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 2), 8), /* [2] */ + BTF_ENUM64_ENC(NAME_TBD, 0, 0), + BTF_ENUM64_ENC(NAME_TBD, 1, 1), + BTF_END_RAW, + }, + BTF_STR_SEC("\0a\0b\0c"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "tag_type_check_btf", + .key_size = sizeof(int), + .value_size = 8, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, +}, +{ + .descr = "enum64 test #2, signed, size 4", + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM64, 1, 2), 4), /* [2] */ + BTF_ENUM64_ENC(NAME_TBD, -1, 0), + BTF_ENUM64_ENC(NAME_TBD, 1, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0a\0b\0c"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "tag_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, +}, }; /* struct btf_raw_test raw_tests[] */ diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h index 128989bed8b7..38782bd47fdc 100644 --- a/tools/testing/selftests/bpf/test_btf.h +++ b/tools/testing/selftests/bpf/test_btf.h @@ -39,6 +39,7 @@ #define BTF_MEMBER_ENC(name, type, bits_offset) \ (name), (type), (bits_offset) #define BTF_ENUM_ENC(name, val) (name), (val) +#define BTF_ENUM64_ENC(name, val_lo32, val_hi32) (name), (val_lo32), (val_hi32) #define BTF_MEMBER_OFFSET(bitfield_size, bits_offset) \ ((bitfield_size) << 24 | (bits_offset)) -- cgit v1.2.3 From adc26d134ef3454c3d8ffb75ee6ca20c169b23d0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:27:13 -0700 Subject: selftests/bpf: Test BTF_KIND_ENUM64 for deduplication Add a few unit tests for BTF_KIND_ENUM64 deduplication. 
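As a reading aid for the raw types in these cases: BTF_TYPE_ENC() takes (name, info, size), the info word is built with BTF_INFO_ENC(kind, kflag, vlen), and each BTF_ENUM64_ENC() member carries (name, val_lo32, val_hi32). A hypothetical standalone one-member unsigned enum64 of size 8 holding 0x100000001 would therefore be encoded as:

  /* Illustrative only: ENUM64 'e1', kflag=0 (unsigned), vlen=1,
   * with one member 'e1_val' = 0x100000001 (val_lo32=1, val_hi32=1).
   */
  BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8),
  BTF_ENUM64_ENC(NAME_NTH(2), 1, 1),

Roughly, two enum64 types are deduplicated only when their names, sizes and all (name, value) member pairs match; a forward declaration (vlen=0) resolves against a matching full enum64, which is exactly what the cases below exercise.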
Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062713.3725409-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 97 +++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index a986ee56c5f7..edb387163baa 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -7016,9 +7016,12 @@ static struct btf_dedup_test dedup_tests[] = { BTF_DECL_TAG_ENC(NAME_TBD, 13, 1), /* [16] decl_tag */ BTF_DECL_TAG_ENC(NAME_TBD, 7, -1), /* [17] decl_tag */ BTF_TYPE_TAG_ENC(NAME_TBD, 8), /* [18] type_tag */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 2), 8), /* [19] enum64 */ + BTF_ENUM64_ENC(NAME_TBD, 0, 0), + BTF_ENUM64_ENC(NAME_TBD, 1, 1), BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q\0R"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q\0R\0S\0T\0U"), }, .expect = { .raw_types = { @@ -7046,9 +7049,12 @@ static struct btf_dedup_test dedup_tests[] = { BTF_DECL_TAG_ENC(NAME_TBD, 13, 1), /* [16] decl_tag */ BTF_DECL_TAG_ENC(NAME_TBD, 7, -1), /* [17] decl_tag */ BTF_TYPE_TAG_ENC(NAME_TBD, 8), /* [18] type_tag */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 2), 8), /* [19] enum64 */ + BTF_ENUM64_ENC(NAME_TBD, 0, 0), + BTF_ENUM64_ENC(NAME_TBD, 1, 1), BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q\0R"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q\0R\0S\0T\0U"), }, }, { @@ -7509,6 +7515,91 @@ static struct btf_dedup_test dedup_tests[] = { BTF_STR_SEC("\0tag1\0t\0m"), }, }, +{ + .descr = "dedup: enum64, standalone", + .input = { + .raw_types = { + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(2), 1, 123), + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(2), 1, 123), + BTF_END_RAW, + }, + BTF_STR_SEC("\0e1\0e1_val"), + }, + .expect = { + .raw_types = { + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(2), 1, 123), + BTF_END_RAW, + }, + BTF_STR_SEC("\0e1\0e1_val"), + }, +}, +{ + .descr = "dedup: enum64, fwd resolution", + .input = { + .raw_types = { + /* [1] fwd enum64 'e1' before full enum */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + /* [2] full enum64 'e1' after fwd */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(2), 1, 123), + /* [3] full enum64 'e2' before fwd */ + BTF_TYPE_ENC(NAME_NTH(3), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(4), 0, 456), + /* [4] fwd enum64 'e2' after full enum */ + BTF_TYPE_ENC(NAME_NTH(3), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + /* [5] incompatible full enum64 with different value */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(2), 0, 321), + BTF_END_RAW, + }, + BTF_STR_SEC("\0e1\0e1_val\0e2\0e2_val"), + }, + .expect = { + .raw_types = { + /* [1] full enum64 'e1' */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(2), 1, 123), + /* [2] full enum64 'e2' */ + BTF_TYPE_ENC(NAME_NTH(3), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(4), 0, 456), + /* [3] incompatible full enum64 with different value */ + 
BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 8), + BTF_ENUM64_ENC(NAME_NTH(2), 0, 321), + BTF_END_RAW, + }, + BTF_STR_SEC("\0e1\0e1_val\0e2\0e2_val"), + }, +}, +{ + .descr = "dedup: enum and enum64, no dedup", + .input = { + .raw_types = { + /* [1] enum 'e1' */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4), + BTF_ENUM_ENC(NAME_NTH(2), 1), + /* [2] enum64 'e1' */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 4), + BTF_ENUM64_ENC(NAME_NTH(2), 1, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0e1\0e1_val"), + }, + .expect = { + .raw_types = { + /* [1] enum 'e1' */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4), + BTF_ENUM_ENC(NAME_NTH(2), 1), + /* [2] enum64 'e1' */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 1), 4), + BTF_ENUM64_ENC(NAME_NTH(2), 1, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0e1\0e1_val"), + }, +}, }; @@ -7533,6 +7624,8 @@ static int btf_type_size(const struct btf_type *t) return base_size + sizeof(__u32); case BTF_KIND_ENUM: return base_size + vlen * sizeof(struct btf_enum); + case BTF_KIND_ENUM64: + return base_size + vlen * sizeof(struct btf_enum64); case BTF_KIND_ARRAY: return base_size + sizeof(struct btf_array); case BTF_KIND_STRUCT: -- cgit v1.2.3 From f4db3dd5284d9e0be2abc2f6e1dbdfe93da5681c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:27:18 -0700 Subject: selftests/bpf: Add a test for enum64 value relocations Add a test for enum64 value relocations. The test will be skipped if clang version is 14 or lower since enum64 is only supported from version 15. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062718.3726307-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/core_reloc.c | 58 ++++++++++++++++ .../bpf/progs/btf__core_reloc_enum64val.c | 3 + .../bpf/progs/btf__core_reloc_enum64val___diff.c | 3 + .../btf__core_reloc_enum64val___err_missing.c | 3 + .../btf__core_reloc_enum64val___val3_missing.c | 3 + .../testing/selftests/bpf/progs/core_reloc_types.h | 78 ++++++++++++++++++++++ .../bpf/progs/test_core_reloc_enum64val.c | 70 +++++++++++++++++++ 7 files changed, 218 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val.c create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___diff.c create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___err_missing.c create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___val3_missing.c create mode 100644 tools/testing/selftests/bpf/progs/test_core_reloc_enum64val.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 3712dfe1be59..47c1ef117275 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -363,6 +363,25 @@ static int duration = 0; .fails = true, \ } +#define ENUM64VAL_CASE_COMMON(name) \ + .case_name = #name, \ + .bpf_obj_file = "test_core_reloc_enum64val.o", \ + .btf_src_file = "btf__core_reloc_" #name ".o", \ + .raw_tp_name = "sys_enter", \ + .prog_name = "test_core_enum64val" + +#define ENUM64VAL_CASE(name, ...) 
{ \ + ENUM64VAL_CASE_COMMON(name), \ + .output = STRUCT_TO_CHAR_PTR(core_reloc_enum64val_output) \ + __VA_ARGS__, \ + .output_len = sizeof(struct core_reloc_enum64val_output), \ +} + +#define ENUM64VAL_ERR_CASE(name) { \ + ENUM64VAL_CASE_COMMON(name), \ + .fails = true, \ +} + struct core_reloc_test_case; typedef int (*setup_test_fn)(struct core_reloc_test_case *test); @@ -831,6 +850,45 @@ static const struct core_reloc_test_case test_cases[] = { .anon_val2 = 0x222, }), ENUMVAL_ERR_CASE(enumval___err_missing), + + /* 64bit enumerator value existence and value relocations */ + ENUM64VAL_CASE(enum64val, { + .unsigned_val1_exists = true, + .unsigned_val2_exists = true, + .unsigned_val3_exists = true, + .signed_val1_exists = true, + .signed_val2_exists = true, + .signed_val3_exists = true, + .unsigned_val1 = 0x1ffffffffULL, + .unsigned_val2 = 0x2, + .signed_val1 = 0x1ffffffffLL, + .signed_val2 = -2, + }), + ENUM64VAL_CASE(enum64val___diff, { + .unsigned_val1_exists = true, + .unsigned_val2_exists = true, + .unsigned_val3_exists = true, + .signed_val1_exists = true, + .signed_val2_exists = true, + .signed_val3_exists = true, + .unsigned_val1 = 0x101ffffffffULL, + .unsigned_val2 = 0x202ffffffffULL, + .signed_val1 = -101, + .signed_val2 = -202, + }), + ENUM64VAL_CASE(enum64val___val3_missing, { + .unsigned_val1_exists = true, + .unsigned_val2_exists = true, + .unsigned_val3_exists = false, + .signed_val1_exists = true, + .signed_val2_exists = true, + .signed_val3_exists = false, + .unsigned_val1 = 0x111ffffffffULL, + .unsigned_val2 = 0x222, + .signed_val1 = 0x111ffffffffLL, + .signed_val2 = -222, + }), + ENUM64VAL_ERR_CASE(enum64val___err_missing), }; struct data { diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val.c new file mode 100644 index 000000000000..888e79db6a77 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_enum64val x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___diff.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___diff.c new file mode 100644 index 000000000000..194749130d87 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___diff.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_enum64val___diff x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___err_missing.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___err_missing.c new file mode 100644 index 000000000000..3d732d4193e4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___err_missing.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_enum64val___err_missing x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___val3_missing.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___val3_missing.c new file mode 100644 index 000000000000..17cf5d6a848d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enum64val___val3_missing.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_enum64val___val3_missing x) {} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index f9dc9766546e..26e103302c05 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ 
b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -1117,6 +1117,20 @@ struct core_reloc_enumval_output { int anon_val2; }; +struct core_reloc_enum64val_output { + bool unsigned_val1_exists; + bool unsigned_val2_exists; + bool unsigned_val3_exists; + bool signed_val1_exists; + bool signed_val2_exists; + bool signed_val3_exists; + + long unsigned_val1; + long unsigned_val2; + long signed_val1; + long signed_val2; +}; + enum named_enum { NAMED_ENUM_VAL1 = 1, NAMED_ENUM_VAL2 = 2, @@ -1134,6 +1148,23 @@ struct core_reloc_enumval { anon_enum f2; }; +enum named_unsigned_enum64 { + UNSIGNED_ENUM64_VAL1 = 0x1ffffffffULL, + UNSIGNED_ENUM64_VAL2 = 0x2, + UNSIGNED_ENUM64_VAL3 = 0x3ffffffffULL, +}; + +enum named_signed_enum64 { + SIGNED_ENUM64_VAL1 = 0x1ffffffffLL, + SIGNED_ENUM64_VAL2 = -2, + SIGNED_ENUM64_VAL3 = 0x3ffffffffLL, +}; + +struct core_reloc_enum64val { + enum named_unsigned_enum64 f1; + enum named_signed_enum64 f2; +}; + /* differing enumerator values */ enum named_enum___diff { NAMED_ENUM_VAL1___diff = 101, @@ -1152,6 +1183,23 @@ struct core_reloc_enumval___diff { anon_enum___diff f2; }; +enum named_unsigned_enum64___diff { + UNSIGNED_ENUM64_VAL1___diff = 0x101ffffffffULL, + UNSIGNED_ENUM64_VAL2___diff = 0x202ffffffffULL, + UNSIGNED_ENUM64_VAL3___diff = 0x303ffffffffULL, +}; + +enum named_signed_enum64___diff { + SIGNED_ENUM64_VAL1___diff = -101, + SIGNED_ENUM64_VAL2___diff = -202, + SIGNED_ENUM64_VAL3___diff = -303, +}; + +struct core_reloc_enum64val___diff { + enum named_unsigned_enum64___diff f1; + enum named_signed_enum64___diff f2; +}; + /* missing (optional) third enum value */ enum named_enum___val3_missing { NAMED_ENUM_VAL1___val3_missing = 111, @@ -1168,6 +1216,21 @@ struct core_reloc_enumval___val3_missing { anon_enum___val3_missing f2; }; +enum named_unsigned_enum64___val3_missing { + UNSIGNED_ENUM64_VAL1___val3_missing = 0x111ffffffffULL, + UNSIGNED_ENUM64_VAL2___val3_missing = 0x222, +}; + +enum named_signed_enum64___val3_missing { + SIGNED_ENUM64_VAL1___val3_missing = 0x111ffffffffLL, + SIGNED_ENUM64_VAL2___val3_missing = -222, +}; + +struct core_reloc_enum64val___val3_missing { + enum named_unsigned_enum64___val3_missing f1; + enum named_signed_enum64___val3_missing f2; +}; + /* missing (mandatory) second enum value, should fail */ enum named_enum___err_missing { NAMED_ENUM_VAL1___err_missing = 1, @@ -1183,3 +1246,18 @@ struct core_reloc_enumval___err_missing { enum named_enum___err_missing f1; anon_enum___err_missing f2; }; + +enum named_unsigned_enum64___err_missing { + UNSIGNED_ENUM64_VAL1___err_missing = 0x1ffffffffULL, + UNSIGNED_ENUM64_VAL3___err_missing = 0x3ffffffffULL, +}; + +enum named_signed_enum64___err_missing { + SIGNED_ENUM64_VAL1___err_missing = 0x1ffffffffLL, + SIGNED_ENUM64_VAL3___err_missing = -3, +}; + +struct core_reloc_enum64val___err_missing { + enum named_unsigned_enum64___err_missing f1; + enum named_signed_enum64___err_missing f2; +}; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_enum64val.c b/tools/testing/selftests/bpf/progs/test_core_reloc_enum64val.c new file mode 100644 index 000000000000..63147fbfae6e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_enum64val.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + char in[256]; + char out[256]; + bool skip; +} data = {}; + +enum named_unsigned_enum64 { + UNSIGNED_ENUM64_VAL1 = 0x1ffffffffULL, + UNSIGNED_ENUM64_VAL2 = 0x2ffffffffULL, + UNSIGNED_ENUM64_VAL3 = 0x3ffffffffULL, +}; + +enum named_signed_enum64 { + SIGNED_ENUM64_VAL1 = 0x1ffffffffLL, + SIGNED_ENUM64_VAL2 = -2, + SIGNED_ENUM64_VAL3 = 0x3ffffffffLL, +}; + +struct core_reloc_enum64val_output { + bool unsigned_val1_exists; + bool unsigned_val2_exists; + bool unsigned_val3_exists; + bool signed_val1_exists; + bool signed_val2_exists; + bool signed_val3_exists; + + long unsigned_val1; + long unsigned_val2; + long signed_val1; + long signed_val2; +}; + +SEC("raw_tracepoint/sys_enter") +int test_core_enum64val(void *ctx) +{ +#if __clang_major__ >= 15 + struct core_reloc_enum64val_output *out = (void *)&data.out; + enum named_unsigned_enum64 named_unsigned = 0; + enum named_signed_enum64 named_signed = 0; + + out->unsigned_val1_exists = bpf_core_enum_value_exists(named_unsigned, UNSIGNED_ENUM64_VAL1); + out->unsigned_val2_exists = bpf_core_enum_value_exists(enum named_unsigned_enum64, UNSIGNED_ENUM64_VAL2); + out->unsigned_val3_exists = bpf_core_enum_value_exists(enum named_unsigned_enum64, UNSIGNED_ENUM64_VAL3); + out->signed_val1_exists = bpf_core_enum_value_exists(named_signed, SIGNED_ENUM64_VAL1); + out->signed_val2_exists = bpf_core_enum_value_exists(enum named_signed_enum64, SIGNED_ENUM64_VAL2); + out->signed_val3_exists = bpf_core_enum_value_exists(enum named_signed_enum64, SIGNED_ENUM64_VAL3); + + out->unsigned_val1 = bpf_core_enum_value(named_unsigned, UNSIGNED_ENUM64_VAL1); + out->unsigned_val2 = bpf_core_enum_value(named_unsigned, UNSIGNED_ENUM64_VAL2); + out->signed_val1 = bpf_core_enum_value(named_signed, SIGNED_ENUM64_VAL1); + out->signed_val2 = bpf_core_enum_value(named_signed, SIGNED_ENUM64_VAL2); + /* NAMED_ENUM64_VAL3 value is optional */ + +#else + data.skip = true; +#endif + + return 0; +} -- cgit v1.2.3 From d8969871253a4704f007b307b2dd6232d1e40da8 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 2 May 2022 00:07:35 +0200 Subject: KVM: selftests: nSVM: Add svm_nested_soft_inject_test Add a KVM self-test that checks whether a nSVM L1 is able to successfully inject a software interrupt, a soft exception and a NMI into its L2 guest. In practice, this tests both the next_rip field consistency and L1-injected event with intervening L0 VMEXIT during its delivery: the first nested VMRUN (that's also trying to inject a software interrupt) will immediately trigger a L0 NPF. This L0 NPF will have zero in its CPU-returned next_rip field, which if incorrectly reused by KVM will trigger a #PF when trying to return to such address 0 from the interrupt handler. For NMI injection this tests whether the L1 NMI state isn't getting incorrectly mixed with the L2 NMI state if a L1 -> L2 NMI needs to be re-injected. Reviewed-by: Maxim Levitsky [sean: check exact L2 RIP on first soft interrupt] Signed-off-by: Sean Christopherson Signed-off-by: Maciej S. 
Szmigiero Message-Id: Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 3 +- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/x86_64/processor.h | 17 ++ .../selftests/kvm/include/x86_64/svm_util.h | 12 ++ tools/testing/selftests/kvm/x86_64/evmcs_test.c | 1 - .../testing/selftests/kvm/x86_64/hyperv_svm_test.c | 5 - .../kvm/x86_64/svm_nested_soft_inject_test.c | 217 +++++++++++++++++++++ 7 files changed, 249 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 4509a3a7eeae..82e764d71ca7 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -36,9 +36,10 @@ /x86_64/state_test /x86_64/svm_vmcall_test /x86_64/svm_int_ctl_test -/x86_64/tsc_scaling_sync +/x86_64/svm_nested_soft_inject_test /x86_64/sync_regs_test /x86_64/tsc_msrs_test +/x86_64/tsc_scaling_sync /x86_64/userspace_io_test /x86_64/userspace_msr_exit_test /x86_64/vmx_apic_access_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 81470a99ed1c..f2647b88a8a0 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -66,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_soft_inject_test TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index d0d51adec76e..4fd870f37b9e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -17,6 +17,8 @@ #include "../kvm_util.h" +#define NMI_VECTOR 0x02 + #define X86_EFLAGS_FIXED (1u << 1) #define X86_CR4_VME (1ul << 0) @@ -385,6 +387,21 @@ static inline void cpu_relax(void) asm volatile("rep; nop" ::: "memory"); } +#define vmmcall() \ + __asm__ __volatile__( \ + "vmmcall\n" \ + ) + +#define ud2() \ + __asm__ __volatile__( \ + "ud2\n" \ + ) + +#define hlt() \ + __asm__ __volatile__( \ + "hlt\n" \ + ) + bool is_intel_cpu(void); bool is_amd_cpu(void); diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index a25aabd8f5e7..136ba6a5d027 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -16,6 +16,8 @@ #define CPUID_SVM_BIT 2 #define CPUID_SVM BIT_ULL(CPUID_SVM_BIT) +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_HLT 0x078 #define SVM_EXIT_MSR 0x07c #define SVM_EXIT_VMMCALL 0x081 @@ -36,6 +38,16 @@ struct svm_test_data { uint64_t msr_gpa; }; +#define stgi() \ + __asm__ __volatile__( \ + "stgi\n" \ + ) + +#define clgi() \ + __asm__ __volatile__( \ + "clgi\n" \ + ) + struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index d12e043aa2ee..e161c6dd7a02 
100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -19,7 +19,6 @@ #include "vmx.h" #define VCPU_ID 5 -#define NMI_VECTOR 2 static int ud_count; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 21f5ca9197da..994b33fd8724 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -42,11 +42,6 @@ struct hv_enlightenments { */ #define VMCB_HV_NESTED_ENLIGHTENMENTS (1U << 31) -static inline void vmmcall(void) -{ - __asm__ __volatile__("vmmcall"); -} - void l2_guest_code(void) { GUEST_SYNC(3); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c new file mode 100644 index 000000000000..f94f1b449aef --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Oracle and/or its affiliates. + * + * Based on: + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + */ + +#include +#include +#include +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "test_util.h" +#include "../lib/kvm_util_internal.h" + +#define VCPU_ID 0 +#define INT_NR 0x20 +#define X86_FEATURE_NRIPS BIT(3) + +static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); + +static unsigned int bp_fired; +static void guest_bp_handler(struct ex_regs *regs) +{ + bp_fired++; +} + +static unsigned int int_fired; +static void l2_guest_code_int(void); + +static void guest_int_handler(struct ex_regs *regs) +{ + int_fired++; + GUEST_ASSERT_2(regs->rip == (unsigned long)l2_guest_code_int, + regs->rip, (unsigned long)l2_guest_code_int); +} + +static void l2_guest_code_int(void) +{ + GUEST_ASSERT_1(int_fired == 1, int_fired); + vmmcall(); + ud2(); + + GUEST_ASSERT_1(bp_fired == 1, bp_fired); + hlt(); +} + +static atomic_int nmi_stage; +#define nmi_stage_get() atomic_load_explicit(&nmi_stage, memory_order_acquire) +#define nmi_stage_inc() atomic_fetch_add_explicit(&nmi_stage, 1, memory_order_acq_rel) +static void guest_nmi_handler(struct ex_regs *regs) +{ + nmi_stage_inc(); + + if (nmi_stage_get() == 1) { + vmmcall(); + GUEST_ASSERT(false); + } else { + GUEST_ASSERT_1(nmi_stage_get() == 3, nmi_stage_get()); + GUEST_DONE(); + } +} + +static void l2_guest_code_nmi(void) +{ + ud2(); +} + +static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t idt_alt) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + if (is_nmi) + x2apic_enable(); + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, + is_nmi ? 
l2_guest_code_nmi : l2_guest_code_int, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + vmcb->control.intercept_exceptions |= BIT(PF_VECTOR) | BIT(UD_VECTOR); + vmcb->control.intercept |= BIT(INTERCEPT_NMI) | BIT(INTERCEPT_HLT); + + if (is_nmi) { + vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + } else { + vmcb->control.event_inj = INT_NR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_SOFT; + /* The return address pushed on stack */ + vmcb->control.next_rip = vmcb->save.rip; + } + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT_3(vmcb->control.exit_code == SVM_EXIT_VMMCALL, + vmcb->control.exit_code, + vmcb->control.exit_info_1, vmcb->control.exit_info_2); + + if (is_nmi) { + clgi(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_NMI); + + GUEST_ASSERT_1(nmi_stage_get() == 1, nmi_stage_get()); + nmi_stage_inc(); + + stgi(); + /* self-NMI happens here */ + while (true) + cpu_relax(); + } + + /* Skip over VMMCALL */ + vmcb->save.rip += 3; + + /* Switch to alternate IDT to cause intervening NPF again */ + vmcb->save.idtr.base = idt_alt; + vmcb->control.clean = 0; /* &= ~BIT(VMCB_DT) would be enough */ + + vmcb->control.event_inj = BP_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; + /* The return address pushed on stack, skip over UD2 */ + vmcb->control.next_rip = vmcb->save.rip + 2; + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT_3(vmcb->control.exit_code == SVM_EXIT_HLT, + vmcb->control.exit_code, + vmcb->control.exit_info_1, vmcb->control.exit_info_2); + + GUEST_DONE(); +} + +static void run_test(bool is_nmi) +{ + struct kvm_vm *vm; + vm_vaddr_t svm_gva; + vm_vaddr_t idt_alt_vm; + struct kvm_guest_debug debug; + + pr_info("Running %s test\n", is_nmi ? "NMI" : "soft int"); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); + vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); + vm_install_exception_handler(vm, INT_NR, guest_int_handler); + + vcpu_alloc_svm(vm, &svm_gva); + + if (!is_nmi) { + void *idt, *idt_alt; + + idt_alt_vm = vm_vaddr_alloc_page(vm); + idt_alt = addr_gva2hva(vm, idt_alt_vm); + idt = addr_gva2hva(vm, vm->idt); + memcpy(idt_alt, idt, getpagesize()); + } else { + idt_alt_vm = 0; + } + vcpu_args_set(vm, VCPU_ID, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); + + memset(&debug, 0, sizeof(debug)); + vcpu_set_guest_debug(vm, VCPU_ID, &debug); + + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + alarm(2); + vcpu_run(vm, VCPU_ID); + alarm(0); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld, vals = 0x%lx 0x%lx 0x%lx", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3], uc.args[4]); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_cpuid_entry2 *cpuid; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + nested_svm_check_supported(); + + cpuid = kvm_get_supported_cpuid_entry(0x8000000a); + TEST_ASSERT(cpuid->edx & X86_FEATURE_NRIPS, + "KVM with nSVM is supposed to unconditionally advertise nRIP Save\n"); + + atomic_init(&nmi_stage, 0); + + run_test(false); + 
run_test(true); + + return 0; +} -- cgit v1.2.3 From 753dcf7a8686a750fa6aa4b4ca42c6945fc75ac1 Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Fri, 22 Apr 2022 21:44:56 +0800 Subject: kvm: selftests: Add KVM_CAP_MAX_VCPU_ID cap test Basic test coverage of the KVM_CAP_MAX_VCPU_ID cap. This capability can be enabled before vCPU creation and can only be set once. If the assigned vCPU ID is beyond the KVM_CAP_MAX_VCPU_ID capability, vCPU creation will fail. Signed-off-by: Zeng Guang Message-Id: <20220422134456.26655-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 54 ++++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 82e764d71ca7..90a6dea2e84c 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -25,6 +25,7 @@ /x86_64/hyperv_cpuid /x86_64/hyperv_features /x86_64/hyperv_svm_test +/x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/mmu_role_test /x86_64/platform_info_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index f2647b88a8a0..a014368a2cd2 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -89,6 +89,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/amx_test +TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c new file mode 100644 index 000000000000..3f6c1ad86cc6 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * maximum APIC ID capability tests + * + * Copyright (C) 2022, Intel, Inc. 
+ * + * Tests for getting/setting maximum APIC ID capability + */ + +#include "kvm_util.h" +#include "../lib/kvm_util_internal.h" + +#define MAX_VCPU_ID 2 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_enable_cap cap = { 0 }; + int ret; + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + + /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ + ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); + + /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = ret + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID" + "beyond KVM cap!\n"); + + /* Set KVM_CAP_MAX_VCPU_ID */ + cap.cap = KVM_CAP_MAX_VCPU_ID; + cap.args[0] = MAX_VCPU_ID; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret == 0, + "Unexpected failure to enable KVM_CAP_MAX_VCPU_ID!\n"); + + /* Try to set KVM_CAP_MAX_VCPU_ID again */ + cap.args[0] = MAX_VCPU_ID + 1; + ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + TEST_ASSERT(ret < 0, + "Unexpected success to enable KVM_CAP_MAX_VCPU_ID again\n"); + + /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/ + ret = ioctl(vm->fd, KVM_CREATE_VCPU, MAX_VCPU_ID); + TEST_ASSERT(ret < 0, + "Unexpected success in creating a vCPU with VCPU ID out of range\n"); + + kvm_vm_free(vm); + return 0; +} -- cgit v1.2.3 From 5d9cd8b55cdc0fa2260bebab590430c8f90ce955 Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Wed, 11 May 2022 20:05:55 +0800 Subject: selftests: kvm: replace ternary operator with min() Fix the following coccicheck warnings: tools/testing/selftests/kvm/lib/s390x/ucall.c:25:15-17: WARNING opportunity for min() tools/testing/selftests/kvm/lib/x86_64/ucall.c:27:15-17: WARNING opportunity for min() tools/testing/selftests/kvm/lib/riscv/ucall.c:56:15-17: WARNING opportunity for min() tools/testing/selftests/kvm/lib/aarch64/ucall.c:82:15-17: WARNING opportunity for min() tools/testing/selftests/kvm/lib/aarch64/ucall.c:55:20-21: WARNING opportunity for min() min() is defined in tools/include/linux/kernel.h. Signed-off-by: Guo Zhengkui Acked-by: Claudio Imbrenda Acked-by: Anup Patel Message-Id: <20220511120621.36956-1-guozhengkui@vivo.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/aarch64/ucall.c | 4 ++-- tools/testing/selftests/kvm/lib/riscv/ucall.c | 2 +- tools/testing/selftests/kvm/lib/s390x/ucall.c | 2 +- tools/testing/selftests/kvm/lib/x86_64/ucall.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index e0b0164e9af8..00be3ef195ca 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -52,7 +52,7 @@ void ucall_init(struct kvm_vm *vm, void *arg) * lower and won't match physical addresses. */ bits = vm->va_bits - 1; - bits = vm->pa_bits < bits ? vm->pa_bits : bits; + bits = min(vm->pa_bits, bits); end = 1ul << bits; start = end * 5 / 8; step = end / 16; @@ -79,7 +79,7 @@ void ucall(uint64_t cmd, int nargs, ...) va_list va; int i; - nargs = nargs <= UCALL_MAX_ARGS ? 
nargs : UCALL_MAX_ARGS; + nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 8550f424d093..c2ed59f5783d 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -53,7 +53,7 @@ void ucall(uint64_t cmd, int nargs, ...) va_list va; int i; - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index 9d3b0f15249a..665267c1135d 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -22,7 +22,7 @@ void ucall(uint64_t cmd, int nargs, ...) va_list va; int i; - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index a3489973e290..2ea31a0ebe30 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -24,7 +24,7 @@ void ucall(uint64_t cmd, int nargs, ...) va_list va; int i; - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) -- cgit v1.2.3 From 30267b43c5b08260da7c76cacd28bf855b06ab93 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 24 May 2022 21:56:22 +0800 Subject: KVM: selftests: Add a test to get/set triple fault event Add a selftest for triple fault event: - launch the L2 and exit to userspace via I/O. - using KVM_SET_VCPU_EVENTS to pend a triple fault event. - with the immediate_exit, check the triple fault is pending. - run for real with pending triple fault and L1 can see the triple fault. 
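Condensed, the userspace side of that flow (distilled from the new test added below; not a standalone program) is:

  struct kvm_vcpu_events events;

  vcpu_run(vm, VCPU_ID);                  /* L2 exits to userspace via I/O */

  vcpu_events_get(vm, VCPU_ID, &events);
  events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
  events.triple_fault.pending = true;     /* pend the triple fault event */
  vcpu_events_set(vm, VCPU_ID, &events);

  run->immediate_exit = true;             /* complete the I/O, don't re-enter */
  vcpu_run_complete_io(vm, VCPU_ID);

  vcpu_events_get(vm, VCPU_ID, &events);  /* must still be pending */
  TEST_ASSERT(events.triple_fault.pending, "No triple fault pending");

  vcpu_run(vm, VCPU_ID);                  /* run for real; L1 sees the triple fault */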
Suggested-by: Sean Christopherson Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-3-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/x86_64/triple_fault_event_test.c | 101 +++++++++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 90a6dea2e84c..dd5c88c11059 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -58,6 +58,7 @@ /x86_64/xen_vmcall_test /x86_64/xss_msr_test /x86_64/vmx_pmu_caps_test +/x86_64/triple_fault_event_test /access_tracking_perf_test /demand_paging_test /dirty_log_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a014368a2cd2..27e432273180 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -90,6 +90,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += x86_64/amx_test TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test +TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c new file mode 100644 index 000000000000..6e1de0631ce9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 0 +#define ARBITRARY_IO_PORT 0x2000 + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + asm volatile("inb %%dx, %%al" + : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); +} + +void l1_guest_code(struct vmx_pages *vmx) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + /* L2 should triple fault after a triple fault event injected. 
*/ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); + GUEST_DONE(); +} + +int main(void) +{ + struct kvm_run *run; + struct kvm_vcpu_events events; + vm_vaddr_t vmx_pages_gva; + struct ucall uc; + + struct kvm_enable_cap cap = { + .cap = KVM_CAP_TRIPLE_FAULT_EVENT, + .args = {1} + }; + + if (!nested_vmx_supported()) { + print_skip("Nested VMX not supported"); + exit(KSFT_SKIP); + } + + if (!kvm_check_cap(KVM_CAP_TRIPLE_FAULT_EVENT)) { + print_skip("KVM_CAP_TRIPLE_FAULT_EVENT not supported"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm_enable_cap(vm, &cap); + + run = vcpu_state(vm, VCPU_ID); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_run(vm, VCPU_ID); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Expected KVM_EXIT_IO, got: %u (%s)\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, + "Expected IN from port %d from L2, got port %d", + ARBITRARY_IO_PORT, run->io.port); + vcpu_events_get(vm, VCPU_ID, &events); + events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + events.triple_fault.pending = true; + vcpu_events_set(vm, VCPU_ID, &events); + run->immediate_exit = true; + vcpu_run_complete_io(vm, VCPU_ID); + + vcpu_events_get(vm, VCPU_ID, &events); + TEST_ASSERT(events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT, + "Triple fault event invalid"); + TEST_ASSERT(events.triple_fault.pending, + "No triple fault pending"); + vcpu_run(vm, VCPU_ID); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + +} -- cgit v1.2.3 From 0b817059a8830b8bc3d50bb2402dea923cd89b01 Mon Sep 17 00:00:00 2001 From: Shahab Vahedi Date: Wed, 8 Jun 2022 14:29:28 +0000 Subject: bpftool: Fix bootstrapping during a cross compilation This change adjusts the Makefile to use "HOSTAR" as the archive tool, to keep the bootstrap part of the build process sane. For the rationale, please continue reading. When cross-compiling bpftool with buildroot, the invocation looks like: $ AR="/path/to/buildroot/host/bin/arc-linux-gcc-ar" \ CC="/path/to/buildroot/host/bin/arc-linux-gcc" \ ... make This in turn fails while building the bootstrap section: ----------------------------------8<---------------------------------- make: Entering directory '/src/bpftool-v6.7.0/src' ... libbfd: [ on ] ... disassembler-four-args: [ on ] ... zlib: [ on ] ... libcap: [ OFF ] ... clang-bpf-co-re: [ on ] <-- triggers bootstrap . . . LINK /src/bpftool-v6.7.0/src/bootstrap/bpftool /usr/bin/ld: /src/bpftool-v6.7.0/src/bootstrap/libbpf/libbpf.a: error adding symbols: archive has no index; run ranlib to add one collect2: error: ld returned 1 exit status make: *** [Makefile:211: /src/bpftool-v6.7.0/src/bootstrap/bpftool] Error 1 make: *** Waiting for unfinished jobs.... AR /src/bpftool-v6.7.0/src/libbpf/libbpf.a make[1]: Leaving directory '/src/bpftool-v6.7.0/libbpf/src' make: Leaving directory '/src/bpftool-v6.7.0/src' ---------------------------------->8---------------------------------- This occurs because setting "AR" confuses the build process for the bootstrap section and it calls "arc-linux-gcc-ar" to create and index "libbpf.a" instead of the host "ar". 
Signed-off-by: Shahab Vahedi Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Cc: Jean-Philippe Brucker Link: https://lore.kernel.org/bpf/8d297f0c-cfd0-ef6f-3970-6dddb3d9a87a@synopsys.com --- tools/bpf/bpftool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index c6d2c77d0252..c19e0e4c41bd 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -53,7 +53,7 @@ $(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_ $(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \ DESTDIR=$(LIBBPF_BOOTSTRAP_DESTDIR:/=) prefix= \ - ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) $@ install_headers + ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) AR=$(HOSTAR) $@ install_headers $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR) $(call QUIET_INSTALL, $@) -- cgit v1.2.3 From fe92833524e368e59bba9c57e00f7359f133667f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 Jun 2022 15:01:43 -0700 Subject: libbpf: Fix uprobe symbol file offset calculation logic Fix libbpf's bpf_program__attach_uprobe() logic for determining a function's *file offset* (which is what the kernel actually expects) when attaching a uprobe/uretprobe by function name. Previously, the calculation determined the virtual address offset relative to the base load address, which is not always the same as the file offset (though it very frequently is, which is why this went unnoticed for a while). Fixes: 433966e3ae04 ("libbpf: Support function name-based attach uprobes") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Cc: Riham Selim Cc: Alan Maguire Link: https://lore.kernel.org/bpf/20220606220143.3796908-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 63 ++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ed3cf0911b98..0781fae58a06 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11192,43 +11192,6 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, return pfd; } -/* uprobes deal in relative offsets; subtract the base address associated with - * the mapped binary. See Documentation/trace/uprobetracer.rst for more - * details. - */ -static long elf_find_relative_offset(const char *filename, Elf *elf, long addr) -{ - size_t n; - int i; - - if (elf_getphdrnum(elf, &n)) { - pr_warn("elf: failed to find program headers for '%s': %s\n", filename, - elf_errmsg(-1)); - return -ENOENT; - } - - for (i = 0; i < n; i++) { - int seg_start, seg_end, seg_offset; - GElf_Phdr phdr; - - if (!gelf_getphdr(elf, i, &phdr)) { - pr_warn("elf: failed to get program header %d from '%s': %s\n", i, filename, - elf_errmsg(-1)); - return -ENOENT; - } - if (phdr.p_type != PT_LOAD || !(phdr.p_flags & PF_X)) - continue; - - seg_start = phdr.p_vaddr; - seg_end = seg_start + phdr.p_memsz; - seg_offset = phdr.p_offset; - if (addr >= seg_start && addr < seg_end) - return addr - seg_start + seg_offset; - } - pr_warn("elf: failed to find prog header containing 0x%lx in '%s'\n", addr, filename); - return -ENOENT; -} - /* Return next ELF section of sh_type after scn, or first of that type if scn is NULL.
*/ static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) { @@ -11315,6 +11278,8 @@ static long elf_find_func_offset(const char *binary_path, const char *name) for (idx = 0; idx < nr_syms; idx++) { int curr_bind; GElf_Sym sym; + Elf_Scn *sym_scn; + GElf_Shdr sym_sh; if (!gelf_getsym(symbols, idx, &sym)) continue; @@ -11352,12 +11317,28 @@ static long elf_find_func_offset(const char *binary_path, const char *name) continue; } } - ret = sym.st_value; + + /* Transform symbol's virtual address (absolute for + * binaries and relative for shared libs) into file + * offset, which is what kernel is expecting for + * uprobe/uretprobe attachment. + * See Documentation/trace/uprobetracer.rst for more + * details. + * This is done by looking up symbol's containing + * section's header and using it's virtual address + * (sh_addr) and corresponding file offset (sh_offset) + * to transform sym.st_value (virtual address) into + * desired final file offset. + */ + sym_scn = elf_getscn(elf, sym.st_shndx); + if (!sym_scn) + continue; + if (!gelf_getshdr(sym_scn, &sym_sh)) + continue; + + ret = sym.st_value - sym_sh.sh_addr + sym_sh.sh_offset; last_bind = curr_bind; } - /* For binaries that are not shared libraries, we need relative offset */ - if (ret > 0 && !is_shared_lib) - ret = elf_find_relative_offset(binary_path, elf, ret); if (ret > 0) break; } -- cgit v1.2.3 From fa82cce7a6bbb35ecf7fe66231c7076052cf66d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jun 2022 16:11:34 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - gpl-2.0_385.RULE Based on the normalized pattern: licensed under the gpl v2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference. Reviewed-by: Allison Randal Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/sparc/vdso/vdso2c.c | 2 +- arch/x86/entry/vdso/vdso2c.c | 2 +- scripts/gcc-plugins/latent_entropy_plugin.c | 2 +- scripts/gcc-plugins/stackleak_plugin.c | 2 +- scripts/gcc-plugins/structleak_plugin.c | 2 +- tools/power/cpupower/debug/i386/dump_psb.c | 6 ++---- 6 files changed, 7 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/arch/sparc/vdso/vdso2c.c b/arch/sparc/vdso/vdso2c.c index ab7504176a7f..dc81240aab6f 100644 --- a/arch/sparc/vdso/vdso2c.c +++ b/arch/sparc/vdso/vdso2c.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * vdso2c - A vdso image preparation tool * Copyright (c) 2014 Andy Lutomirski and others - * Licensed under the GPL v2 * * vdso2c requires stripped and unstripped input. It would be trivial * to fully strip the input in here, but, for reasons described below, diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index edfe9780f6d1..90d15f2a7205 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * vdso2c - A vdso image preparation tool * Copyright (c) 2014 Andy Lutomirski and others - * Licensed under the GPL v2 * * vdso2c requires stripped and unstripped input. 
It would be trivial * to fully strip the input in here, but, for reasons described below, diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c index 848918764174..39e86be60dd2 100644 --- a/scripts/gcc-plugins/latent_entropy_plugin.c +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2012-2016 by the PaX Team * Copyright 2016 by Emese Revfy - * Licensed under the GPL v2 * * Note: the choice of the license means that the compilation process is * NOT 'eligible' as defined by gcc's library exception to the GPL v3, diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index ff91885f9470..c5c2ce113c92 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2011-2017 by the PaX Team * Modified by Alexander Popov - * Licensed under the GPL v2 * * Note: the choice of the license means that the compilation process is * NOT 'eligible' as defined by gcc's library exception to the GPL v3, diff --git a/scripts/gcc-plugins/structleak_plugin.c b/scripts/gcc-plugins/structleak_plugin.c index 8bc04068ed39..d8c744233832 100644 --- a/scripts/gcc-plugins/structleak_plugin.c +++ b/scripts/gcc-plugins/structleak_plugin.c @@ -1,6 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2013-2017 by PaX Team - * Licensed under the GPL v2 * * Note: the choice of the license means that the compilation process is * NOT 'eligible' as defined by gcc's library exception to the GPL v3, diff --git a/tools/power/cpupower/debug/i386/dump_psb.c b/tools/power/cpupower/debug/i386/dump_psb.c index 2c768cf70128..6fb81b42ea61 100644 --- a/tools/power/cpupower/debug/i386/dump_psb.c +++ b/tools/power/cpupower/debug/i386/dump_psb.c @@ -1,7 +1,5 @@ -/* - * dump_psb. (c) 2004, Dave Jones, Red Hat Inc. - * Licensed under the GPL v2. - */ +// SPDX-License-Identifier: GPL-2.0-only +// dump_psb. (c) 2004, Dave Jones, Red Hat Inc. #include #include -- cgit v1.2.3 From 8deb03e75f6048b33b80025b6475c92975670c5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 12:16:53 -0700 Subject: KVM: Fix references to non-existent KVM_CAP_TRIPLE_FAULT_EVENT The x86-only KVM_CAP_TRIPLE_FAULT_EVENT was (appropriately) renamed to KVM_CAP_X86_TRIPLE_FAULT_EVENT when the patches were applied, but the docs and selftests got left behind. Fix them. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 ++-- tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9cbbfdb663b6..84c486ce6279 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1152,7 +1152,7 @@ The following bits are defined in the flags field: - KVM_VCPUEVENT_VALID_TRIPLE_FAULT may be set to signal that the triple_fault_pending field contains a valid state. This bit will - be set whenever KVM_CAP_TRIPLE_FAULT_EVENT is enabled. + be set whenever KVM_CAP_X86_TRIPLE_FAULT_EVENT is enabled. ARM64: ^^^^^^ @@ -1249,7 +1249,7 @@ can be set in the flags field to signal that the exception_has_payload, exception_payload, and exception.pending fields contain a valid state and shall be written into the VCPU. 
-If KVM_CAP_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT +If KVM_CAP_X86_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT can be set in flags field to signal that the triple_fault field contains a valid state and shall be written into the VCPU. diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 6e1de0631ce9..66378140764d 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -47,7 +47,7 @@ int main(void) struct ucall uc; struct kvm_enable_cap cap = { - .cap = KVM_CAP_TRIPLE_FAULT_EVENT, + .cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT, .args = {1} }; @@ -56,8 +56,8 @@ int main(void) exit(KSFT_SKIP); } - if (!kvm_check_cap(KVM_CAP_TRIPLE_FAULT_EVENT)) { - print_skip("KVM_CAP_TRIPLE_FAULT_EVENT not supported"); + if (!kvm_check_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)) { + print_skip("KVM_CAP_X86_TRIPLE_FAULT_EVENT not supported"); exit(KSFT_SKIP); } -- cgit v1.2.3 From 1ca378f65378864b33a0356478252bd49bf6ef86 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Feb 2022 14:52:12 -0800 Subject: KVM: selftests: Fix buggy-but-benign check in test_v3_new_redist_regions() Update 'ret' with the return value of _kvm_device_access() prior to asserting that ret is non-zero. In the current code base, the flaw is benign as 'ret' is guaranteed to be -EBUSY from the previous run_vcpu(), which also means that errno==EBUSY prior to _kvm_device_access(), thus the "errno == EFAULT" part of the assert means that a false negative is impossible (unless the kernel is being truly mean and spuriously setting errno=EFAULT while returning success). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 34379c98d2f4..0f046e3e953d 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -381,8 +381,8 @@ static void test_v3_new_redist_regions(void) v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); subtest_v3_redist_regions(&v); - _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true); + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true); TEST_ASSERT(ret && errno == EFAULT, "register a third region allowing to cover the 4 vcpus"); -- cgit v1.2.3 From ff624e57d8dfb6c66ff8118aca7bad61d75428f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:47:05 -0800 Subject: KVM: selftests: Fix typo in vgic_init test When iterating over vCPUs, invoke access_v3_redist_reg() on the "current" vCPU instead of vCPU0, which is presumably what was intended by iterating over all vCPUs. 
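The expected value in the fixed assertion follows from the register layout: per the GICv3 architecture, GICR_TYPER reports the redistributor's Processor_Number in bits [23:8], so with sequentially numbered vCPUs, redistributor i should read back i << 8. A condensed sketch of the corrected loop (mirroring the hunk below):

	for (i = 0; i < NR_VCPUS; i++) {
		/* Processor_Number sits in bits [23:8], hence i * 0x100. */
		ret = access_v3_redist_reg(v.gic_fd, i, GICR_TYPER, &val, false);
		TEST_ASSERT(!ret && val == i * 0x100, "read GICR_TYPER");
	}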
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 0f046e3e953d..defd196c69d2 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -426,8 +426,9 @@ static void test_v3_typer_accesses(void) KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); for (i = 0; i < NR_VCPUS ; i++) { - ret = access_v3_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); - TEST_ASSERT(!ret && !val, "read GICR_TYPER before rdist region setting"); + ret = access_v3_redist_reg(v.gic_fd, i, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == i * 0x100, + "read GICR_TYPER before rdist region setting"); } addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); -- cgit v1.2.3 From d379749fdab68d0eee05e149a25f8ef45d2f19bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 15:25:35 -0700 Subject: KVM: selftests: Drop stale declarations from kvm_util_base.h Drop declarations for allocate_kvm_dirty_log() and vm_create_device(), which no longer have implementations. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 92cef0ffb19e..47b77ebda6a3 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -381,11 +381,6 @@ struct kvm_userspace_memory_region * kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end); -struct kvm_dirty_log * -allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region); - -int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd); - #define sync_global_to_guest(vm, g) ({ \ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ memcpy(_p, &(g), sizeof(g)); \ -- cgit v1.2.3 From ccc82ba6bea45189a516c97480b2b70406832f35 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Feb 2022 17:12:38 -0800 Subject: KVM: selftests: Always open VM file descriptors with O_RDWR Drop the @perm param from vm_create() and always open VM file descriptors with O_RDWR. There's no legitimate use case for other permissions, and if a selftest wants to do oddball negative testing it can open code the necessary bits instead of forcing a bunch of tests to provide useless information. 
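The conversion at each call site is mechanical; condensed from the hunks below:

	/* Before: a perm argument that no caller ever varied. */
	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);

	/* After: the VM file descriptor is always opened O_RDWR. */
	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES);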
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 2 +- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 +- .../selftests/kvm/aarch64/vcpu_width_config.c | 6 +++--- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- tools/testing/selftests/kvm/hardware_disable_test.c | 2 +- tools/testing/selftests/kvm/include/kvm_util_base.h | 4 ++-- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 2 +- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 20 +++++++++----------- tools/testing/selftests/kvm/set_memory_region_test.c | 4 ++-- tools/testing/selftests/kvm/x86_64/amx_test.c | 2 +- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 2 +- .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 2 +- tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 2 +- tools/testing/selftests/kvm/x86_64/set_sregs_test.c | 2 +- .../testing/selftests/kvm/x86_64/sev_migrate_tests.c | 8 ++++---- tools/testing/selftests/kvm/x86_64/smm_test.c | 2 +- tools/testing/selftests/kvm/x86_64/state_test.c | 2 +- .../selftests/kvm/x86_64/vmx_preemption_timer_test.c | 2 +- 19 files changed, 34 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index d3a7dbfcbb3d..dd549cc75869 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -416,7 +416,7 @@ static void run_test(struct vcpu_config *c) check_supported(c); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); prepare_vcpu_init(c, &init); aarch64_vcpu_add_default(vm, 0, &init, NULL); finalize_vcpu(vm, 0, c); diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 88541de21c41..de3b5e176d04 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -78,7 +78,7 @@ static struct kvm_vm *setup_vm(void *guest_code) struct kvm_vcpu_init init; struct kvm_vm *vm; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); kvm_vm_elf_load(vm, program_invocation_name); ucall_init(vm, NULL); diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index 6e9402679229..d48129349213 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -24,7 +24,7 @@ static int add_init_2vcpus(struct kvm_vcpu_init *init1, struct kvm_vm *vm; int ret; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); vm_vcpu_add(vm, 0); ret = _vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); @@ -49,7 +49,7 @@ static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init1, struct kvm_vm *vm; int ret; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); vm_vcpu_add(vm, 0); vm_vcpu_add(vm, 1); @@ -86,7 +86,7 @@ int main(void) } /* Get the preferred target type and copy that to init2 for later use */ - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init1); 
kvm_vm_free(vm); init2 = init1; diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 3fcd89e195c7..11bf606e3165 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -679,7 +679,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); kvm_vm_elf_load(vm, program_invocation_name); #ifdef __x86_64__ vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index b21c69a56daa..1c9e2295c75b 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -104,7 +104,7 @@ static void run_test(uint32_t run) for (i = 0; i < VCPU_NUM; i++) CPU_SET(i, &cpu_set); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 47b77ebda6a3..89b633b40247 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -110,9 +110,9 @@ int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages); void kvm_vm_free(struct kvm_vm *vmp); -void kvm_vm_restart(struct kvm_vm *vmp, int perm); +void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 17f65d514915..6217f4630e6c 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -230,7 +230,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(vms, "Allocate memory for storing VM pointers"); for (i = 0; i < max_vm; ++i) { vms[i] = vm_create(VM_MODE_DEFAULT, - DEFAULT_GUEST_PHY_PAGES, O_RDWR); + DEFAULT_GUEST_PHY_PAGES); for (j = 0; j < max_vcpu; ++j) vm_vcpu_add(vms[i], j); } diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index aed9dc3ca1e9..bb69b75eac23 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -28,7 +28,7 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus) pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n", num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++) /* This asserts that the vCPU was created. 
*/ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1665a220abcb..da7e3369f4b8 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -173,9 +173,9 @@ void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) vm->dirty_ring_size = ring_size; } -static void vm_open(struct kvm_vm *vm, int perm) +static void vm_open(struct kvm_vm *vm) { - vm->kvm_fd = _open_kvm_dev_path_or_exit(perm); + vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR); if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { print_skip("immediate_exit not available"); @@ -240,7 +240,6 @@ _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) * Input Args: * mode - VM Mode (e.g. VM_MODE_P52V48_4K) * phy_pages - Physical memory pages - * perm - permission * * Output Args: None * @@ -253,12 +252,12 @@ _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) * descriptor to control the created VM is created with the permissions * given by perm (e.g. O_RDWR). */ -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages) { struct kvm_vm *vm; - pr_debug("%s: mode='%s' pages='%ld' perm='%d'\n", __func__, - vm_guest_mode_string(mode), phy_pages, perm); + pr_debug("%s: mode='%s' pages='%ld'\n", __func__, + vm_guest_mode_string(mode), phy_pages); vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); @@ -340,7 +339,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); #endif - vm_open(vm, perm); + vm_open(vm); /* Limit to VA-bit canonical virtual addresses. */ vm->vpages_valid = sparsebit_alloc(); @@ -366,7 +365,7 @@ struct kvm_vm *vm_create_without_vcpus(enum vm_guest_mode mode, uint64_t pages) { struct kvm_vm *vm; - vm = vm_create(mode, pages, O_RDWR); + vm = vm_create(mode, pages); kvm_vm_elf_load(vm, program_invocation_name); @@ -458,7 +457,6 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, * * Input Args: * vm - VM that has been released before - * perm - permission * * Output Args: None * @@ -466,12 +464,12 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, * global state, such as the irqchip and the memory regions that are mapped * into the guest. 
*/ -void kvm_vm_restart(struct kvm_vm *vmp, int perm) +void kvm_vm_restart(struct kvm_vm *vmp) { int ctr; struct userspace_mem_region *region; - vm_open(vmp, perm); + vm_open(vmp); if (vmp->has_irqchip) vm_create_irqchip(vmp); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 73bc297dabe6..d97cfd6866c3 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -314,7 +314,7 @@ static void test_zero_memory_regions(void) pr_info("Testing KVM_RUN with zero added memory regions\n"); - vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, 0); vm_vcpu_add(vm, VCPU_ID); TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64), @@ -354,7 +354,7 @@ static void test_add_max_memory_regions(void) "KVM_CAP_NR_MEMSLOTS should be greater than 0"); pr_info("Allowed number of memory slots: %i\n", max_mem_slots); - vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, 0); /* Check it can be added memory slots up to the maximum allowed */ pr_info("Adding slots 0..%i, each memory region with %dK size\n", diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 76f65c22796f..2f01247da0b5 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -431,7 +431,7 @@ int main(int argc, char *argv[]) kvm_vm_release(vm); /* Restore state in a new VM. */ - kvm_vm_restart(vm, O_RDWR); + kvm_vm_restart(vm); vm_vcpu_add(vm, VCPU_ID); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index e161c6dd7a02..78668605f673 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -183,7 +183,7 @@ static void save_restore_vm(struct kvm_vm *vm) kvm_vm_release(vm); /* Restore state in a new VM. 
*/ - kvm_vm_restart(vm, O_RDWR); + kvm_vm_restart(vm); vm_vcpu_add(vm, VCPU_ID); vcpu_set_hv_cpuid(vm, VCPU_ID); vcpu_enable_evmcs(vm, VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c index 3f6c1ad86cc6..28cc316c5dbe 100644 --- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -18,7 +18,7 @@ int main(int argc, char *argv[]) struct kvm_enable_cap cap = { 0 }; int ret; - vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, 0); /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index ae76436af0cc..2fe893ccedd0 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -88,7 +88,7 @@ static struct kvm_vm *create_vm(void) uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); - vm = vm_create(VM_MODE_DEFAULT, pages, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, pages); kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 318be0bf77ab..44711ab735c3 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -95,7 +95,7 @@ int main(int argc, char *argv[]) * use it to verify all supported CR4 bits can be set prior to defining * the vCPU model, i.e. without doing KVM_SET_CPUID2. */ - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); vm_vcpu_add(vm, VCPU_ID); vcpu_sregs_get(vm, VCPU_ID, &sregs); diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index d1dc1acf997c..b0c052443c44 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -54,7 +54,7 @@ static struct kvm_vm *sev_vm_create(bool es) struct kvm_sev_launch_start start = { 0 }; int i; - vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, 0); sev_ioctl(vm->fd, es ? 
KVM_SEV_ES_INIT : KVM_SEV_INIT, NULL); for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) vm_vcpu_add(vm, i); @@ -71,7 +71,7 @@ static struct kvm_vm *aux_vm_create(bool with_vcpus) struct kvm_vm *vm; int i; - vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, 0); if (!with_vcpus) return vm; @@ -174,7 +174,7 @@ static void test_sev_migrate_parameters(void) *sev_es_vm_no_vmsa; int ret; - vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0); vm_no_sev = aux_vm_create(true); ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); TEST_ASSERT(ret == -1 && errno == EINVAL, @@ -186,7 +186,7 @@ static void test_sev_migrate_parameters(void) sev_vm = sev_vm_create(/* es= */ false); sev_es_vm = sev_vm_create(/* es= */ true); - sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0); sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index b4e0c860769e..dd2c1522ab90 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) state = vcpu_save_state(vm, VCPU_ID); kvm_vm_release(vm); - kvm_vm_restart(vm, O_RDWR); + kvm_vm_restart(vm); vm_vcpu_add(vm, VCPU_ID); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 2e0a92da8ff5..41f7faaef2ac 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -213,7 +213,7 @@ int main(int argc, char *argv[]) kvm_vm_release(vm); /* Restore state in a new VM. */ - kvm_vm_restart(vm, O_RDWR); + kvm_vm_restart(vm); vm_vcpu_add(vm, VCPU_ID); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index ff92e25b6f1e..f5b4ae914131 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -239,7 +239,7 @@ int main(int argc, char *argv[]) kvm_vm_release(vm); /* Restore state in a new VM. */ - kvm_vm_restart(vm, O_RDWR); + kvm_vm_restart(vm); vm_vcpu_add(vm, VCPU_ID); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); vcpu_load_state(vm, VCPU_ID, state); -- cgit v1.2.3 From 2b38a7398f20dbbfa88689819b21967489d284f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 10:27:55 -0800 Subject: KVM: selftests: Add another underscore to inner ioctl() helpers Add a second underscore to inner ioctl() helpers to better align with commonly accepted kernel coding style, and to allow using a single underscore variant in the future for macro shenanigans. 
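The resulting convention matches common kernel style: the double-underscore helper returns the raw ioctl() result so negative tests can inspect errno themselves, while the plain helper asserts success. Condensed from the header hunk below:

	/* Inner helper: returns the raw result; the caller checks errno. */
	int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
			 unsigned long ioctl, void *arg);

	/* Outer helper: asserts that the ioctl succeeded. */
	void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
			unsigned long ioctl, void *arg);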
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 6 +++--- tools/testing/selftests/kvm/aarch64/hypercalls.c | 2 +- .../selftests/kvm/aarch64/vcpu_width_config.c | 8 ++++---- tools/testing/selftests/kvm/aarch64/vgic_init.c | 2 +- .../testing/selftests/kvm/include/kvm_util_base.h | 8 ++++---- tools/testing/selftests/kvm/lib/kvm_util.c | 24 +++++++++++----------- tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- tools/testing/selftests/kvm/s390x/memop.c | 4 ++-- tools/testing/selftests/kvm/s390x/resets.c | 4 ++-- tools/testing/selftests/kvm/steal_time.c | 6 +++--- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 4 ++-- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 6 +++--- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 2 +- 13 files changed, 39 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index dd549cc75869..441c98ffb812 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -457,7 +457,7 @@ static void run_test(struct vcpu_config *c) bool reject_reg = false; int ret; - ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); + ret = __vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); if (ret) { printf("%s: Failed to get ", config_name(c)); print_reg(c, reg.id); @@ -469,7 +469,7 @@ static void run_test(struct vcpu_config *c) for_each_sublist(c, s) { if (s->rejects_set && find_reg(s->rejects_set, s->rejects_set_n, reg.id)) { reject_reg = true; - ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + ret = __vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); if (ret != -1 || errno != EPERM) { printf("%s: Failed to reject (ret=%d, errno=%d) ", config_name(c), ret, errno); print_reg(c, reg.id); @@ -481,7 +481,7 @@ static void run_test(struct vcpu_config *c) } if (!reject_reg) { - ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + ret = __vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); if (ret) { printf("%s: Failed to set ", config_name(c)); print_reg(c, reg.id); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 41e0210b7a5e..1eb9738453b4 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -148,7 +148,7 @@ static int set_fw_reg(struct kvm_vm *vm, uint64_t id, uint64_t val) .addr = (uint64_t)&val, }; - return _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + return __vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); } static void get_fw_reg(struct kvm_vm *vm, uint64_t id, uint64_t *addr) diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index d48129349213..271fa90e53fd 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -27,12 +27,12 @@ static int add_init_2vcpus(struct kvm_vcpu_init *init1, vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); vm_vcpu_add(vm, 0); - ret = _vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); + ret = __vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); if (ret) goto free_exit; vm_vcpu_add(vm, 1); - ret = _vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); + ret = __vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); free_exit: kvm_vm_free(vm); @@ -54,11 +54,11 @@ static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init1, vm_vcpu_add(vm, 0); vm_vcpu_add(vm, 1); - ret = _vcpu_ioctl(vm, 0, 
KVM_ARM_VCPU_INIT, init1); + ret = __vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); if (ret) goto free_exit; - ret = _vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); + ret = __vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); free_exit: kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index defd196c69d2..4b7e7ba43969 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -55,7 +55,7 @@ static void guest_code(void) static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) { ucall_init(vm, NULL); - int ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + int ret = __vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); if (ret) return -errno; return 0; diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 89b633b40247..662579a6358b 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -159,12 +159,12 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void *arg); -int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, - void *arg); +int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, + void *arg); void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); -int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); +int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); -int _kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); +int __kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index da7e3369f4b8..03c1f885a98b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1719,7 +1719,7 @@ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; int ret; - ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, ®_list_n); + ret = __vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, ®_list_n); TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); reg_list->n = reg_list_n.n; @@ -1905,7 +1905,7 @@ void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) { int ret; - ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); + ret = __vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)", ret, errno, strerror(errno)); } @@ -1914,7 +1914,7 @@ void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) { int ret; - ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); + ret = __vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)", ret, errno, strerror(errno)); } @@ -1923,7 +1923,7 @@ void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) { int ret; - ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); + ret = __vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); TEST_ASSERT(ret == 0, 
"KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)", ret, errno, strerror(errno)); } @@ -1932,7 +1932,7 @@ void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) { int ret; - ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); + ret = __vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i (%s)", ret, errno, strerror(errno)); } @@ -1955,13 +1955,13 @@ void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, { int ret; - ret = _vcpu_ioctl(vm, vcpuid, cmd, arg); + ret = __vcpu_ioctl(vm, vcpuid, cmd, arg); TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", cmd, ret, errno, strerror(errno)); } -int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, - unsigned long cmd, void *arg) +int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, + unsigned long cmd, void *arg) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; @@ -2025,12 +2025,12 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) { int ret; - ret = _vm_ioctl(vm, cmd, arg); + ret = __vm_ioctl(vm, cmd, arg); TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", cmd, ret, errno, strerror(errno)); } -int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) { return ioctl(vm->fd, cmd, arg); } @@ -2056,7 +2056,7 @@ void kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) cmd, ret, errno, strerror(errno)); } -int _kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +int __kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) { return ioctl(vm->kvm_fd, cmd, arg); } @@ -2185,7 +2185,7 @@ int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) .level = level, }; - return _vm_ioctl(vm, KVM_IRQ_LINE, &irq_level); + return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level); } void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index abc0ae5a4fe1..c89e6b1fbfb1 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -295,7 +295,7 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) * are powered-on using KVM_SET_MP_STATE ioctl(). */ mps.mp_state = KVM_MP_STATE_RUNNABLE; - r = _vcpu_ioctl(vm, vcpuid, KVM_SET_MP_STATE, &mps); + r = __vcpu_ioctl(vm, vcpuid, KVM_SET_MP_STATE, &mps); TEST_ASSERT(!r, "IOCTL KVM_SET_MP_STATE failed (error %d)", r); /* Setup global pointer of guest to be same as the host */ diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index e704c6fa5758..230d4b6301e6 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -159,9 +159,9 @@ static void memop_ioctl(struct test_vcpu vcpu, struct kvm_s390_mem_op *ksmo) static int err_memop_ioctl(struct test_vcpu vcpu, struct kvm_s390_mem_op *ksmo) { if (vcpu.id == VM_VCPU_ID) - return _vm_ioctl(vcpu.vm, KVM_S390_MEM_OP, ksmo); + return __vm_ioctl(vcpu.vm, KVM_S390_MEM_OP, ksmo); else - return _vcpu_ioctl(vcpu.vm, vcpu.id, KVM_S390_MEM_OP, ksmo); + return __vcpu_ioctl(vcpu.vm, vcpu.id, KVM_S390_MEM_OP, ksmo); } #define MEMOP(err, vcpu_p, mop_target_p, access_mode_p, buf_p, size_p, ...) 
\ diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index 889449a22e7a..298bff3fd52e 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -77,7 +77,7 @@ static void assert_noirq(void) irq_state.len = sizeof(buf); irq_state.buf = (unsigned long)buf; - irqs = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state); + irqs = __vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state); /* * irqs contains the number of retrieved interrupts. Any interrupt * (notably, the emergency call interrupt we have injected) should @@ -197,7 +197,7 @@ static void inject_irq(int cpu_id) irq_state.buf = (unsigned long)buf; irq->type = KVM_S390_INT_EMERGENCY; irq->u.emerg.code = cpu_id; - irqs = _vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state); + irqs = __vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state); TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno); } diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 8c4e811bd586..75303fe8359d 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -166,7 +166,7 @@ static void steal_time_init(struct kvm_vm *vm) }; int i, ret; - ret = _vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev); + ret = __vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev); if (ret != 0 && errno == ENXIO) { print_skip("steal-time not supported"); exit(KSFT_SKIP); @@ -184,13 +184,13 @@ static void steal_time_init(struct kvm_vm *vm) sync_global_to_guest(vm, st_gva[i]); st_ipa = (ulong)st_gva[i] | 1; - ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + ret = __vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); st_ipa = (ulong)st_gva[i]; vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); - ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + ret = __vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 8c245ab2d98a..7e45a3df8f98 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -121,9 +121,9 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system) int ret; if (!system) - ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + ret = __vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); else - ret = _kvm_ioctl(vm, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + ret = __kvm_ioctl(vm, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); TEST_ASSERT(ret == -1 && errno == E2BIG, "%s KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 2fe893ccedd0..ee3d058a9fe1 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -42,7 +42,7 @@ static void test_set_boot_busy(struct kvm_vm *vm) { int res; - res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID0); + res = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID0); TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set while running vm"); } @@ -133,13 +133,13 @@ static void check_set_bsp_busy(void) add_x86_vcpu(vm, VCPU_ID0, true); add_x86_vcpu(vm, VCPU_ID1, false); - res = 
_vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + res = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set after adding vcpu"); run_vcpu(vm, VCPU_ID0); run_vcpu(vm, VCPU_ID1); - res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + res = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set to a terminated vcpu"); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index 280c01fd2412..c35ada9f7f9c 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -186,7 +186,7 @@ int main(int argc, char *argv[]) vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); - tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); + tsc_khz = __vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); /* scale down L1's TSC frequency */ -- cgit v1.2.3 From 02e04c15caeeb64a45d350a8fdeb32620b64a02d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 10:15:00 -0800 Subject: KVM: selftests: Make vcpu_ioctl() a wrapper to pretty print ioctl name Make vcpu_ioctl() a macro wrapper and pretty-print the _name_ of the ioctl on failure instead of the number. Add inner macros to allow handling cases where the name of the ioctl needs to be resolved higher up the stack, and to allow using the formatting for non-ioctl syscalls without being technically wrong. Deliberately do not use __stringify(), as that will expand the ioctl all the way down to its numerical sequence; again, the intent is to print the name of the macro.
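The mechanism is single-level preprocessor stringification: #ioctl captures the argument exactly as spelled at the call site, whereas __stringify() would first expand, say, KVM_RUN down to its _IO() encoding before stringifying. Condensed from the hunks below:

	#define vcpu_ioctl(vm, vcpuid, ioctl, arg) \
		_vcpu_ioctl(vm, vcpuid, ioctl, #ioctl, arg)

	/* A failing vcpu_ioctl(vm, 0, KVM_RUN, NULL) now reports
	 * "KVM_RUN failed ..." instead of an opaque ioctl number.
	 */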
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 13 +++++++-- tools/testing/selftests/kvm/lib/kvm_util.c | 31 ++++++---------------- 2 files changed, 19 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 662579a6358b..00f3103dc85e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -157,10 +157,19 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, uint64_t guest_paddr, uint32_t slot, uint64_t npages, uint32_t flags); -void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, - void *arg); +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) +#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) + +void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, + const char *name, void *arg); int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void *arg); +#define vcpu_ioctl(vm, vcpuid, ioctl, arg) \ + _vcpu_ioctl(vm, vcpuid, ioctl, #ioctl, arg) + void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 03c1f885a98b..fdcaf74b5959 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1937,29 +1937,6 @@ void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) ret, errno, strerror(errno)); } -/* - * VCPU Ioctl - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * cmd - Ioctl number - * arg - Argument to pass to the ioctl - * - * Return: None - * - * Issues an arbitrary ioctl on a VCPU fd. - */ -void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, - unsigned long cmd, void *arg) -{ - int ret; - - ret = __vcpu_ioctl(vm, vcpuid, cmd, arg); - TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", - cmd, ret, errno, strerror(errno)); -} - int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, void *arg) { @@ -1973,6 +1950,14 @@ int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, return ret; } +void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, + const char *name, void *arg) +{ + int ret = __vcpu_ioctl(vm, vcpuid, cmd, arg); + + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); +} + void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu; -- cgit v1.2.3 From 2ab2c307c734266e3c3e89d14d39c3b2327cb750 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 14 Feb 2022 17:20:17 -0800 Subject: KVM: selftests: Drop @mode from common vm_create() helper Drop @mode from vm_create() and have it use VM_MODE_DEFAULT. Add and use an inner helper, __vm_create(), to service the handful of tests that want something other than VM_MODE_DEFAULT. 
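This mirrors the inner/outer split used for the ioctl helpers; condensed from the hunks below:

	struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages);

	/* The common case: default mode, i.e. default address widths and page size. */
	struct kvm_vm *vm_create(uint64_t phy_pages)
	{
		return __vm_create(VM_MODE_DEFAULT, phy_pages);
	}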
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 2 +- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 +- .../selftests/kvm/aarch64/vcpu_width_config.c | 6 ++-- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- .../testing/selftests/kvm/hardware_disable_test.c | 2 +- .../testing/selftests/kvm/include/kvm_util_base.h | 3 +- .../testing/selftests/kvm/kvm_binary_stats_test.c | 3 +- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 42 +++++++++++----------- .../testing/selftests/kvm/set_memory_region_test.c | 4 +-- .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 2 +- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 2 +- .../testing/selftests/kvm/x86_64/set_sregs_test.c | 2 +- .../selftests/kvm/x86_64/sev_migrate_tests.c | 8 ++--- 14 files changed, 42 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 441c98ffb812..ecfb773ec41e 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -416,7 +416,7 @@ static void run_test(struct vcpu_config *c) check_supported(c); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); prepare_vcpu_init(c, &init); aarch64_vcpu_add_default(vm, 0, &init, NULL); finalize_vcpu(vm, 0, c); diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index de3b5e176d04..024a84064f1f 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -78,7 +78,7 @@ static struct kvm_vm *setup_vm(void *guest_code) struct kvm_vcpu_init init; struct kvm_vm *vm; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); kvm_vm_elf_load(vm, program_invocation_name); ucall_init(vm, NULL); diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index 271fa90e53fd..4145c28a245a 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -24,7 +24,7 @@ static int add_init_2vcpus(struct kvm_vcpu_init *init1, struct kvm_vm *vm; int ret; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); vm_vcpu_add(vm, 0); ret = __vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); @@ -49,7 +49,7 @@ static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init1, struct kvm_vm *vm; int ret; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); vm_vcpu_add(vm, 0); vm_vcpu_add(vm, 1); @@ -86,7 +86,7 @@ int main(void) } /* Get the preferred target type and copy that to init2 for later use */ - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init1); kvm_vm_free(vm); init2 = init1; diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 11bf606e3165..01c01d40201f 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -679,7 +679,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", 
vm_guest_mode_string(mode)); - vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); + vm = __vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); kvm_vm_elf_load(vm, program_invocation_name); #ifdef __x86_64__ vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 1c9e2295c75b..81ba8645772a 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -104,7 +104,7 @@ static void run_test(uint32_t run) for (i = 0; i < VCPU_NUM; i++) CPU_SET(i, &cpu_set); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 00f3103dc85e..f6984b0c3816 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -110,7 +110,8 @@ int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages); +struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages); +struct kvm_vm *vm_create(uint64_t phy_pages); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 6217f4630e6c..4b149b383678 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -229,8 +229,7 @@ int main(int argc, char *argv[]) vms = malloc(sizeof(vms[0]) * max_vm); TEST_ASSERT(vms, "Allocate memory for storing VM pointers"); for (i = 0; i < max_vm; ++i) { - vms[i] = vm_create(VM_MODE_DEFAULT, - DEFAULT_GUEST_PHY_PAGES); + vms[i] = vm_create(DEFAULT_GUEST_PHY_PAGES); for (j = 0; j < max_vcpu; ++j) vm_vcpu_add(vms[i], j); } diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index bb69b75eac23..9de5e1376c49 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -28,7 +28,7 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus) pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n", num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++) /* This asserts that the vCPU was created. */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index fdcaf74b5959..bab4ab297fcc 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -234,25 +234,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); -/* - * VM Create - * - * Input Args: - * mode - VM Mode (e.g. VM_MODE_P52V48_4K) - * phy_pages - Physical memory pages - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. 
- * - * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). - * When phy_pages is non-zero, a memory region of phy_pages physical pages - * is created and mapped starting at guest physical address 0. The file - * descriptor to control the created VM is created with the permissions - * given by perm (e.g. O_RDWR). - */ -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages) +struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages) { struct kvm_vm *vm; @@ -361,11 +343,31 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages) return vm; } +/* + * VM Create + * + * Input Args: + * phy_pages - Physical memory pages + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the default physical/virtual address widths and page size. + * When phy_pages is non-zero, a memory region of phy_pages physical pages + * is created and mapped starting at guest physical address 0. + */ +struct kvm_vm *vm_create(uint64_t phy_pages) +{ + return __vm_create(VM_MODE_DEFAULT, phy_pages); +} + struct kvm_vm *vm_create_without_vcpus(enum vm_guest_mode mode, uint64_t pages) { struct kvm_vm *vm; - vm = vm_create(mode, pages); + vm = __vm_create(mode, pages); kvm_vm_elf_load(vm, program_invocation_name); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index d97cfd6866c3..89b13f23c3ac 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -314,7 +314,7 @@ static void test_zero_memory_regions(void) pr_info("Testing KVM_RUN with zero added memory regions\n"); - vm = vm_create(VM_MODE_DEFAULT, 0); + vm = vm_create(0); vm_vcpu_add(vm, VCPU_ID); TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64), @@ -354,7 +354,7 @@ static void test_add_max_memory_regions(void) "KVM_CAP_NR_MEMSLOTS should be greater than 0"); pr_info("Allowed number of memory slots: %i\n", max_mem_slots); - vm = vm_create(VM_MODE_DEFAULT, 0); + vm = vm_create(0); /* Check it can be added memory slots up to the maximum allowed */ pr_info("Adding slots 0..%i, each memory region with %dK size\n", diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c index 28cc316c5dbe..e83afd4bb4cf 100644 --- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -18,7 +18,7 @@ int main(int argc, char *argv[]) struct kvm_enable_cap cap = { 0 }; int ret; - vm = vm_create(VM_MODE_DEFAULT, 0); + vm = vm_create(0); /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index ee3d058a9fe1..b4da92ddc1c6 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -88,7 +88,7 @@ static struct kvm_vm *create_vm(void) uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); - vm = vm_create(VM_MODE_DEFAULT, pages); + vm = vm_create(pages); kvm_vm_elf_load(vm, program_invocation_name); vm_create_irqchip(vm); diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 44711ab735c3..4dc7fd925023 
100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -95,7 +95,7 @@ int main(int argc, char *argv[]) * use it to verify all supported CR4 bits can be set prior to defining * the vCPU model, i.e. without doing KVM_SET_CPUID2. */ - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); vm_vcpu_add(vm, VCPU_ID); vcpu_sregs_get(vm, VCPU_ID, &sregs); diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index b0c052443c44..7424bec5ae23 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -54,7 +54,7 @@ static struct kvm_vm *sev_vm_create(bool es) struct kvm_sev_launch_start start = { 0 }; int i; - vm = vm_create(VM_MODE_DEFAULT, 0); + vm = vm_create(0); sev_ioctl(vm->fd, es ? KVM_SEV_ES_INIT : KVM_SEV_INIT, NULL); for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) vm_vcpu_add(vm, i); @@ -71,7 +71,7 @@ static struct kvm_vm *aux_vm_create(bool with_vcpus) struct kvm_vm *vm; int i; - vm = vm_create(VM_MODE_DEFAULT, 0); + vm = vm_create(0); if (!with_vcpus) return vm; @@ -174,7 +174,7 @@ static void test_sev_migrate_parameters(void) *sev_es_vm_no_vmsa; int ret; - vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0); + vm_no_vcpu = vm_create(0); vm_no_sev = aux_vm_create(true); ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); TEST_ASSERT(ret == -1 && errno == EINVAL, @@ -186,7 +186,7 @@ static void test_sev_migrate_parameters(void) sev_vm = sev_vm_create(/* es= */ false); sev_es_vm = sev_vm_create(/* es= */ true); - sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0); + sev_es_vm_no_vmsa = vm_create(0); sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); -- cgit v1.2.3 From 1d438b3bc25e7ea2e7af95b9c07d88a287346df1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 10:08:45 -0800 Subject: KVM: selftests: Split vcpu_nested_state_set() into two helpers Split vcpu_nested_state_set() into a wrapper that asserts, and an inner helper that does not. Passing a bool is all kinds of awful: it's unintuitive for readers and requires returning an 'int' from a function that, for most users, can never return anything other than "success".
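The resulting calling convention looks like this (a usage sketch condensed from the vmx_set_nested_state test converted below; rv and expected_errno are that test's locals):

	/* Happy path: the wrapper asserts on failure, nothing to check. */
	vcpu_nested_state_set(vm, VCPU_ID, state);

	/* Error path: the inner helper returns the raw ioctl() result. */
	rv = __vcpu_nested_state_set(vm, VCPU_ID, state);
	TEST_ASSERT(rv == -1 && errno == expected_errno,
		    "Expected errno %d, got rv: %i errno: %i",
		    expected_errno, rv, errno);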
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 6 ++++-- tools/testing/selftests/kvm/lib/kvm_util.c | 20 ++++++++++---------- .../selftests/kvm/x86_64/vmx_set_nested_state_test.c | 4 ++-- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index f6984b0c3816..314d971c1f06 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -261,8 +261,10 @@ void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, #ifdef __x86_64__ void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_nested_state *state); -int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state, bool ignore_error); +int __vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state); +void vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state); #endif void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bab4ab297fcc..7b339f98070b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1826,22 +1826,22 @@ void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, ret, errno); } -int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state, bool ignore_error) +int __vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - ret = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state); - if (!ignore_error) { - TEST_ASSERT(ret == 0, - "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", - ret, errno); - } + return ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state); +} - return ret; +void vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state) +{ + int ret = __vcpu_nested_state_set(vm, vcpuid, state); + + TEST_ASSERT(!ret, "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", ret, errno); } #endif diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 5827b9bae468..af3b60eb35ec 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -29,7 +29,7 @@ bool have_evmcs; void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) { - vcpu_nested_state_set(vm, VCPU_ID, state, false); + vcpu_nested_state_set(vm, VCPU_ID, state); } void test_nested_state_expect_errno(struct kvm_vm *vm, @@ -38,7 +38,7 @@ void test_nested_state_expect_errno(struct kvm_vm *vm, { int rv; - rv = vcpu_nested_state_set(vm, VCPU_ID, state, true); + rv = __vcpu_nested_state_set(vm, VCPU_ID, state); TEST_ASSERT(rv == -1 && errno == expected_errno, "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", strerror(expected_errno), expected_errno, rv, strerror(errno), -- cgit v1.2.3 From ffb7c77fd5039a01b2534465a2cdf7514f7fc9df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 11:05:21 -0800 Subject: KVM: selftests: Use vcpu_ioctl() and __vcpu_ioctl() helpers Use the
recently introduced vCPU-specific ioctl() helpers instead of open coding calls to ioctl() just to pretty print the ioctl name. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 142 +++++++--- .../selftests/kvm/include/x86_64/processor.h | 28 +- .../testing/selftests/kvm/kvm_binary_stats_test.c | 6 +- tools/testing/selftests/kvm/lib/kvm_util.c | 286 +-------------------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 112 +------- 5 files changed, 135 insertions(+), 439 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 314d971c1f06..4f18f03c537f 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -105,8 +105,6 @@ int open_kvm_dev_path_or_exit(void); int kvm_check_cap(long cap); int vm_check_cap(struct kvm_vm *vm, long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); -int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_enable_cap *cap); void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); @@ -212,13 +210,112 @@ void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_guest_debug *debug); -void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_mp_state *mp_state); struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); -void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); + +static inline void vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_enable_cap *cap) +{ + vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, cap); +} + +static inline void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_guest_debug *debug) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_GUEST_DEBUG, debug); +} + +static inline void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_MP_STATE, mp_state); +} + +static inline void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_regs *regs) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_REGS, regs); +} + +static inline void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_regs *regs) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_REGS, regs); +} +static inline void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_SREGS, sregs); + +} +static inline void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_SREGS, sregs); +} +static inline int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs) +{ + return __vcpu_ioctl(vm, vcpuid, KVM_SET_SREGS, sregs); +} +static inline void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_fpu *fpu) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); +} +static inline void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_fpu *fpu) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); +} +static inline void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, + struct 
kvm_one_reg *reg) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); +} +static inline void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_one_reg *reg) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); +} +#ifdef __KVM_HAVE_VCPU_EVENTS +static inline void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_VCPU_EVENTS, events); +} +static inline void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_VCPU_EVENTS, events); +} +#endif +#ifdef __x86_64__ +static inline void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_NESTED_STATE, state); +} +static inline int __vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state) +{ + return __vcpu_ioctl(vm, vcpuid, KVM_SET_NESTED_STATE, state); +} + +static inline void vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_NESTED_STATE, state); +} +#endif +static inline int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid) +{ + int fd = __vcpu_ioctl(vm, vcpuid, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_GET_STATS_FD, fd)); + return fd; +} + +void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); /* * VM VCPU Args Set @@ -240,34 +337,6 @@ void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); */ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); -void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs); -void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs); -int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs); -void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_fpu *fpu); -void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_fpu *fpu); -void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); -void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); -#ifdef __KVM_HAVE_VCPU_EVENTS -void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events); -void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events); -#endif -#ifdef __x86_64__ -void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state); -int __vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state); -void vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state); -#endif -void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); - int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd); @@ -406,7 +475,6 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); int vm_get_stats_fd(struct kvm_vm *vm); -int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid); uint32_t guest_get_vcpuid(void); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 1c0419c4786d..583754bf18c3 100644 --- 
a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -430,12 +430,19 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state); struct kvm_msr_list *kvm_get_msr_index_list(void); uint64_t kvm_get_feature_msr(uint64_t msr_index); struct kvm_cpuid2 *kvm_get_supported_cpuid(void); - struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); -int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_cpuid2 *cpuid); -void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_cpuid2 *cpuid); + +static inline int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid) +{ + return __vcpu_ioctl(vm, vcpuid, KVM_SET_CPUID2, cpuid); +} + +static inline void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_CPUID2, cpuid); +} struct kvm_cpuid_entry2 * kvm_get_supported_cpuid_index(uint32_t function, uint32_t index); @@ -449,8 +456,15 @@ kvm_get_supported_cpuid_entry(uint32_t function) uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, uint64_t msr_value); -void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value); + +static inline void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, + uint64_t msr_index, uint64_t msr_value) +{ + int r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value); + + TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); +} + uint32_t kvm_get_cpuid_max_basic(void); uint32_t kvm_get_cpuid_max_extended(void); diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 4b149b383678..bab8b49b52da 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -178,11 +178,7 @@ static void vm_stats_test(struct kvm_vm *vm) static void vcpu_stats_test(struct kvm_vm *vm, int vcpu_id) { - int stats_fd; - - /* Get fd for VCPU stats */ - stats_fd = vcpu_get_stats_fd(vm, vcpu_id); - TEST_ASSERT(stats_fd >= 0, "Get VCPU stats fd"); + int stats_fd = vcpu_get_stats_fd(vm, vcpu_id); stats_test(stats_fd); close(stats_fd); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 7b339f98070b..7ac4516d764c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -135,34 +135,6 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) return ret; } -/* VCPU Enable Capability - * - * Input Args: - * vm - Virtual Machine - * vcpu_id - VCPU - * cap - Capability - * - * Output Args: None - * - * Return: On success, 0. On failure a TEST_ASSERT failure is produced. - * - * Enables a capability (KVM_CAP_*) on the VCPU. 
- */ -int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_enable_cap *cap) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpu_id); - int r; - - TEST_ASSERT(vcpu, "cannot find vcpu %d", vcpu_id); - - r = ioctl(vcpu->fd, KVM_ENABLE_CAP, cap); - TEST_ASSERT(!r, "KVM_ENABLE_CAP vCPU ioctl failed,\n" - " rc: %i, errno: %i", r, errno); - - return r; -} - void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { struct kvm_enable_cap cap = { 0 }; @@ -1619,8 +1591,8 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) { int ret = _vcpu_run(vm, vcpuid); - TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " - "rc: %i errno: %i", ret, errno); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret)); } int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) @@ -1663,43 +1635,6 @@ void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) ret, errno); } -void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_guest_debug *debug) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug); - - TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret); -} - -/* - * VM VCPU Set MP State - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * mp_state - mp_state to be set - * - * Output Args: None - * - * Return: None - * - * Sets the MP state of the VCPU given by vcpuid, to the state given - * by mp_state. - */ -void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_mp_state *mp_state) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state); - TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, " - "rc: %i errno: %i", ret, errno); -} - /* * VM VCPU Get Reg List * @@ -1729,216 +1664,6 @@ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) return reg_list; } -/* - * VM VCPU Regs Get - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: - * regs - current state of VCPU regs - * - * Return: None - * - * Obtains the current register state for the VCPU specified by vcpuid - * and stores it at the location given by regs. - */ -void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_GET_REGS, regs); - TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i", - ret, errno); -} - -/* - * VM VCPU Regs Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * regs - Values to set VCPU regs to - * - * Output Args: None - * - * Return: None - * - * Sets the regs of the VCPU specified by vcpuid to the values - * given by regs. 
- */ -void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_REGS, regs); - TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i", - ret, errno); -} - -#ifdef __KVM_HAVE_VCPU_EVENTS -void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events); - TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i", - ret, errno); -} - -void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); - TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", - ret, errno); -} -#endif - -#ifdef __x86_64__ -void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, state); - TEST_ASSERT(ret == 0, - "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", - ret, errno); -} - -int __vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - return ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state); -} - -void vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state) -{ - int ret = __vcpu_nested_state_set(vm, vcpuid, state); - - TEST_ASSERT(!ret, "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", ret, errno); -} -#endif - -/* - * VM VCPU System Regs Get - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: - * sregs - current state of VCPU system regs - * - * Return: None - * - * Obtains the current system register state for the VCPU specified by - * vcpuid and stores it at the location given by sregs. - */ -void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs); - TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i", - ret, errno); -} - -/* - * VM VCPU System Regs Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * sregs - Values to set VCPU system regs to - * - * Output Args: None - * - * Return: None - * - * Sets the system regs of the VCPU specified by vcpuid to the values - * given by sregs. 
- */ -void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) -{ - int ret = _vcpu_sregs_set(vm, vcpuid, sregs); - TEST_ASSERT(ret == 0, "KVM_SET_SREGS IOCTL failed, " - "rc: %i errno: %i", ret, errno); -} - -int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - return ioctl(vcpu->fd, KVM_SET_SREGS, sregs); -} - -void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) -{ - int ret; - - ret = __vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); - TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); -} - -void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) -{ - int ret; - - ret = __vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); - TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); -} - -void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) -{ - int ret; - - ret = __vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); - TEST_ASSERT(ret == 0, "KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); -} - -void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) -{ - int ret; - - ret = __vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); - TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); -} - int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, void *arg) { @@ -2534,10 +2259,3 @@ int vm_get_stats_fd(struct kvm_vm *vm) { return ioctl(vm->fd, KVM_GET_STATS_FD, NULL); } - -int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - return ioctl(vcpu->fd, KVM_GET_STATS_FD, NULL); -} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index ead7011ee8f6..af2bf114249d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -802,18 +802,15 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) */ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct kvm_cpuid2 *cpuid; int max_ent; int rc = -1; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - cpuid = allocate_kvm_cpuid2(); max_ent = cpuid->nent; for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) { - rc = ioctl(vcpu->fd, KVM_GET_CPUID2, cpuid); + rc = __vcpu_ioctl(vm, vcpuid, KVM_GET_CPUID2, cpuid); if (!rc) break; @@ -822,9 +819,7 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid) rc, errno); } - TEST_ASSERT(rc == 0, "KVM_GET_CPUID2 failed, rc: %i errno: %i", - rc, errno); - + TEST_ASSERT(!rc, KVM_IOCTL_ERROR(KVM_GET_CPUID2, rc)); return cpuid; } @@ -862,132 +857,37 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) return entry; } - -int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_cpuid2 *cpuid) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); -} - -/* - * VM VCPU CPUID Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU id - * cpuid - The CPUID values to set. - * - * Output Args: None - * - * Return: void - * - * Set the VCPU's CPUID. 
- */ -void vcpu_set_cpuid(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_cpuid2 *cpuid) -{ - int rc; - - rc = __vcpu_set_cpuid(vm, vcpuid, cpuid); - TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", - rc, errno); - -} - -/* - * VCPU Get MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * - * Output Args: None - * - * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. - * - * Get value of MSR for VCPU. - */ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct { struct kvm_msrs header; struct kvm_msr_entry entry; } buffer = {}; int r; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); + + r = __vcpu_ioctl(vm, vcpuid, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r)); return buffer.entry.data; } -/* - * _VCPU Set MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * msr_value - New value of MSR - * - * Output Args: None - * - * Return: The result of KVM_SET_MSRS. - * - * Sets the value of an MSR for the given VCPU. - */ int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, uint64_t msr_value) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct { struct kvm_msrs header; struct kvm_msr_entry entry; } buffer = {}; - int r; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); memset(&buffer, 0, sizeof(buffer)); buffer.header.nmsrs = 1; buffer.entry.index = msr_index; buffer.entry.data = msr_value; - r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); - return r; -} -/* - * VCPU Set MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * msr_value - New value of MSR - * - * Output Args: None - * - * Return: On success, nothing. On failure a TEST_ASSERT is produced. - * - * Set value of MSR for VCPU. - */ -void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value) -{ - int r; - - r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value); - TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); + return __vcpu_ioctl(vm, vcpuid, KVM_SET_MSRS, &buffer.header); } void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) -- cgit v1.2.3 From 38d4a385a345d807652f748822630f309786b658 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 11:49:42 -0800 Subject: KVM: selftests: Add __vcpu_run() helper Add __vcpu_run() so that tests that want to avoid asserts on KVM_RUN failures don't need to open code the ioctl() call. 
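As a sketch of the intended call sites, converted tests that must tolerate KVM_RUN failure (e.g. the EINTR retry in the dirty log test below) now read:

	do {
		rc = __vcpu_run(vm, vcpuid);	/* raw result, no assert */
	} while (rc == -1 && errno == EINTR);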
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 6 ++---- tools/testing/selftests/kvm/dirty_log_test.c | 6 ++---- tools/testing/selftests/kvm/include/kvm_util_base.h | 6 ++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 6 ++---- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 4b7e7ba43969..42c9209f7786 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -55,10 +55,8 @@ static void guest_code(void) static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) { ucall_init(vm, NULL); - int ret = __vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); - if (ret) - return -errno; - return 0; + + return __vcpu_run(vm, vcpuid) ? -errno : 0; } static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, uint32_t nr_vcpus) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 01c01d40201f..5752486764c9 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -509,7 +509,7 @@ static void generate_random_array(uint64_t *guest_array, uint64_t size) static void *vcpu_worker(void *data) { - int ret, vcpu_fd; + int ret; struct kvm_vm *vm = data; uint64_t *guest_array; uint64_t pages_count = 0; @@ -517,8 +517,6 @@ static void *vcpu_worker(void *data) + sizeof(sigset_t)); sigset_t *sigset = (sigset_t *) &sigmask->sigset; - vcpu_fd = vcpu_get_fd(vm, VCPU_ID); - /* * SIG_IPI is unblocked atomically while in KVM_RUN. It causes the * ioctl to return with -EINTR, but it is still pending and we need @@ -539,7 +537,7 @@ static void *vcpu_worker(void *data) generate_random_array(guest_array, TEST_PAGES_PER_LOOP); pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the random pages */ - ret = ioctl(vcpu_fd, KVM_RUN, NULL); + ret = __vcpu_run(vm, VCPU_ID); if (ret == -1 && errno == EINTR) { int sig = -1; sigwait(sigset, &sig); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 4f18f03c537f..6b7a5297053e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -208,6 +208,12 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); + +static inline int __vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +{ + return __vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); +} + int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 7ac4516d764c..45895c9ca35a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1597,12 +1597,10 @@ void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int rc; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); do { - rc = ioctl(vcpu->fd, KVM_RUN, NULL); + rc = __vcpu_run(vm, vcpuid); } while (rc == -1 && errno == EINTR); 
assert_on_unhandled_exception(vm, vcpuid); @@ -1627,7 +1625,7 @@ void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); vcpu->state->immediate_exit = 1; - ret = ioctl(vcpu->fd, KVM_RUN, NULL); + ret = __vcpu_run(vm, vcpuid); vcpu->state->immediate_exit = 0; TEST_ASSERT(ret == -1 && errno == EINTR, -- cgit v1.2.3 From caf12f3b1d629c06634b059acc7b18d05bb5fb94 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 11:23:34 -0800 Subject: KVM: selftests: Use vcpu_access_device_attr() in arm64 code Use vcpu_access_device_attr() in arm's arch_timer test instead of manually retrieving the vCPU's fd. This will allow dropping vcpu_get_fd() in a future patch. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 3b940a101bc0..f55c4c20d8b3 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -349,12 +349,10 @@ static void test_run(struct kvm_vm *vm) static void test_init_timer_irq(struct kvm_vm *vm) { /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ - int vcpu0_fd = vcpu_get_fd(vm, 0); - - kvm_device_access(vcpu0_fd, KVM_ARM_VCPU_TIMER_CTRL, - KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq, false); - kvm_device_access(vcpu0_fd, KVM_ARM_VCPU_TIMER_CTRL, - KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq, false); + vcpu_access_device_attr(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq, false); + vcpu_access_device_attr(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq, false); sync_global_to_guest(vm, ptimer_irq); sync_global_to_guest(vm, vtimer_irq); -- cgit v1.2.3 From 21c6ee2b3ac25b36cba22a2c725382ff7706dc95 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 11:24:01 -0800 Subject: KVM: selftests: Remove vcpu_get_fd() Drop vcpu_get_fd(); it no longer has any users, and it really should not exist: the framework has failed if tests need to manually operate on a vCPU fd.
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 - tools/testing/selftests/kvm/lib/kvm_util.c | 9 --------- 2 files changed, 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 6b7a5297053e..c2dfc4341b31 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -214,7 +214,6 @@ static inline int __vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) return __vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); } -int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 45895c9ca35a..73123b9d9625 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1608,15 +1608,6 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) return rc; } -int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - return vcpu->fd; -} - void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); -- cgit v1.2.3 From 47a7c924b62de4b9d1752c1f5bc111c847229799 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 11:31:16 -0800 Subject: KVM: selftests: Add vcpu_get() to retrieve and assert on vCPU existence Add vcpu_get() to wrap vcpu_find() and deduplicate a pile of code that asserts the requested vCPU exists. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 56 +++++++--------------- .../testing/selftests/kvm/lib/kvm_util_internal.h | 2 +- tools/testing/selftests/kvm/lib/s390x/processor.c | 5 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 4 +- 4 files changed, 20 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 73123b9d9625..940decfaa633 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -561,23 +561,7 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, return ®ion->region; } -/* - * VCPU Find - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: - * Pointer to VCPU structure - * - * Locates a vcpu structure that describes the VCPU specified by vcpuid and - * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU - * for the specified vcpuid. 
- */ -struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) +static struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu; @@ -589,6 +573,14 @@ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) return NULL; } +struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + TEST_ASSERT(vcpu, "vCPU %d does not exist", vcpuid); + return vcpu; +} + /* * VM VCPU Remove * @@ -1568,8 +1560,7 @@ void vm_create_irqchip(struct kvm_vm *vm) */ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); return vcpu->state; } @@ -1610,11 +1601,9 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - vcpu->state->immediate_exit = 1; ret = __vcpu_run(vm, vcpuid); vcpu->state->immediate_exit = 0; @@ -1656,14 +1645,9 @@ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, void *arg) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); - ret = ioctl(vcpu->fd, cmd, arg); - - return ret; + return ioctl(vcpu->fd, cmd, arg); } void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, @@ -1676,15 +1660,11 @@ void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu; + struct vcpu *vcpu = vcpu_get(vm, vcpuid); uint32_t size = vm->dirty_ring_size; TEST_ASSERT(size > 0, "Should enable dirty ring first"); - vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu, "Cannot find vcpu %u", vcpuid); - if (!vcpu->dirty_gfns) { void *addr; @@ -1840,9 +1820,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu, "nonexistent vcpu id: %d", vcpuid); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); return _kvm_device_check_attr(vcpu->fd, group, attr); } @@ -1859,9 +1837,7 @@ int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr, void *val, bool write) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu, "nonexistent vcpu id: %d", vcpuid); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); return _kvm_device_access(vcpu->fd, group, attr, val, write); } diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index a03febc24ba6..0c7c44499129 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -69,7 +69,7 @@ struct kvm_vm { uint32_t dirty_ring_size; }; -struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); +struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); /* * Virtual Translation Tables Dump diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 
f87c7137598e..7cc1051c4b71 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -208,10 +208,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - if (!vcpu) - return; + struct vcpu *vcpu = vcpu_get(vm, vcpuid); fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index af2bf114249d..723cb49d1994 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -995,7 +995,7 @@ static int vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu, struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); struct kvm_msr_list *list; struct kvm_x86_state *state; int nmsrs, r, i; @@ -1078,7 +1078,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); int r; r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); -- cgit v1.2.3 From 71ab5a6fea49875290e6ca035ff1d3e3ef3813f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 12:06:02 -0800 Subject: KVM: selftests: Make vm_ioctl() a wrapper to pretty print ioctl name Make vm_ioctl() a macro wrapper and print the _name_ of the ioctl on failure instead of the number. Deliberately do not use __stringify(), as that will expand the ioctl all the way down to its numerical sequence. Again the intent is to print the name of the macro. 
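A minimal sketch of the distinction, using KVM_CREATE_IRQCHIP as the example ioctl:

	#define vm_ioctl(vm, cmd, arg)	_vm_ioctl(vm, cmd, #cmd, arg)

	/*
	 * #cmd stringifies the argument as written, so a failed call reports
	 * "KVM_CREATE_IRQCHIP".  __stringify(cmd) would instead expand the
	 * macro, e.g. through _IO(), down to its numerical encoding before
	 * stringifying it, which is useless in an assert message.
	 */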
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 38 ++++++++++++---------- tools/testing/selftests/kvm/lib/kvm_util.c | 26 ++++----------- 2 files changed, 27 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c2dfc4341b31..39e1971e5d65 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -105,6 +105,27 @@ int open_kvm_dev_path_or_exit(void); int kvm_check_cap(long cap); int vm_check_cap(struct kvm_vm *vm, long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) +#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) + +int __kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); +void kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); + +int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); +void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg); +#define vm_ioctl(vm, cmd, arg) _vm_ioctl(vm, cmd, #cmd, arg) + +int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, + void *arg); +void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, + const char *name, void *arg); +#define vcpu_ioctl(vm, vcpuid, cmd, arg) \ + _vcpu_ioctl(vm, vcpuid, cmd, #cmd, arg) + void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); @@ -156,23 +177,6 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, uint64_t guest_paddr, uint32_t slot, uint64_t npages, uint32_t flags); -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) -#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) - -void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, - const char *name, void *arg); -int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, - void *arg); -#define vcpu_ioctl(vm, vcpuid, ioctl, arg) \ - _vcpu_ioctl(vm, vcpuid, ioctl, #ioctl, arg) - -void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); -int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); -void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); -int __kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 940decfaa633..7eedd9ff20fa 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1690,30 +1690,16 @@ void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid) return vcpu->dirty_gfns; } -/* - * VM Ioctl - * - * Input Args: - * vm - Virtual Machine - * cmd - Ioctl number - * arg - Argument to pass to the ioctl - * - * Return: None - * - * Issues an arbitrary ioctl on a VM fd. 
- */ -void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) { - int ret; - - ret = __vm_ioctl(vm, cmd, arg); - TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", - cmd, ret, errno, strerror(errno)); + return ioctl(vm->fd, cmd, arg); } -int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg) { - return ioctl(vm->fd, cmd, arg); + int ret = __vm_ioctl(vm, cmd, arg); + + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); } /* -- cgit v1.2.3 From 10825b55b9d5a402294095a3d5fc9d0ea39cc282 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 12:09:56 -0800 Subject: KVM: selftests: Use vm_ioctl() and __vm_ioctl() helpers Use the recently introduced VM-specific ioctl() helpers instead of open coding calls to ioctl() just to pretty print the ioctl name. Keep a few open coded assertions that provide additional info. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 60 +++++++++-- .../testing/selftests/kvm/kvm_binary_stats_test.c | 6 +- tools/testing/selftests/kvm/lib/aarch64/vgic.c | 6 +- tools/testing/selftests/kvm/lib/kvm_util.c | 116 +++------------------ .../testing/selftests/kvm/set_memory_region_test.c | 3 +- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 2 +- 6 files changed, 67 insertions(+), 126 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 39e1971e5d65..1ccb91103e74 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -103,8 +103,6 @@ extern const struct vm_guest_mode_params vm_guest_mode_params[]; int open_path_or_exit(const char *path, int flags); int open_kvm_dev_path_or_exit(void); int kvm_check_cap(long cap); -int vm_check_cap(struct kvm_vm *vm, long cap); -int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); #define __KVM_SYSCALL_ERROR(_name, _ret) \ "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) @@ -126,6 +124,23 @@ void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, #define vcpu_ioctl(vm, vcpuid, cmd, arg) \ _vcpu_ioctl(vm, vcpuid, cmd, #cmd, arg) +/* + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap.
+ */ +static inline int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); + + TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); + return ret; +} + +static inline void vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +{ + vm_ioctl(vm, KVM_ENABLE_CAP, cap); +} + void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); @@ -134,19 +149,46 @@ struct kvm_vm *vm_create(uint64_t phy_pages); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); -void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); -void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages); -uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm); - int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); - void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); int kvm_memfd_alloc(size_t size, bool hugepages); void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); +static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + + vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); +} + +static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages) +{ + struct kvm_clear_dirty_log args = { + .dirty_bitmap = log, + .slot = slot, + .first_page = first_page, + .num_pages = num_pages + }; + + vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); +} + +static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) +{ + return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); +} + +static inline int vm_get_stats_fd(struct kvm_vm *vm) +{ + int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_GET_STATS_FD, fd)); + return fd; +} + /* * VM VCPU Dump * @@ -483,8 +525,6 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); -int vm_get_stats_fd(struct kvm_vm *vm); - uint32_t guest_get_vcpuid(void); #endif /* SELFTEST_KVM_UTIL_BASE_H */ diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index bab8b49b52da..0a27b0f85009 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -165,11 +165,7 @@ static void stats_test(int stats_fd) static void vm_stats_test(struct kvm_vm *vm) { - int stats_fd; - - /* Get fd for VM stats */ - stats_fd = vm_get_stats_fd(vm); - TEST_ASSERT(stats_fd >= 0, "Get VM stats fd"); + int stats_fd = vm_get_stats_fd(vm); stats_test(stats_fd); close(stats_fd); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 5d45046c1b80..25d1ec65621d 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -104,8 +104,7 @@ void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) { int ret = _kvm_irq_set_level_info(gic_fd, intid, level); - TEST_ASSERT(ret == 0, "KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO failed, " - "rc: %i errno: %i", ret, errno); + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, ret)); } int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) @@ -127,8 +126,7 @@ void 
kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) { int ret = _kvm_arm_irq_line(vm, intid, level); - TEST_ASSERT(ret == 0, "KVM_IRQ_LINE failed, rc: %i errno: %i", - ret, errno); + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); } static void vgic_poke_irq(int gic_fd, uint32_t intid, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 7eedd9ff20fa..339d524a0399 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -85,56 +85,6 @@ int kvm_check_cap(long cap) return ret; } -/* VM Check Capability - * - * Input Args: - * vm - Virtual Machine - * cap - Capability - * - * Output Args: None - * - * Return: - * On success, the Value corresponding to the capability (KVM_CAP_*) - * specified by the value of cap. On failure a TEST_ASSERT failure - * is produced. - * - * Looks up and returns the value corresponding to the capability - * (KVM_CAP_*) given by cap. - */ -int vm_check_cap(struct kvm_vm *vm, long cap) -{ - int ret; - - ret = ioctl(vm->fd, KVM_CHECK_EXTENSION, cap); - TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION VM IOCTL failed,\n" - " rc: %i errno: %i", ret, errno); - - return ret; -} - -/* VM Enable Capability - * - * Input Args: - * vm - Virtual Machine - * cap - Capability - * - * Output Args: None - * - * Return: On success, 0. On failure a TEST_ASSERT failure is produced. - * - * Enables a capability (KVM_CAP_*) on the VM. - */ -int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) -{ - int ret; - - ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); - TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" - " rc: %i errno: %i", ret, errno); - - return ret; -} - void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { struct kvm_enable_cap cap = { 0 }; @@ -460,36 +410,6 @@ void kvm_vm_restart(struct kvm_vm *vmp) } } -void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) -{ - struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; - int ret; - - ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args); - TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s", - __func__, strerror(-ret)); -} - -void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages) -{ - struct kvm_clear_dirty_log args = { - .dirty_bitmap = log, .slot = slot, - .first_page = first_page, - .num_pages = num_pages - }; - int ret; - - ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); - TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s", - __func__, strerror(-ret)); -} - -uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) -{ - return ioctl(vm->fd, KVM_RESET_DIRTY_RINGS); -} - /* * Userspace Memory Region Find * @@ -645,9 +565,7 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, } region->region.memory_size = 0; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " "rc: %i errno: %i", ret, errno); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region); sparsebit_free(&region->unused_phy_pages); ret = munmap(region->mmap_start, region->mmap_size); @@ -993,7 +911,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.guest_phys_addr = guest_paddr; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" @@ -1076,7 +994,7 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) region->region.flags = flags; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i slot: %u flags: 0x%x", @@ -1106,7 +1024,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) region->region.guest_phys_addr = new_gpa; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region); TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n" "ret: %i errno: %i slot: %u new_gpa: 0x%lx", @@ -1190,10 +1108,10 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) /* Allocate and initialize new vcpu structure. */ vcpu = calloc(1, sizeof(*vcpu)); TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); + vcpu->id = vcpuid; - vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid); - TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i", - vcpu->fd, errno); + vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpuid); + TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd)); TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size " "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", @@ -1534,11 +1452,7 @@ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) */ void vm_create_irqchip(struct kvm_vm *vm) { - int ret; - - ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0); - TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, " "rc: %i errno: %i", ret, errno); + vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); vm->has_irqchip = true; } @@ -1759,7 +1673,7 @@ int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd) create_dev.type = type; create_dev.fd = -1; create_dev.flags = test ? 
KVM_CREATE_DEVICE_TEST : 0; - ret = ioctl(vm_get_fd(vm), KVM_CREATE_DEVICE, &create_dev); + ret = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); *fd = create_dev.fd; return ret; } @@ -1855,7 +1769,7 @@ void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) { int ret = _kvm_irq_line(vm, irq, level); - TEST_ASSERT(ret >= 0, "KVM_IRQ_LINE failed, rc: %i errno: %i", ret, errno); + TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); } struct kvm_irq_routing *kvm_gsi_routing_create(void) @@ -1894,7 +1808,7 @@ int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) int ret; assert(routing); - ret = ioctl(vm_get_fd(vm), KVM_SET_GSI_ROUTING, routing); + ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing); free(routing); return ret; @@ -1905,8 +1819,7 @@ void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) int ret; ret = _kvm_gsi_routing_write(vm, routing); - TEST_ASSERT(ret == 0, "KVM_SET_GSI_ROUTING failed, rc: %i errno: %i", - ret, errno); + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret)); } /* @@ -2205,8 +2118,3 @@ unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); return vm_adjust_num_guest_pages(mode, n); } - -int vm_get_stats_fd(struct kvm_vm *vm) -{ - return ioctl(vm->fd, KVM_GET_STATS_FD, NULL); -} diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 89b13f23c3ac..e66deb8ba7e0 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -317,8 +317,7 @@ static void test_zero_memory_regions(void) vm = vm_create(0); vm_vcpu_add(vm, VCPU_ID); - TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64), - "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno); + vm_ioctl(vm, KVM_SET_NR_MMU_PAGES, (void *)64ul); vcpu_run(vm, VCPU_ID); run = vcpu_state(vm, VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 93d77574b255..84cd5e717fcf 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -370,7 +370,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) cap.cap = KVM_CAP_PMU_CAPABILITY; cap.args[0] = KVM_PMU_CAP_DISABLE; - TEST_ASSERT(!vm_enable_cap(vm, &cap), "Failed to set KVM_PMU_CAP_DISABLE."); + vm_enable_cap(vm, &cap); vm_vcpu_add_default(vm, VCPU_ID, guest_code); vm_init_descriptor_tables(vm); -- cgit v1.2.3 From 2de1b7b127da0e87d278af172fa37ebfe04c8675 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Feb 2022 10:21:36 -0800 Subject: KVM: selftests: Make kvm_ioctl() a wrapper to pretty print ioctl name Make kvm_ioctl() a macro wrapper and print the _name_ of the ioctl on failure instead of the number. Deliberately do not use __stringify(), as that will expand the ioctl all the way down to its numerical sequence; the intent, again, is to print the name of the macro. 
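To illustrate the distinction, here is a minimal standalone sketch; KVM_FOO and name_of() are made-up stand-ins for demonstration, not helpers from this series:

        #include <stdio.h>
        #include <linux/ioctl.h>        /* defines _IO() and friends */

        #define __stringify_1(x)        #x
        #define __stringify(x)          __stringify_1(x)

        /* KVM_FOO stands in for a real KVM ioctl definition. */
        #define KVM_FOO _IO(0xAE, 0x01)

        /* "#cmd" stringifies the argument as written, i.e. the macro's name. */
        #define name_of(cmd)    #cmd

        int main(void)
        {
                printf("%s\n", name_of(KVM_FOO));       /* prints "KVM_FOO" */
                printf("%s\n", __stringify(KVM_FOO));   /* prints the fully expanded _IOC() arithmetic */
                return 0;
        }
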
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 16 +++++++++-- tools/testing/selftests/kvm/lib/kvm_util.c | 31 ++++------------------ tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 2 +- 3 files changed, 20 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 1ccb91103e74..f5bfdf0b4548 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -110,8 +110,19 @@ int kvm_check_cap(long cap); #define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) #define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) -int __kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); -void kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); +#define __kvm_ioctl(kvm_fd, cmd, arg) \ + ioctl(kvm_fd, cmd, arg) + +static inline void _kvm_ioctl(int kvm_fd, unsigned long cmd, const char *name, + void *arg) +{ + int ret = __kvm_ioctl(kvm_fd, cmd, arg); + + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); +} + +#define kvm_ioctl(kvm_fd, cmd, arg) \ + _kvm_ioctl(kvm_fd, cmd, #cmd, arg) int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg); @@ -492,6 +503,7 @@ unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); unsigned long vm_compute_max_gfn(struct kvm_vm *vm); uint64_t vm_get_max_gfn(struct kvm_vm *vm); +int vm_get_kvm_fd(struct kvm_vm *vm); int vm_get_fd(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 339d524a0399..ac8faf072288 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1616,32 +1616,6 @@ void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); } -/* - * KVM system ioctl - * - * Input Args: - * vm - Virtual Machine - * cmd - Ioctl number - * arg - Argument to pass to the ioctl - * - * Return: None - * - * Issues an arbitrary ioctl on a KVM fd. 
- */ -void kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) -{ - int ret; - - ret = ioctl(vm->kvm_fd, cmd, arg); - TEST_ASSERT(ret == 0, "KVM ioctl %lu failed, rc: %i errno: %i (%s)", - cmd, ret, errno, strerror(errno)); -} - -int __kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) -{ - return ioctl(vm->kvm_fd, cmd, arg); -} - /* * Device Ioctl */ @@ -2074,6 +2048,11 @@ uint64_t vm_get_max_gfn(struct kvm_vm *vm) return vm->max_gfn; } +int vm_get_kvm_fd(struct kvm_vm *vm) +{ + return vm->kvm_fd; +} + int vm_get_fd(struct kvm_vm *vm) { return vm->fd; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 7e45a3df8f98..896e1e7c1df7 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -123,7 +123,7 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system) if (!system) ret = __vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); else - ret = __kvm_ioctl(vm, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + ret = __kvm_ioctl(vm_get_kvm_fd(vm), KVM_GET_SUPPORTED_HV_CPUID, &cpuid); TEST_ASSERT(ret == -1 && errno == E2BIG, "%s KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" -- cgit v1.2.3 From f9725f89dc5027c7d94f8fdfbac8bbad846a0891 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 12:39:12 -0800 Subject: KVM: selftests: Use kvm_ioctl() helpers Use the recently introduced KVM-specific ioctl() helpers instead of open coding calls to ioctl() just to pretty print the ioctl name. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/lib/aarch64/processor.c | 4 +-- tools/testing/selftests/kvm/lib/guest_modes.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 13 ++++----- tools/testing/selftests/kvm/lib/x86_64/processor.c | 34 +++++++--------------- .../selftests/kvm/x86_64/get_msr_index_features.c | 16 +++------- .../selftests/kvm/x86_64/mmio_warning_test.c | 6 ++-- 6 files changed, 26 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 6a041289fa80..d28cc12cea1d 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -469,8 +469,8 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, }; kvm_fd = open_kvm_dev_path_or_exit(); - vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, ipa); - TEST_ASSERT(vm_fd >= 0, "Can't create VM"); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, ipa); + TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); TEST_ASSERT(vcpu_fd >= 0, "Can't create vcpu"); diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c index 8784013b747c..9ab27b4169bf 100644 --- a/tools/testing/selftests/kvm/lib/guest_modes.c +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -65,7 +65,7 @@ void guest_modes_append_default(void) struct kvm_s390_vm_cpu_processor info; kvm_fd = open_kvm_dev_path_or_exit(); - vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, 0); kvm_device_access(vm_fd, KVM_S390_VM_CPU_MODEL, KVM_S390_VM_CPU_PROCESSOR, &info, false); close(vm_fd); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index ac8faf072288..4d2748e8428a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ 
b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -76,9 +76,8 @@ int kvm_check_cap(long cap) int kvm_fd; kvm_fd = open_kvm_dev_path_or_exit(); - ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); - TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n" - " rc: %i errno: %i", ret, errno); + ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); + TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); close(kvm_fd); @@ -104,9 +103,8 @@ static void vm_open(struct kvm_vm *vm) exit(KSFT_SKIP); } - vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); - TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " - "rc: %i errno: %i", vm->fd, errno); + vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); + TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); } const char *vm_guest_mode_string(uint32_t i) @@ -1070,8 +1068,7 @@ static int vcpu_mmap_sz(void) ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); TEST_ASSERT(ret >= sizeof(struct kvm_run), - "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i", - __func__, ret, errno); + KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret)); close(dev_fd); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 723cb49d1994..852274d64483 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -638,7 +638,7 @@ void vm_xsave_req_perm(int bit) }; kvm_fd = open_kvm_dev_path_or_exit(); - rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); close(kvm_fd); if (rc == -1 && (errno == ENXIO || errno == EINVAL)) exit(KSFT_SKIP); @@ -738,7 +738,6 @@ static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) struct kvm_cpuid2 *kvm_get_supported_cpuid(void) { static struct kvm_cpuid2 *cpuid; - int ret; int kvm_fd; if (cpuid) @@ -747,9 +746,7 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) cpuid = allocate_kvm_cpuid2(); kvm_fd = open_kvm_dev_path_or_exit(); - ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); - TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", - ret, errno); + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); close(kvm_fd); return cpuid; @@ -779,9 +776,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) buffer.entry.index = msr_index; kvm_fd = open_kvm_dev_path_or_exit(); - r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); + r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r)); close(kvm_fd); return buffer.entry.data; @@ -946,9 +942,9 @@ static int kvm_get_num_msrs_fd(int kvm_fd) int r; nmsrs.nmsrs = 0; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); - TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", - r); + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + TEST_ASSERT(r == -1 && errno == E2BIG, + "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", r); return nmsrs.nmsrs; } @@ -961,19 +957,16 @@ static int kvm_get_num_msrs(struct kvm_vm *vm) struct kvm_msr_list *kvm_get_msr_index_list(void) { struct kvm_msr_list *list; - int nmsrs, r, kvm_fd; + int nmsrs, kvm_fd; kvm_fd = open_kvm_dev_path_or_exit(); nmsrs = kvm_get_num_msrs_fd(kvm_fd); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list->nmsrs = nmsrs; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, 
list); close(kvm_fd); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); - return list; } @@ -1019,9 +1012,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) nmsrs = kvm_get_num_msrs(vm); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list->nmsrs = nmsrs; - r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); + kvm_ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); @@ -1329,7 +1320,6 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) { static struct kvm_cpuid2 *cpuid; - int ret; int kvm_fd; if (cpuid) @@ -1338,9 +1328,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) cpuid = allocate_kvm_cpuid2(); kvm_fd = open_kvm_dev_path_or_exit(); - ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n", - ret, errno); + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); close(kvm_fd); return cpuid; diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c index 8aed0db1331d..4ef60adbe108 100644 --- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -34,7 +34,7 @@ static int kvm_num_index_msrs(int kvm_fd, int nmsrs) static void test_get_msr_index(void) { - int old_res, res, kvm_fd, r; + int old_res, res, kvm_fd; struct kvm_msr_list *list; kvm_fd = open_kvm_dev_path_or_exit(); @@ -50,11 +50,8 @@ static void test_get_msr_index(void) list = malloc(sizeof(*list) + old_res * sizeof(list->indices[0])); list->nmsrs = old_res; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - TEST_ASSERT(r == 0, - "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i", - r); TEST_ASSERT(list->nmsrs == old_res, "Expecting nmsrs to be identical"); free(list); @@ -68,7 +65,7 @@ static int kvm_num_feature_msrs(int kvm_fd, int nmsrs) list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list->nmsrs = nmsrs; - r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST probe, r: %i", r); @@ -81,15 +78,10 @@ static int kvm_num_feature_msrs(int kvm_fd, int nmsrs) struct kvm_msr_list *kvm_get_msr_feature_list(int kvm_fd, int nmsrs) { struct kvm_msr_list *list; - int r; list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list->nmsrs = nmsrs; - r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); - - TEST_ASSERT(r == 0, - "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i", - r); + kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); return list; } diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index 9f55ccd169a1..31ae837fedb1 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -59,10 +59,10 @@ void test(void) kvm = open("/dev/kvm", O_RDWR); TEST_ASSERT(kvm != -1, "failed to open /dev/kvm"); - kvmvm = ioctl(kvm, KVM_CREATE_VM, 0); 
- TEST_ASSERT(kvmvm != -1, "KVM_CREATE_VM failed"); + kvmvm = __kvm_ioctl(kvm, KVM_CREATE_VM, 0); + TEST_ASSERT(kvmvm > 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, kvmvm)); kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0); - TEST_ASSERT(kvmcpu != -1, "KVM_CREATE_VCPU failed"); + TEST_ASSERT(kvmcpu != -1, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, kvmcpu)); run = (struct kvm_run *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, kvmcpu, 0); tc.kvmcpu = kvmcpu; -- cgit v1.2.3 From f17cf5674a1e70c142cb3acb5eb7667560e62012 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 4 May 2022 09:44:49 -0700 Subject: KVM: selftests: Use __KVM_SYSCALL_ERROR() to handle non-KVM syscall errors Use __KVM_SYSCALL_ERROR() to report and pretty print non-KVM syscall and ioctl errors, e.g. for mmap(), munmap(), uffd ioctls, etc... Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 5 ++-- tools/testing/selftests/kvm/demand_paging_test.c | 12 ++++----- tools/testing/selftests/kvm/lib/kvm_util.c | 34 ++++++++++-------------- 3 files changed, 22 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 554ca649d470..87e41895b385 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -630,8 +630,7 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { fd[f] = eventfd(0, 0); - TEST_ASSERT(fd[f] != -1, - "eventfd failed, errno: %i\n", errno); + TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f])); } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { @@ -647,7 +646,7 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, val = 1; ret = write(fd[f], &val, sizeof(uint64_t)); TEST_ASSERT(ret == sizeof(uint64_t), - "Write to KVM_IRQFD failed with ret: %d\n", ret); + __KVM_SYSCALL_ERROR("write()", ret)); } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 6a719d065599..d8db0a37e973 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -223,6 +223,7 @@ static void setup_demand_paging(struct kvm_vm *vm, struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; + int ret; PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", is_minor ? "MINOR" : "MISSING", @@ -242,19 +243,18 @@ static void setup_demand_paging(struct kvm_vm *vm, } uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); + TEST_ASSERT(uffd >= 0, __KVM_SYSCALL_ERROR("userfaultfd()", uffd)); uffdio_api.api = UFFD_API; uffdio_api.features = 0; - TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, - "ioctl UFFDIO_API failed: %" PRIu64, - (uint64_t)uffdio_api.api); + ret = ioctl(uffd, UFFDIO_API, &uffdio_api); + TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_API", ret)); uffdio_register.range.start = (uint64_t)hva; uffdio_register.range.len = len; uffdio_register.mode = uffd_mode; - TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, - "ioctl UFFDIO_REGISTER failed"); + ret = ioctl(uffd, UFFDIO_REGISTER, &uffdio_register); + TEST_ASSERT(ret != -1, __KVM_SYSCALL_ERROR("UFFDIO_REGISTER", ret)); TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == expected_ioctls, "missing userfaultfd ioctls"); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 4d2748e8428a..c7df8ba04ec5 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -517,17 +517,15 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu) if (vcpu->dirty_gfns) { ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size); - TEST_ASSERT(ret == 0, "munmap of VCPU dirty ring failed, " "rc: %i errno: %i", ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); vcpu->dirty_gfns = NULL; } ret = munmap(vcpu->state, vcpu_mmap_sz()); - TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " "errno: %i", ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + ret = close(vcpu->fd); - TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " "errno: %i", ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); list_del(&vcpu->list); free(vcpu); @@ -542,12 +540,10 @@ void kvm_vm_release(struct kvm_vm *vmp) vm_vcpu_rm(vmp, vcpu); ret = close(vmp->fd); - TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); ret = close(vmp->kvm_fd); - TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); } static void __vm_mem_region_delete(struct kvm_vm *vm, @@ -567,7 +563,7 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, sparsebit_free(&region->unused_phy_pages); ret = munmap(region->mmap_start, region->mmap_size); - TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); free(region); } @@ -607,14 +603,13 @@ int kvm_memfd_alloc(size_t size, bool hugepages) memfd_flags |= MFD_HUGETLB; fd = memfd_create("kvm_selftest", memfd_flags); - TEST_ASSERT(fd != -1, "memfd_create() failed, errno: %i (%s)", - errno, strerror(errno)); + TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); r = ftruncate(fd, size); - TEST_ASSERT(!r, "ftruncate() failed, errno: %i (%s)", errno, strerror(errno)); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); - TEST_ASSERT(!r, "fallocate() failed, errno: %i (%s)", errno, strerror(errno)); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); return fd; } @@ 
-880,8 +875,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, vm_mem_backing_src_alias(src_type)->flag, region->fd, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, - "test_malloc failed, mmap_start: %p errno: %i", - region->mmap_start, errno); + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); TEST_ASSERT(!is_backing_src_hugetlb(src_type) || region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz), @@ -929,7 +923,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, vm_mem_backing_src_alias(src_type)->flag, region->fd, 0); TEST_ASSERT(region->mmap_alias != MAP_FAILED, - "mmap of alias failed, errno: %i", errno); + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); /* Align host alias address */ region->host_alias = align_ptr_up(region->mmap_alias, alignment); @@ -1115,8 +1109,8 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) vcpu_mmap_sz(), sizeof(*vcpu->state)); vcpu->state = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); - TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, " - "vcpu id: %u errno: %i", vcpuid, errno); + TEST_ASSERT(vcpu->state != MAP_FAILED, + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); /* Add to linked-list of VCPUs. */ list_add(&vcpu->list, &vm->vcpus); -- cgit v1.2.3 From b938cafdde4e2dac94154bd2a345f2baa68919dc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 09:14:17 -0800 Subject: KVM: selftests: Make x86-64's register dump helpers static Make regs_dump() and sregs_dump() static, they're only implemented by x86 and only used internally. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/lib/kvm_util_internal.h | 34 -------------------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 36 ++-------------------- 2 files changed, 2 insertions(+), 68 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 0c7c44499129..544b90df2f80 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -88,40 +88,6 @@ struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); */ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); -/* - * Register Dump - * - * Input Args: - * stream - Output FILE stream - * regs - Registers - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the registers given by @regs, to the FILE stream - * given by @stream. - */ -void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); - -/* - * System Register Dump - * - * Input Args: - * stream - Output FILE stream - * sregs - System registers - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the system registers given by @sregs, to the FILE stream - * given by @stream. 
- */ -void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); - struct userspace_mem_region * memslot2region(struct kvm_vm *vm, uint32_t memslot); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 852274d64483..31293dbe10ec 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -19,8 +19,7 @@ vm_vaddr_t exception_handlers; -void regs_dump(FILE *stream, struct kvm_regs *regs, - uint8_t indent) +static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx " "rcx: 0x%.16llx rdx: 0x%.16llx\n", @@ -43,21 +42,6 @@ void regs_dump(FILE *stream, struct kvm_regs *regs, regs->rip, regs->rflags); } -/* - * Segment Dump - * - * Input Args: - * stream - Output FILE stream - * segment - KVM segment - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the KVM segment given by @segment, to the FILE stream - * given by @stream. - */ static void segment_dump(FILE *stream, struct kvm_segment *segment, uint8_t indent) { @@ -75,21 +59,6 @@ static void segment_dump(FILE *stream, struct kvm_segment *segment, segment->unusable, segment->padding); } -/* - * dtable Dump - * - * Input Args: - * stream - Output FILE stream - * dtable - KVM dtable - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the KVM dtable given by @dtable, to the FILE stream - * given by @stream. - */ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, uint8_t indent) { @@ -99,8 +68,7 @@ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, dtable->padding[0], dtable->padding[1], dtable->padding[2]); } -void sregs_dump(FILE *stream, struct kvm_sregs *sregs, - uint8_t indent) +static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) { unsigned int i; -- cgit v1.2.3 From b530eba14c7001882ab517c8408878ef2de97863 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:21:19 -0800 Subject: KVM: selftests: Get rid of kvm_util_internal.h Fold kvm_util_internal.h into kvm_util_base.h, i.e. make all KVM utility stuff "public". Hiding struct implementations from tests has been a massive failure, as it has led to pointless and poorly named wrappers, unnecessarily opaque code, etc... Not to mention that the approach was a complete failure as evidenced by the non-zero number of tests that were including kvm_util_internal.h. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 97 ++++++++++++++++++++-- .../testing/selftests/kvm/lib/aarch64/processor.c | 1 - tools/testing/selftests/kvm/lib/aarch64/ucall.c | 1 - tools/testing/selftests/kvm/lib/aarch64/vgic.c | 1 - tools/testing/selftests/kvm/lib/elf.c | 1 - tools/testing/selftests/kvm/lib/kvm_util.c | 1 - .../testing/selftests/kvm/lib/kvm_util_internal.h | 94 --------------------- tools/testing/selftests/kvm/lib/riscv/processor.c | 1 - tools/testing/selftests/kvm/lib/riscv/ucall.c | 1 - tools/testing/selftests/kvm/lib/s390x/processor.c | 1 - .../selftests/kvm/lib/x86_64/perf_test_util.c | 1 - tools/testing/selftests/kvm/lib/x86_64/processor.c | 1 - tools/testing/selftests/kvm/lib/x86_64/svm.c | 1 - tools/testing/selftests/kvm/lib/x86_64/vmx.c | 1 - .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 1 - .../selftests/kvm/x86_64/sev_migrate_tests.c | 1 - .../kvm/x86_64/svm_nested_soft_inject_test.c | 1 - 17 files changed, 90 insertions(+), 116 deletions(-) delete mode 100644 tools/testing/selftests/kvm/lib/kvm_util_internal.h (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index f5bfdf0b4548..c0199f3b59bb 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -9,9 +9,13 @@ #include "test_util.h" -#include "asm/kvm.h" +#include +#include "linux/hashtable.h" #include "linux/list.h" -#include "linux/kvm.h" +#include +#include +#include "linux/rbtree.h" + #include #include "sparsebit.h" @@ -21,14 +25,93 @@ #define NSEC_PER_SEC 1000000000L +typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ +typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ + +struct userspace_mem_region { + struct kvm_userspace_memory_region region; + struct sparsebit *unused_phy_pages; + int fd; + off_t offset; + void *host_mem; + void *host_alias; + void *mmap_start; + void *mmap_alias; + size_t mmap_size; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; +}; + +struct vcpu { + struct list_head list; + uint32_t id; + int fd; + struct kvm_run *state; + struct kvm_dirty_gfn *dirty_gfns; + uint32_t fetch_index; + uint32_t dirty_gfns_count; +}; + +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + +struct kvm_vm { + int mode; + unsigned long type; + int kvm_fd; + int fd; + unsigned int pgtable_levels; + unsigned int page_size; + unsigned int page_shift; + unsigned int pa_bits; + unsigned int va_bits; + uint64_t max_gfn; + struct list_head vcpus; + struct userspace_mem_regions regions; + struct sparsebit *vpages_valid; + struct sparsebit *vpages_mapped; + bool has_irqchip; + bool pgd_created; + vm_paddr_t pgd; + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + vm_vaddr_t handlers; + uint32_t dirty_ring_size; +}; + + +#define kvm_for_each_vcpu(vm, i, vcpu) \ + for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ + if (!((vcpu) = vm->vcpus[i])) \ + continue; \ + else + +struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); + /* - * Callers of kvm_util only have an incomplete/opaque description of the - * structure kvm_util is using to maintain the state of a VM. 
+ * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. */ -struct kvm_vm; +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); -typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ -typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); /* Minimum allocated guest virtual and physical addresses */ #define KVM_UTIL_MIN_VADDR 0x2000 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index d28cc12cea1d..388bd7d87c02 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -10,7 +10,6 @@ #include "guest_modes.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 00be3ef195ca..868ebab5369e 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -5,7 +5,6 @@ * Copyright (C) 2018, Red Hat, Inc. */ #include "kvm_util.h" -#include "../kvm_util_internal.h" static vm_vaddr_t *ucall_exit_mmio_addr; diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 25d1ec65621d..c34f0f116f39 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -9,7 +9,6 @@ #include #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "vgic.h" #include "gic.h" #include "gic_v3.h" diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index 13e8e3dcf984..9f54c098d9d0 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -11,7 +11,6 @@ #include #include "kvm_util.h" -#include "kvm_util_internal.h" static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) { diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index c7df8ba04ec5..a57958a39c1b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -8,7 +8,6 @@ #define _GNU_SOURCE /* for program_invocation_name */ #include "test_util.h" #include "kvm_util.h" -#include "kvm_util_internal.h" #include "processor.h" #include diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h deleted file mode 100644 index 544b90df2f80..000000000000 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/lib/kvm_util_internal.h - * - * Copyright (C) 2018, Google LLC. 
- */ - -#ifndef SELFTEST_KVM_UTIL_INTERNAL_H -#define SELFTEST_KVM_UTIL_INTERNAL_H - -#include "linux/hashtable.h" -#include "linux/rbtree.h" - -#include "sparsebit.h" - -struct userspace_mem_region { - struct kvm_userspace_memory_region region; - struct sparsebit *unused_phy_pages; - int fd; - off_t offset; - void *host_mem; - void *host_alias; - void *mmap_start; - void *mmap_alias; - size_t mmap_size; - struct rb_node gpa_node; - struct rb_node hva_node; - struct hlist_node slot_node; -}; - -struct vcpu { - struct list_head list; - uint32_t id; - int fd; - struct kvm_run *state; - struct kvm_dirty_gfn *dirty_gfns; - uint32_t fetch_index; - uint32_t dirty_gfns_count; -}; - -struct userspace_mem_regions { - struct rb_root gpa_tree; - struct rb_root hva_tree; - DECLARE_HASHTABLE(slot_hash, 9); -}; - -struct kvm_vm { - int mode; - unsigned long type; - int kvm_fd; - int fd; - unsigned int pgtable_levels; - unsigned int page_size; - unsigned int page_shift; - unsigned int pa_bits; - unsigned int va_bits; - uint64_t max_gfn; - struct list_head vcpus; - struct userspace_mem_regions regions; - struct sparsebit *vpages_valid; - struct sparsebit *vpages_mapped; - bool has_irqchip; - bool pgd_created; - vm_paddr_t pgd; - vm_vaddr_t gdt; - vm_vaddr_t tss; - vm_vaddr_t idt; - vm_vaddr_t handlers; - uint32_t dirty_ring_size; -}; - -struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); - -/* - * Virtual Translation Tables Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps to the FILE stream given by @stream, the contents of all the - * virtual translation tables for the VM given by @vm. - */ -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -struct userspace_mem_region * -memslot2region(struct kvm_vm *vm, uint32_t memslot); - -#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */ diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index c89e6b1fbfb1..5ee8250dd74c 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -9,7 +9,6 @@ #include #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN 0xac0000 diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index c2ed59f5783d..48d91b77fa1d 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -8,7 +8,6 @@ #include #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" void ucall_init(struct kvm_vm *vm, void *arg) diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 7cc1051c4b71..53c413932f64 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -7,7 +7,6 @@ #include "processor.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #define PAGES_PER_REGION 4 diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c index e258524435a0..f525427a37c4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c @@ -12,7 +12,6 @@ #include "test_util.h" #include "kvm_util.h" #include "perf_test_util.h" -#include 
"../kvm_util_internal.h" #include "processor.h" #include "vmx.h" diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 31293dbe10ec..02266b8123df 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -7,7 +7,6 @@ #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #ifndef NUM_INTERRUPTS diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 736ee4a23df6..01a9d831da13 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -9,7 +9,6 @@ #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #include "svm_util.h" diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index b77a01d0a271..fdca123397e4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -7,7 +7,6 @@ #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #include "vmx.h" diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c index e83afd4bb4cf..419fbdc51246 100644 --- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -8,7 +8,6 @@ */ #include "kvm_util.h" -#include "../lib/kvm_util_internal.h" #define MAX_VCPU_ID 2 diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 7424bec5ae23..5b565aa11e32 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -12,7 +12,6 @@ #include "processor.h" #include "svm_util.h" #include "kselftest.h" -#include "../lib/kvm_util_internal.h" #define SEV_POLICY_ES 0b100 diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index f94f1b449aef..18061677154f 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -17,7 +17,6 @@ #include "processor.h" #include "svm_util.h" #include "test_util.h" -#include "../lib/kvm_util_internal.h" #define VCPU_ID 0 #define INT_NR 0x20 -- cgit v1.2.3 From a78593fd8717349852fa9a3f7292516edc5b50ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Feb 2022 10:57:06 -0800 Subject: KVM: selftests: Use KVM_IOCTL_ERROR() for one-off arm64 ioctls Use the KVM_IOCTL_ERROR() macro to generate error messages for a handful of one-off arm64 ioctls. The calls in question are made without an associated struct kvm_vm/kvm_vcpu as they are used to configure those structs, i.e. can't be easily converted to e.g. vcpu_ioctl(). 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/aarch64/processor.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 388bd7d87c02..2e73853f485e 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -472,15 +472,15 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); - TEST_ASSERT(vcpu_fd >= 0, "Can't create vcpu"); + TEST_ASSERT(vcpu_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu_fd)); err = ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init); - TEST_ASSERT(err == 0, "Can't get target"); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_PREFERRED_TARGET, err)); err = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &preferred_init); - TEST_ASSERT(err == 0, "Can't get init vcpu"); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_VCPU_INIT, err)); err = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg); - TEST_ASSERT(err == 0, "Can't get MMFR0"); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); *ps4k = ((val >> 28) & 0xf) != 0xf; *ps64k = ((val >> 24) & 0xf) == 0; -- cgit v1.2.3 From f3165dc02212e8eaf5cb675f105aac89c6de431c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Feb 2022 12:09:28 -0800 Subject: KVM: selftests: Drop @test param from kvm_create_device() Remove the two calls that pass @test=true to kvm_create_device() and drop the @test param entirely. The two removed calls don't check the return value of kvm_create_device(), so other than verifying KVM doesn't explode, which is extremely unlikely given that the non-test variant was _just_ called, they are pointless and provide no validation coverage.
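For context, the removed call shape versus the one that stays, as an illustrative sketch based on the diff below (error handling elided):

        /* Removed: return value ignored, so the extra "test" run proved nothing. */
        kvm_create_device(v.vm, gic_dev_type, true);

        /* Kept: create the device for real and use the returned fd. */
        v.gic_fd = kvm_create_device(v.vm, gic_dev_type);
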
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 14 ++++++-------- tools/testing/selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 12 ++++-------- 3 files changed, 11 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 42c9209f7786..4928cb43c8a7 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -65,7 +65,7 @@ static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, uint32_t nr v.gic_dev_type = gic_dev_type; v.vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); - v.gic_fd = kvm_create_device(v.vm, gic_dev_type, false); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); return v; } @@ -406,7 +406,7 @@ static void test_v3_typer_accesses(void) v.vm = vm_create_default(0, 0, guest_code); - v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); vm_vcpu_add_default(v.vm, 3, guest_code); @@ -487,7 +487,7 @@ static void test_v3_last_bit_redist_regions(void) v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); - v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); @@ -536,7 +536,7 @@ static void test_v3_last_bit_single_rdist(void) v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); - v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); @@ -603,7 +603,7 @@ static void test_v3_its_region(void) int its_fd, ret; v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); - its_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_ITS, false); + its_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_ITS); addr = 0x401000; ret = _kvm_device_access(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -656,13 +656,11 @@ int test_kvm_device(uint32_t gic_dev_type) ret = _kvm_create_device(v.vm, gic_dev_type, true, &fd); if (ret) return ret; - v.gic_fd = kvm_create_device(v.vm, gic_dev_type, false); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); ret = _kvm_create_device(v.vm, gic_dev_type, false, &fd); TEST_ASSERT(ret && errno == EEXIST, "create GIC device twice"); - kvm_create_device(v.vm, gic_dev_type, true); - /* try to create the other gic_dev_type */ other = VGIC_DEV_IS_V2(gic_dev_type) ? 
KVM_DEV_TYPE_ARM_VGIC_V3 : KVM_DEV_TYPE_ARM_VGIC_V2; diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c0199f3b59bb..6e1926abb248 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -485,7 +485,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd); -int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test); +int kvm_create_device(struct kvm_vm *vm, uint64_t type); int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, void *val, bool write); int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a57958a39c1b..cb2e42aa1c03 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1642,18 +1642,14 @@ int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd) return ret; } -int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test) +int kvm_create_device(struct kvm_vm *vm, uint64_t type) { int fd, ret; - ret = _kvm_create_device(vm, type, test, &fd); + ret = _kvm_create_device(vm, type, false, &fd); - if (!test) { - TEST_ASSERT(!ret, - "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno); - return fd; - } - return ret; + TEST_ASSERT(!ret, "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno); + return fd; } int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, -- cgit v1.2.3 From 98f94ce42ac672b2abdcf38cfc0e60fd52e04995 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Feb 2022 12:16:20 -0800 Subject: KVM: selftests: Move KVM_CREATE_DEVICE_TEST code to separate helper Move KVM_CREATE_DEVICE_TEST to its own helper, identifying "real" versus "test" device creation based on a hardcoded boolean buried in the middle of a param list is painful for readers. 
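At a call site the difference reads roughly as follows (an illustrative sketch; error handling elided):

        /* Before: the bare boolean gives no hint what "true" means. */
        ret = _kvm_create_device(vm, gic_dev_type, true, &fd);

        /* After: the helper's name carries the intent. */
        ret = __kvm_test_create_device(vm, gic_dev_type);
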
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 10 +++++----- .../testing/selftests/kvm/include/kvm_util_base.h | 3 ++- tools/testing/selftests/kvm/lib/aarch64/vgic.c | 3 +-- tools/testing/selftests/kvm/lib/kvm_util.c | 23 ++++++++++++++++------ 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 4928cb43c8a7..18054b94207f 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -649,24 +649,24 @@ int test_kvm_device(uint32_t gic_dev_type) v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); /* try to create a non existing KVM device */ - ret = _kvm_create_device(v.vm, 0, true, &fd); + ret = __kvm_test_create_device(v.vm, 0); TEST_ASSERT(ret && errno == ENODEV, "unsupported device"); /* trial mode */ - ret = _kvm_create_device(v.vm, gic_dev_type, true, &fd); + ret = __kvm_test_create_device(v.vm, gic_dev_type); if (ret) return ret; v.gic_fd = kvm_create_device(v.vm, gic_dev_type); - ret = _kvm_create_device(v.vm, gic_dev_type, false, &fd); + ret = __kvm_create_device(v.vm, gic_dev_type, &fd); TEST_ASSERT(ret && errno == EEXIST, "create GIC device twice"); /* try to create the other gic_dev_type */ other = VGIC_DEV_IS_V2(gic_dev_type) ? KVM_DEV_TYPE_ARM_VGIC_V3 : KVM_DEV_TYPE_ARM_VGIC_V2; - if (!_kvm_create_device(v.vm, other, true, &fd)) { - ret = _kvm_create_device(v.vm, other, false, &fd); + if (!__kvm_test_create_device(v.vm, other)) { + ret = __kvm_test_create_device(v.vm, other); TEST_ASSERT(ret && errno == EINVAL, "create GIC device while other version exists"); } diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 6e1926abb248..8795f4624c2c 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -484,7 +484,8 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); -int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd); +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); +int __kvm_create_device(struct kvm_vm *vm, uint64_t type, int *fd); int kvm_create_device(struct kvm_vm *vm, uint64_t type); int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, void *val, bool write); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index c34f0f116f39..74b4bcaffcfa 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -51,8 +51,7 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, nr_vcpus, nr_vcpus_created); /* Distributor setup */ - if (_kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, - false, &gic_fd) != 0) + if (__kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, &gic_fd)) return -1; kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index cb2e42aa1c03..9c0122b0e393 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1629,14 +1629,25 @@ int 
kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) return ret; } -int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd) +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type) { - struct kvm_create_device create_dev; + struct kvm_create_device create_dev = { + .type = type, + .flags = KVM_CREATE_DEVICE_TEST, + }; + + return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); +} + +int __kvm_create_device(struct kvm_vm *vm, uint64_t type, int *fd) +{ + struct kvm_create_device create_dev = { + .type = type, + .fd = -1, + .flags = 0, + }; int ret; - create_dev.type = type; - create_dev.fd = -1; - create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0; ret = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); *fd = create_dev.fd; return ret; @@ -1646,7 +1657,7 @@ int kvm_create_device(struct kvm_vm *vm, uint64_t type) { int fd, ret; - ret = _kvm_create_device(vm, type, false, &fd); + ret = __kvm_create_device(vm, type, &fd); TEST_ASSERT(!ret, "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno); return fd; -- cgit v1.2.3 From 279eacbefad5d163a20f8fcde2fd9362bc24f7c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Feb 2022 12:21:33 -0800 Subject: KVM: selftests: Multiplex return code and fd in __kvm_create_device() Multiplex the return value and fd (on success) in __kvm_create_device() to mimic common library helpers that return file descriptors, e.g. open(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 6 +++--- tools/testing/selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/lib/aarch64/vgic.c | 5 +++-- tools/testing/selftests/kvm/lib/kvm_util.c | 16 +++++++--------- 4 files changed, 14 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 18054b94207f..77d6c09e0f80 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -643,8 +643,8 @@ static void test_v3_its_region(void) int test_kvm_device(uint32_t gic_dev_type) { struct vm_gic v; - int ret, fd; uint32_t other; + int ret; v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); @@ -658,8 +658,8 @@ int test_kvm_device(uint32_t gic_dev_type) return ret; v.gic_fd = kvm_create_device(v.vm, gic_dev_type); - ret = __kvm_create_device(v.vm, gic_dev_type, &fd); - TEST_ASSERT(ret && errno == EEXIST, "create GIC device twice"); + ret = __kvm_create_device(v.vm, gic_dev_type); + TEST_ASSERT(ret < 0 && errno == EEXIST, "create GIC device twice"); /* try to create the other gic_dev_type */ other = VGIC_DEV_IS_V2(gic_dev_type) ? 
KVM_DEV_TYPE_ARM_VGIC_V3 diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 8795f4624c2c..1ccf44805fa0 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -485,7 +485,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); -int __kvm_create_device(struct kvm_vm *vm, uint64_t type, int *fd); +int __kvm_create_device(struct kvm_vm *vm, uint64_t type); int kvm_create_device(struct kvm_vm *vm, uint64_t type); int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, void *val, bool write); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 74b4bcaffcfa..7925b4c5dad0 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -51,8 +51,9 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, nr_vcpus, nr_vcpus_created); /* Distributor setup */ - if (__kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, &gic_fd)) - return -1; + gic_fd = __kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (gic_fd < 0) + return gic_fd; kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs, true); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9c0122b0e393..17e226107b65 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1639,27 +1639,25 @@ int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type) return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); } -int __kvm_create_device(struct kvm_vm *vm, uint64_t type, int *fd) +int __kvm_create_device(struct kvm_vm *vm, uint64_t type) { struct kvm_create_device create_dev = { .type = type, .fd = -1, .flags = 0, }; - int ret; + int err; - ret = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); - *fd = create_dev.fd; - return ret; + err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); + TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value"); + return err ? : create_dev.fd; } int kvm_create_device(struct kvm_vm *vm, uint64_t type) { - int fd, ret; - - ret = __kvm_create_device(vm, type, &fd); + int fd = __kvm_create_device(vm, type); - TEST_ASSERT(!ret, "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno); + TEST_ASSERT(fd >= 0, "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", fd, errno); return fd; } -- cgit v1.2.3 From 9367504f77ebb47d09847993aae01f8a38d5b4f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 2 Mar 2022 19:49:16 -0800 Subject: KVM: selftests: Rename KVM_HAS_DEVICE_ATTR helpers for consistency Rename kvm_device_check_attr() and its variants to kvm_has_device_attr() to be consistent with the ioctl names and with other helpers in the KVM selftests framework. 
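The convention after the rename, sketched for the device-fd helpers (the group/attr values are placeholders):

        /* __kvm_has_device_attr(): raw variant, returns the ioctl() result so callers can probe. */
        if (__kvm_has_device_attr(dev_fd, group, attr))
                return;         /* attribute not supported, skip this case */

        /* kvm_has_device_attr(): asserting wrapper, any failure aborts the test. */
        kvm_has_device_attr(dev_fd, group, attr);
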
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 12 +++++------- tools/testing/selftests/kvm/include/kvm_util_base.h | 6 +++--- tools/testing/selftests/kvm/lib/kvm_util.c | 12 ++++++------ tools/testing/selftests/kvm/system_counter_offset_test.c | 2 +- 4 files changed, 15 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 77d6c09e0f80..3013b4f522e7 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -127,14 +127,12 @@ static void subtest_dist_rdist(struct vm_gic *v) : gic_v2_dist_region; /* Check existing group/attributes */ - kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - dist.attr); + kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, dist.attr); - kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr); + kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, rdist.attr); /* check non existing attribute */ - ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, -1); + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, -1); TEST_ASSERT(ret && errno == ENXIO, "attribute not supported"); /* misaligned DIST and REDIST address settings */ @@ -176,7 +174,7 @@ static void subtest_dist_rdist(struct vm_gic *v) rdist.attr, &addr, true); TEST_ASSERT(ret && errno == EEXIST, "GIC redist base set again"); - ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST); if (!ret) { /* Attempt to mix legacy and new redistributor regions */ @@ -203,7 +201,7 @@ static void subtest_v3_redist_regions(struct vm_gic *v) uint64_t addr, expected_addr; int ret; - ret = kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + ret = kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST); TEST_ASSERT(!ret, "Multiple redist regions advertised"); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 1ccf44805fa0..66d896c8e19b 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -482,8 +482,8 @@ void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); */ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); -int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); -int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); +int kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); int __kvm_create_device(struct kvm_vm *vm, uint64_t type); int kvm_create_device(struct kvm_vm *vm, uint64_t type); @@ -494,7 +494,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); -int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, +int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr); int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c index 17e226107b65..ca313dc8b37a 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1610,7 +1610,7 @@ void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg * Device Ioctl */ -int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) { struct kvm_device_attr attribute = { .group = group, @@ -1621,9 +1621,9 @@ int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); } -int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) +int kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) { - int ret = _kvm_device_check_attr(dev_fd, group, attr); + int ret = __kvm_has_device_attr(dev_fd, group, attr); TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); return ret; @@ -1686,18 +1686,18 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, return ret; } -int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, +int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr) { struct vcpu *vcpu = vcpu_get(vm, vcpuid); - return _kvm_device_check_attr(vcpu->fd, group, attr); + return __kvm_has_device_attr(vcpu->fd, group, attr); } int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr) { - int ret = _vcpu_has_device_attr(vm, vcpuid, group, attr); + int ret = __vcpu_has_device_attr(vm, vcpuid, group, attr); TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno); return ret; diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index b337bbbfa41f..2b10c53abf4f 100644 --- a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -30,7 +30,7 @@ static struct test_case test_cases[] = { static void check_preconditions(struct kvm_vm *vm) { - if (!_vcpu_has_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET)) + if (!__vcpu_has_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET)) return; print_skip("KVM_VCPU_TSC_OFFSET not supported; skipping test"); -- cgit v1.2.3 From d2752e2eb331cbbed6f51f901f2685654e1cafa7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Feb 2022 16:33:21 -0800 Subject: KVM: selftests: Drop 'int' return from asserting *_has_device_attr() Drop the 'int' returns from the *_has_device_attr() helpers that assert the return is '0'; there's no point in returning '0' and "requiring" the caller to perform a redundant assertion.
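The caller-side effect, as a hedged sketch (the helper names are from the selftests; dev_fd, group, attr and ret are placeholders):

  /* Before: the wrapper asserted !ret internally *and* returned ret,
   * so this follow-up assertion could never fire. */
  ret = kvm_has_device_attr(dev_fd, group, attr);
  TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed");  /* redundant */

  /* After: the asserting wrapper is 'void'; tests that want to inspect
   * the raw result call __kvm_has_device_attr() instead. */
  kvm_has_device_attr(dev_fd, group, attr);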
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 4 ++-- tools/testing/selftests/kvm/include/kvm_util_base.h | 20 +++++++++++++++++--- tools/testing/selftests/kvm/lib/kvm_util.c | 17 ----------------- 3 files changed, 19 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 3013b4f522e7..fe3d0259a9f4 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -201,8 +201,8 @@ static void subtest_v3_redist_regions(struct vm_gic *v) uint64_t addr, expected_addr; int ret; - ret = kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST); + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); TEST_ASSERT(!ret, "Multiple redist regions advertised"); addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 66d896c8e19b..f9aeac540699 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -483,7 +483,14 @@ void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); -int kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); + +static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + int ret = __kvm_has_device_attr(dev_fd, group, attr); + + TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); +} + int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); int __kvm_create_device(struct kvm_vm *vm, uint64_t type); int kvm_create_device(struct kvm_vm *vm, uint64_t type); @@ -496,8 +503,15 @@ int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr); -int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr); + +static inline void vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, + uint32_t group, uint64_t attr) +{ + int ret = __vcpu_has_device_attr(vm, vcpuid, group, attr); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_HAS_DEVICE_ATTR, ret)); +} + int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr, void *val, bool write); int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index ca313dc8b37a..a7bc6b623871 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1621,14 +1621,6 @@ int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); } -int kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) -{ - int ret = __kvm_has_device_attr(dev_fd, group, attr); - - TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); - return ret; -} - int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type) { struct kvm_create_device create_dev = { @@ -1694,15 +1686,6 @@ int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, 
uint32_t group, return __kvm_has_device_attr(vcpu->fd, group, attr); } -int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr) -{ - int ret = __vcpu_has_device_attr(vm, vcpuid, group, attr); - - TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno); - return ret; -} - int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr, void *val, bool write) { -- cgit v1.2.3 From 4091818426d98f4e7093808264caf86ad90287c8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jun 2022 13:06:57 -0700 Subject: KVM: selftests: Split get/set device_attr helpers Split the get/set device_attr helpers instead of using a boolean param to select between get and set. Duplicating upper level wrappers is a very, very small price to pay for improved readability, and having constant (at compile time) inputs will allow the selftests framework to sanity check ioctl() invocations. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 8 +- tools/testing/selftests/kvm/aarch64/vgic_init.c | 246 ++++++++++----------- .../testing/selftests/kvm/include/kvm_util_base.h | 91 +++++--- tools/testing/selftests/kvm/lib/aarch64/vgic.c | 31 ++- tools/testing/selftests/kvm/lib/guest_modes.c | 4 +- tools/testing/selftests/kvm/lib/kvm_util.c | 58 +++-- .../selftests/kvm/system_counter_offset_test.c | 4 +- 7 files changed, 231 insertions(+), 211 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index f55c4c20d8b3..f04ca07c7f14 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -349,10 +349,10 @@ static void test_run(struct kvm_vm *vm) static void test_init_timer_irq(struct kvm_vm *vm) { /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ - vcpu_access_device_attr(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, - KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq, false); - vcpu_access_device_attr(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, - KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq, false); + vcpu_device_attr_get(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); + vcpu_device_attr_get(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); sync_global_to_guest(vm, ptimer_irq); sync_global_to_guest(vm, vtimer_irq); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index fe3d0259a9f4..4da1524cc49c 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -33,13 +33,10 @@ struct vm_gic { static uint64_t max_phys_size; /* helper to access a redistributor register */ -static int access_v3_redist_reg(int gicv3_fd, int vcpu, int offset, - uint32_t *val, bool write) +static int v3_redist_reg_get(int gicv3_fd, int vcpu, int offset, uint32_t *val) { - uint64_t attr = REG_OFFSET(vcpu, offset); - - return _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, - attr, val, write); + return __kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + REG_OFFSET(vcpu, offset), val); } /* dummy guest code */ @@ -137,41 +134,41 @@ static void subtest_dist_rdist(struct vm_gic *v) /* misaligned DIST and REDIST address settings */ addr = dist.alignment / 0x10; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - dist.attr, &addr, 
true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); TEST_ASSERT(ret && errno == EINVAL, "GIC dist base not aligned"); addr = rdist.alignment / 0x10; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); TEST_ASSERT(ret && errno == EINVAL, "GIC redist/cpu base not aligned"); /* out of range address */ addr = max_phys_size; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - dist.attr, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit"); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit"); /* Space for half a rdist (a rdist is: 2 * rdist.alignment). */ addr = max_phys_size - dist.alignment; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); TEST_ASSERT(ret && errno == E2BIG, "half of the redist is beyond IPA limit"); /* set REDIST base address @0x0*/ addr = 0x00000; - kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr, true); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); /* Attempt to create a second legacy redistributor region */ addr = 0xE0000; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - rdist.attr, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); TEST_ASSERT(ret && errno == EEXIST, "GIC redist base set again"); ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -179,9 +176,8 @@ static void subtest_dist_rdist(struct vm_gic *v) if (!ret) { /* Attempt to mix legacy and new redistributor regions */ addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, - &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION"); } @@ -191,8 +187,8 @@ static void subtest_dist_rdist(struct vm_gic *v) * on first vcpu run instead. 
*/ addr = rdist.size - rdist.alignment; - kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - dist.attr, &addr, true); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); } /* Test the new REDIST region API */ @@ -206,66 +202,66 @@ static void subtest_v3_redist_regions(struct vm_gic *v) TEST_ASSERT(!ret, "Multiple redist regions advertised"); addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0"); addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count== 0"); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "attempt to register the first rdist region with index != 0"); addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address"); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index"); addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "register an rdist region overlapping with another one"); addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "register redist region with index not +1"); addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); - kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); addr = REDIST_REGION_ATTR_ADDR(1, 
max_phys_size, 0, 2); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == E2BIG, "register redist region with base address beyond IPA range"); /* The last redist is above the pa range. */ addr = REDIST_REGION_ATTR_ADDR(2, max_phys_size - 0x30000, 0, 2); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == E2BIG, "register redist region with top address beyond IPA range"); addr = 0x260000; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); TEST_ASSERT(ret && errno == EINVAL, "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); @@ -278,28 +274,28 @@ static void subtest_v3_redist_regions(struct vm_gic *v) addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0); expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0"); addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1); expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1"); addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non existing region"); addr = 0x260000; - kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist"); } @@ -351,8 +347,8 @@ static void test_v3_new_redist_regions(void) v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); subtest_v3_redist_regions(&v); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); ret = run_vcpu(v.vm, 3); TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); @@ -364,8 +360,8 @@ static void test_v3_new_redist_regions(void) subtest_v3_redist_regions(&v); addr = 
REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); ret = run_vcpu(v.vm, 3); TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); @@ -377,17 +373,17 @@ static void test_v3_new_redist_regions(void) v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); subtest_v3_redist_regions(&v); - ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy); TEST_ASSERT(ret && errno == EFAULT, "register a third region allowing to cover the 4 vcpus"); addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); ret = run_vcpu(v.vm, 3); TEST_ASSERT(!ret, "vcpu run"); @@ -408,57 +404,57 @@ static void test_v3_typer_accesses(void) vm_vcpu_add_default(v.vm, 3, guest_code); - ret = access_v3_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); TEST_ASSERT(ret && errno == EINVAL, "attempting to read GICR_TYPER of non created vcpu"); vm_vcpu_add_default(v.vm, 1, guest_code); - ret = access_v3_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); TEST_ASSERT(ret && errno == EBUSY, "read GICR_TYPER before GIC initialized"); vm_vcpu_add_default(v.vm, 2, guest_code); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); for (i = 0; i < NR_VCPUS ; i++) { - ret = access_v3_redist_reg(v.gic_fd, i, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); TEST_ASSERT(!ret && val == i * 0x100, "read GICR_TYPER before rdist region setting"); } addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); /* The 2 first rdists should be put there (vcpu 0 and 3) */ - ret = access_v3_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); TEST_ASSERT(!ret && !val, "read typer of rdist #0"); - ret = access_v3_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1"); addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1); - ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region"); - ret = access_v3_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + ret = 
v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x100, "no redist region attached to vcpu #1 yet, last cannot be returned"); - ret = access_v3_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x200, "no redist region attached to vcpu #2, last cannot be returned"); addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - ret = access_v3_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); - ret = access_v3_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #1, last properly returned"); @@ -487,37 +483,37 @@ static void test_v3_last_bit_redist_regions(void) v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - ret = access_v3_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); - ret = access_v3_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); - ret = access_v3_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2"); - ret = access_v3_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3"); - ret = access_v3_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5"); - ret = access_v3_redist_reg(v.gic_fd, 4, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 4, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4"); vm_gic_destroy(&v); @@ -536,26 +532,26 @@ static void test_v3_last_bit_single_rdist(void) v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + 
kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); addr = 0x10000; - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); - ret = access_v3_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); - ret = access_v3_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #1"); - ret = access_v3_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #2"); - ret = access_v3_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #3"); - ret = access_v3_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); + ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #3"); vm_gic_destroy(&v); @@ -572,19 +568,19 @@ static void test_v3_redist_ipa_range_check_at_vcpu_run(void) /* Set space for 3 redists, we have 1 vcpu, so this succeeds. */ addr = max_phys_size - (3 * 2 * 0x10000); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); addr = 0x00000; - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); /* Add the rest of the VCPUs */ for (i = 1; i < NR_VCPUS; ++i) vm_vcpu_add_default(v.vm, i, guest_code); - kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); /* Attempt to run a vcpu without enough redist space. 
*/ ret = run_vcpu(v.vm, 2); @@ -604,31 +600,31 @@ static void test_v3_its_region(void) its_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_ITS); addr = 0x401000; - ret = _kvm_device_access(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr, true); + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); TEST_ASSERT(ret && errno == EINVAL, "ITS region with misaligned address"); addr = max_phys_size; - ret = _kvm_device_access(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr, true); + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); TEST_ASSERT(ret && errno == E2BIG, "register ITS region with base address beyond IPA range"); addr = max_phys_size - 0x10000; - ret = _kvm_device_access(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr, true); + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); TEST_ASSERT(ret && errno == E2BIG, "Half of ITS region is beyond IPA range"); /* This one succeeds setting the ITS base */ addr = 0x400000; - kvm_device_access(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr, true); + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); addr = 0x300000; - ret = _kvm_device_access(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_ITS_ADDR_TYPE, &addr, true); + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); TEST_ASSERT(ret && errno == EEXIST, "ITS base set again"); close(its_fd); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index f9aeac540699..6e63e7e57752 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -460,6 +460,65 @@ static inline int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid) return fd; } +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); + +static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + int ret = __kvm_has_device_attr(dev_fd, group, attr); + + TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); +} + +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_get(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_get(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); +} + +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_set(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_set(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); +} + +int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr); + +static inline void vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, + uint32_t group, uint64_t attr) +{ + int ret = __vcpu_has_device_attr(vm, vcpuid, group, attr); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_HAS_DEVICE_ATTR, ret)); +} + +int __vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr, void *val); +void vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr, void *val); +int __vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, 
uint32_t group, + uint64_t attr, void *val); +void vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr, void *val); +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); +int __kvm_create_device(struct kvm_vm *vm, uint64_t type); + +static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) +{ + int fd = __kvm_create_device(vm, type); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); + return fd; +} + void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); /* @@ -482,41 +541,9 @@ void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); */ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); -int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); - -static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) -{ - int ret = __kvm_has_device_attr(dev_fd, group, attr); - - TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); -} - -int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); -int __kvm_create_device(struct kvm_vm *vm, uint64_t type); -int kvm_create_device(struct kvm_vm *vm, uint64_t type); -int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, - void *val, bool write); -int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, - void *val, bool write); void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); -int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr); - -static inline void vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, - uint32_t group, uint64_t attr) -{ - int ret = __vcpu_has_device_attr(vm, vcpuid, group, attr); - - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_HAS_DEVICE_ATTR, ret)); -} - -int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val, bool write); -int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val, bool write); - #define KVM_MAX_IRQ_ROUTES 4096 struct kvm_irq_routing *kvm_gsi_routing_create(void); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index 7925b4c5dad0..cfe3067efbf0 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -55,27 +55,26 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, if (gic_fd < 0) return gic_fd; - kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, - 0, &nr_irqs, true); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs); - kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &gicd_base_gpa, true); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &gicd_base_gpa); nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); virt_map(vm, gicd_base_gpa, gicd_base_gpa, nr_gic_pages); /* Redistributor setup */ redist_attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, gicr_base_gpa, 0, 0); - kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &redist_attr, true); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + 
KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &redist_attr); nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); virt_map(vm, gicr_base_gpa, gicr_base_gpa, nr_gic_pages); - kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); return gic_fd; } @@ -88,14 +87,14 @@ int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) uint64_t val; int ret; - ret = _kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, - attr, &val, false); + ret = __kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); if (ret != 0) return ret; val |= 1U << index; - ret = _kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, - attr, &val, true); + ret = __kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); return ret; } @@ -155,9 +154,9 @@ static void vgic_poke_irq(int gic_fd, uint32_t intid, * intid will just make the read/writes point to above the intended * register space (i.e., ICPENDR after ISPENDR). */ - kvm_device_access(gic_fd, group, attr, &val, false); + kvm_device_attr_get(gic_fd, group, attr, &val); val |= 1ULL << index; - kvm_device_access(gic_fd, group, attr, &val, true); + kvm_device_attr_set(gic_fd, group, attr, &val); } void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, uint32_t vcpu) diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c index 9ab27b4169bf..0be56c63aed6 100644 --- a/tools/testing/selftests/kvm/lib/guest_modes.c +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -66,8 +66,8 @@ void guest_modes_append_default(void) kvm_fd = open_kvm_dev_path_or_exit(); vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, 0); - kvm_device_access(vm_fd, KVM_S390_VM_CPU_MODEL, - KVM_S390_VM_CPU_PROCESSOR, &info, false); + kvm_device_attr_get(vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); close(vm_fd); close(kvm_fd); /* Starting with z13 we have 47bits of physical address */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a7bc6b623871..220e079dc749 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1645,16 +1645,19 @@ int __kvm_create_device(struct kvm_vm *vm, uint64_t type) return err ? : create_dev.fd; } -int kvm_create_device(struct kvm_vm *vm, uint64_t type) +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val) { - int fd = __kvm_create_device(vm, type); + struct kvm_device_attr kvmattr = { + .group = group, + .attr = attr, + .flags = 0, + .addr = (uintptr_t)val, + }; - TEST_ASSERT(fd >= 0, "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", fd, errno); - return fd; + return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr); } -int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, - void *val, bool write) +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val) { struct kvm_device_attr kvmattr = { .group = group, @@ -1662,45 +1665,40 @@ int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, .flags = 0, .addr = (uintptr_t)val, }; - int ret; - ret = ioctl(dev_fd, write ? 
KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, - &kvmattr); - return ret; + return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr); } -int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, - void *val, bool write) +int __vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr, void *val) { - int ret = _kvm_device_access(dev_fd, group, attr, val, write); - - TEST_ASSERT(!ret, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno); - return ret; + return __kvm_device_attr_get(vcpu_get(vm, vcpuid)->fd, group, attr, val); } -int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr) +void vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr, void *val) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); - - return __kvm_has_device_attr(vcpu->fd, group, attr); + kvm_device_attr_get(vcpu_get(vm, vcpuid)->fd, group, attr, val); } -int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val, bool write) +int __vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr, void *val) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); + return __kvm_device_attr_set(vcpu_get(vm, vcpuid)->fd, group, attr, val); +} - return _kvm_device_access(vcpu->fd, group, attr, val, write); +void vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_set(vcpu_get(vm, vcpuid)->fd, group, attr, val); } -int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val, bool write) +int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, + uint64_t attr) { - int ret = _vcpu_access_device_attr(vm, vcpuid, group, attr, val, write); + struct vcpu *vcpu = vcpu_get(vm, vcpuid); - TEST_ASSERT(!ret, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno); - return ret; + return __kvm_has_device_attr(vcpu->fd, group, attr); } /* diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index 2b10c53abf4f..5dd9d28efb97 100644 --- a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -39,8 +39,8 @@ static void check_preconditions(struct kvm_vm *vm) static void setup_system_counter(struct kvm_vm *vm, struct test_case *test) { - vcpu_access_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, - KVM_VCPU_TSC_OFFSET, &test->tsc_offset, true); + vcpu_device_attr_set(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, + KVM_VCPU_TSC_OFFSET, &test->tsc_offset); } static uint64_t guest_read_system_counter(struct test_case *test) -- cgit v1.2.3 From 114eef6e461a6a0810d6ed354b4ce7f3b74fe547 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jun 2022 13:20:53 -0700 Subject: KVM: selftests: Dedup vgic_init's asserts and improve error messages Move the asserts for the many REDIST_REGS accesses into common helpers instead of copy+pasting the same, unhelpful asserts over and over. Not providing the actual (or expected) value makes it unnecessarily painful to debug failures, especially since test_assert() prints the errno unconditionally, e.g. on success, it may print a stale, misleading errno. 
Use kvm_device_attr_get() to handle the "success" check so that the "success" and "expected == actual" asserts are separated, which will make it far less likely that a user incorrectly assumes the ioctl() failed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 112 ++++++++++-------------- 1 file changed, 47 insertions(+), 65 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 4da1524cc49c..c5866c3f4516 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -32,11 +32,28 @@ struct vm_gic { static uint64_t max_phys_size; -/* helper to access a redistributor register */ -static int v3_redist_reg_get(int gicv3_fd, int vcpu, int offset, uint32_t *val) +/* + * Helpers to access a redistributor register and verify the ioctl() failed or + * succeeded as expected, and provided the correct value on success. + */ +static void v3_redist_reg_get_errno(int gicv3_fd, int vcpu, int offset, + int want, const char *msg) { - return __kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, - REG_OFFSET(vcpu, offset), val); + uint32_t ignored_val; + int ret = __kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + REG_OFFSET(vcpu, offset), &ignored_val); + + TEST_ASSERT(ret && errno == want, "%s; want errno = %d", msg, want); +} + +static void v3_redist_reg_get(int gicv3_fd, int vcpu, int offset, uint32_t want, + const char *msg) +{ + uint32_t val; + + kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + REG_OFFSET(vcpu, offset), &val); + TEST_ASSERT(val == want, "%s; want '0x%x', got '0x%x'", msg, want, val); } /* dummy guest code */ @@ -395,7 +412,6 @@ static void test_v3_typer_accesses(void) { struct vm_gic v; uint64_t addr; - uint32_t val; int ret, i; v.vm = vm_create_default(0, 0, guest_code); @@ -404,13 +420,13 @@ static void test_v3_typer_accesses(void) vm_vcpu_add_default(v.vm, 3, guest_code); - ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); - TEST_ASSERT(ret && errno == EINVAL, "attempting to read GICR_TYPER of non created vcpu"); + v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EINVAL, + "attempting to read GICR_TYPER of non created vcpu"); vm_vcpu_add_default(v.vm, 1, guest_code); - ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); - TEST_ASSERT(ret && errno == EBUSY, "read GICR_TYPER before GIC initialized"); + v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EBUSY, + "read GICR_TYPER before GIC initialized"); vm_vcpu_add_default(v.vm, 2, guest_code); @@ -418,9 +434,8 @@ static void test_v3_typer_accesses(void) KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); for (i = 0; i < NR_VCPUS ; i++) { - ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == i * 0x100, - "read GICR_TYPER before rdist region setting"); + v3_redist_reg_get(v.gic_fd, i, GICR_TYPER, i * 0x100, + "read GICR_TYPER before rdist region setting"); } addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); @@ -428,35 +443,26 @@ static void test_v3_typer_accesses(void) KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); /* The 2 first rdists should be put there (vcpu 0 and 3) */ - ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); - TEST_ASSERT(!ret && !val, "read typer of rdist #0"); - - ret = v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 
0x0, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #1"); addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1); ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region"); - ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x100, - "no redist region attached to vcpu #1 yet, last cannot be returned"); - - ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x200, - "no redist region attached to vcpu #2, last cannot be returned"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, + "no redist region attached to vcpu #1 yet, last cannot be returned"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, + "no redist region attached to vcpu #2, last cannot be returned"); addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); - - ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x210, - "read typer of rdist #1, last properly returned"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, + "read typer of rdist #1, last properly returned"); vm_gic_destroy(&v); } @@ -476,8 +482,6 @@ static void test_v3_last_bit_redist_regions(void) uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; struct vm_gic v; uint64_t addr; - uint32_t val; - int ret; v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); @@ -498,23 +502,12 @@ static void test_v3_last_bit_redist_regions(void) kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); - - ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); - - ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2"); - - ret = v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3"); - - ret = v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5"); - - ret = v3_redist_reg_get(v.gic_fd, 4, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4"); + v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, "read typer of rdist #2"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #3"); + v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #5"); + v3_redist_reg_get(v.gic_fd, 4, GICR_TYPER, 0x410, "read typer of rdist #4"); vm_gic_destroy(&v); } @@ -525,8 +518,6 @@ static void test_v3_last_bit_single_rdist(void) uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; struct vm_gic v; uint64_t addr; - uint32_t val; - int ret; v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); @@ -539,20 +530,11 @@ static void test_v3_last_bit_single_rdist(void) kvm_device_attr_set(v.gic_fd, 
KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); - ret = v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); - - ret = v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #1"); - - ret = v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #2"); - - ret = v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #3"); - - ret = v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, &val); - TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #3"); + v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x300, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #2"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #3"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, "read typer of rdist #3"); vm_gic_destroy(&v); } -- cgit v1.2.3 From c472df1ac318c1120f32cd4a70d202806be30c43 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:14:29 -0800 Subject: KVM: selftests: Add a VM backpointer to 'struct vcpu' Add a backpointer to 'struct vcpu' so that tests can get at the owning VM when passing around a vCPU object. Long term, this will be little more than a nice-to-have feature, but in the short term it is a critical step toward purging the VM+vcpu_id ioctl mess without introducing even more churn. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 1 + 2 files changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 6e63e7e57752..2e1453cb0511 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -47,6 +47,7 @@ struct vcpu { struct list_head list; uint32_t id; int fd; + struct kvm_vm *vm; struct kvm_run *state; struct kvm_dirty_gfn *dirty_gfns; uint32_t fetch_index; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 220e079dc749..2d82b5720737 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1099,6 +1099,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) vcpu = calloc(1, sizeof(*vcpu)); TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); + vcpu->vm = vm; vcpu->id = vcpuid; vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpuid); TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd)); -- cgit v1.2.3 From ac71220934a9240864ec485932308866500b1e61 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 12:55:59 -0700 Subject: KVM: selftests: Consolidate KVM_ENABLE_CAP usage Add __vm_enable_cap() and use it for negative tests that expect KVM_ENABLE_CAP to fail. Opportunistically clean up the MAX_VCPU_ID test error messages. 
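A sketch of the intended split, mirroring the max_vcpuid_cap_test.c hunks below (illustrative; 'vm' and 'ret' come from the surrounding test):

  struct kvm_enable_cap cap = {
          .cap = KVM_CAP_MAX_VCPU_ID,
          .args = { MAX_VCPU_ID },
  };

  /* Success path: vm_enable_cap() asserts that the ioctl() succeeds. */
  vm_enable_cap(vm, &cap);

  /* Negative test: __vm_enable_cap() returns the raw result so the
   * expected failure can be checked explicitly. */
  cap.args[0] = MAX_VCPU_ID + 1;
  ret = __vm_enable_cap(vm, &cap);
  TEST_ASSERT(ret < 0, "Setting KVM_CAP_MAX_VCPU_ID twice should fail");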
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 4 ++ tools/testing/selftests/kvm/lib/x86_64/vmx.c | 2 +- .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 19 +++--- .../selftests/kvm/x86_64/sev_migrate_tests.c | 78 +++++++++++----------- 4 files changed, 52 insertions(+), 51 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 2e1453cb0511..f0afc1dce8ba 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -231,6 +231,10 @@ static inline int vm_check_cap(struct kvm_vm *vm, long cap) return ret; } +static inline int __vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +{ + return __vm_ioctl(vm, KVM_ENABLE_CAP, cap); +} static inline void vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) { vm_ioctl(vm, KVM_ENABLE_CAP, cap); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index fdca123397e4..dbfca8ed79aa 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -51,7 +51,7 @@ int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) .args[0] = (unsigned long)&evmcs_ver }; - vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap); + vcpu_enable_cap(vm, vcpu_id, &enable_evmcs_cap); /* KVM should return supported EVMCS version range */ TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c index 419fbdc51246..c6fd36a31c8c 100644 --- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -25,28 +25,25 @@ int main(int argc, char *argv[]) /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ cap.cap = KVM_CAP_MAX_VCPU_ID; cap.args[0] = ret + 1; - ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + ret = __vm_enable_cap(vm, &cap); TEST_ASSERT(ret < 0, - "Unexpected success to enable KVM_CAP_MAX_VCPU_ID" - "beyond KVM cap!\n"); + "Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail"); /* Set KVM_CAP_MAX_VCPU_ID */ cap.cap = KVM_CAP_MAX_VCPU_ID; cap.args[0] = MAX_VCPU_ID; - ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); - TEST_ASSERT(ret == 0, - "Unexpected failure to enable KVM_CAP_MAX_VCPU_ID!\n"); + vm_enable_cap(vm, &cap); + /* Try to set KVM_CAP_MAX_VCPU_ID again */ cap.args[0] = MAX_VCPU_ID + 1; - ret = ioctl(vm->fd, KVM_ENABLE_CAP, &cap); + ret = __vm_enable_cap(vm, &cap); TEST_ASSERT(ret < 0, - "Unexpected success to enable KVM_CAP_MAX_VCPU_ID again\n"); + "Setting KVM_CAP_MAX_VCPU_ID multiple times should fail"); /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/ - ret = ioctl(vm->fd, KVM_CREATE_VCPU, MAX_VCPU_ID); - TEST_ASSERT(ret < 0, - "Unexpected success in creating a vCPU with VCPU ID out of range\n"); + ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID); + TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail"); kvm_vm_free(vm); return 0; diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 5b565aa11e32..f127f2fccca6 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -80,22 +80,22 @@ static struct kvm_vm *aux_vm_create(bool with_vcpus) return 
vm; } -static int __sev_migrate_from(int dst_fd, int src_fd) +static int __sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) { struct kvm_enable_cap cap = { .cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, - .args = { src_fd } + .args = { src->fd } }; - return ioctl(dst_fd, KVM_ENABLE_CAP, &cap); + return __vm_enable_cap(dst, &cap); } -static void sev_migrate_from(int dst_fd, int src_fd) +static void sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) { int ret; - ret = __sev_migrate_from(dst_fd, src_fd); + ret = __sev_migrate_from(dst, src); TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d\n", ret, errno); } @@ -110,13 +110,13 @@ static void test_sev_migrate_from(bool es) dst_vms[i] = aux_vm_create(true); /* Initial migration from the src to the first dst. */ - sev_migrate_from(dst_vms[0]->fd, src_vm->fd); + sev_migrate_from(dst_vms[0], src_vm); for (i = 1; i < NR_MIGRATE_TEST_VMS; i++) - sev_migrate_from(dst_vms[i]->fd, dst_vms[i - 1]->fd); + sev_migrate_from(dst_vms[i], dst_vms[i - 1]); /* Migrate the guest back to the original VM. */ - ret = __sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd); + ret = __sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]); TEST_ASSERT(ret == -1 && errno == EIO, "VM that was migrated from should be dead. ret %d, errno: %d\n", ret, errno); @@ -128,7 +128,7 @@ static void test_sev_migrate_from(bool es) struct locking_thread_input { struct kvm_vm *vm; - int source_fds[NR_LOCK_TESTING_THREADS]; + struct kvm_vm *source_vms[NR_LOCK_TESTING_THREADS]; }; static void *locking_test_thread(void *arg) @@ -138,7 +138,7 @@ static void *locking_test_thread(void *arg) for (i = 0; i < NR_LOCK_TESTING_ITERATIONS; ++i) { j = i % NR_LOCK_TESTING_THREADS; - __sev_migrate_from(input->vm->fd, input->source_fds[j]); + __sev_migrate_from(input->vm, input->source_vms[j]); } return NULL; @@ -152,11 +152,11 @@ static void test_sev_migrate_locking(void) for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) { input[i].vm = sev_vm_create(/* es= */ false); - input[0].source_fds[i] = input[i].vm->fd; + input[0].source_vms[i] = input[i].vm; } for (i = 1; i < NR_LOCK_TESTING_THREADS; ++i) - memcpy(input[i].source_fds, input[0].source_fds, - sizeof(input[i].source_fds)); + memcpy(input[i].source_vms, input[0].source_vms, + sizeof(input[i].source_vms)); for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) pthread_create(&pt[i], NULL, locking_test_thread, &input[i]); @@ -175,7 +175,7 @@ static void test_sev_migrate_parameters(void) vm_no_vcpu = vm_create(0); vm_no_sev = aux_vm_create(true); - ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); + ret = __sev_migrate_from(vm_no_vcpu, vm_no_sev); TEST_ASSERT(ret == -1 && errno == EINVAL, "Migrations require SEV enabled. ret %d, errno: %d\n", ret, errno); @@ -189,25 +189,25 @@ static void test_sev_migrate_parameters(void) sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); - ret = __sev_migrate_from(sev_vm->fd, sev_es_vm->fd); + ret = __sev_migrate_from(sev_vm, sev_es_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, "Should not be able migrate to SEV enabled VM. ret: %d, errno: %d\n", ret, errno); - ret = __sev_migrate_from(sev_es_vm->fd, sev_vm->fd); + ret = __sev_migrate_from(sev_es_vm, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, "Should not be able migrate to SEV-ES enabled VM. 
ret: %d, errno: %d\n", ret, errno); - ret = __sev_migrate_from(vm_no_vcpu->fd, sev_es_vm->fd); + ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, "SEV-ES migrations require same number of vCPUS. ret: %d, errno: %d\n", ret, errno); - ret = __sev_migrate_from(vm_no_vcpu->fd, sev_es_vm_no_vmsa->fd); + ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm_no_vmsa); TEST_ASSERT( ret == -1 && errno == EINVAL, "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d\n", @@ -221,22 +221,22 @@ out: kvm_vm_free(vm_no_sev); } -static int __sev_mirror_create(int dst_fd, int src_fd) +static int __sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) { struct kvm_enable_cap cap = { .cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, - .args = { src_fd } + .args = { src->fd } }; - return ioctl(dst_fd, KVM_ENABLE_CAP, &cap); + return __vm_enable_cap(dst, &cap); } -static void sev_mirror_create(int dst_fd, int src_fd) +static void sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) { int ret; - ret = __sev_mirror_create(dst_fd, src_fd); + ret = __sev_mirror_create(dst, src); TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d\n", ret, errno); } @@ -284,7 +284,7 @@ static void test_sev_mirror(bool es) src_vm = sev_vm_create(es); dst_vm = aux_vm_create(false); - sev_mirror_create(dst_vm->fd, src_vm->fd); + sev_mirror_create(dst_vm, src_vm); /* Check that we can complete creation of the mirror VM. */ for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) @@ -308,18 +308,18 @@ static void test_sev_mirror_parameters(void) vm_with_vcpu = aux_vm_create(true); vm_no_vcpu = aux_vm_create(false); - ret = __sev_mirror_create(sev_vm->fd, sev_vm->fd); + ret = __sev_mirror_create(sev_vm, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, "Should not be able copy context to self. ret: %d, errno: %d\n", ret, errno); - ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd); + ret = __sev_mirror_create(vm_no_vcpu, vm_with_vcpu); TEST_ASSERT(ret == -1 && errno == EINVAL, "Copy context requires SEV enabled. ret %d, errno: %d\n", ret, errno); - ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd); + ret = __sev_mirror_create(vm_with_vcpu, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n", @@ -329,13 +329,13 @@ static void test_sev_mirror_parameters(void) goto out; sev_es_vm = sev_vm_create(/* es= */ true); - ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd); + ret = __sev_mirror_create(sev_vm, sev_es_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, "Should not be able copy context to SEV enabled VM. ret: %d, errno: %d\n", ret, errno); - ret = __sev_mirror_create(sev_es_vm->fd, sev_vm->fd); + ret = __sev_mirror_create(sev_es_vm, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, "Should not be able copy context to SEV-ES enabled VM. 
ret: %d, errno: %d\n", @@ -363,16 +363,16 @@ static void test_sev_move_copy(void) dst2_mirror_vm = aux_vm_create(false); dst3_mirror_vm = aux_vm_create(false); - sev_mirror_create(mirror_vm->fd, sev_vm->fd); + sev_mirror_create(mirror_vm, sev_vm); - sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); - sev_migrate_from(dst_vm->fd, sev_vm->fd); + sev_migrate_from(dst_mirror_vm, mirror_vm); + sev_migrate_from(dst_vm, sev_vm); - sev_migrate_from(dst2_vm->fd, dst_vm->fd); - sev_migrate_from(dst2_mirror_vm->fd, dst_mirror_vm->fd); + sev_migrate_from(dst2_vm, dst_vm); + sev_migrate_from(dst2_mirror_vm, dst_mirror_vm); - sev_migrate_from(dst3_mirror_vm->fd, dst2_mirror_vm->fd); - sev_migrate_from(dst3_vm->fd, dst2_vm->fd); + sev_migrate_from(dst3_mirror_vm, dst2_mirror_vm); + sev_migrate_from(dst3_vm, dst2_vm); kvm_vm_free(dst_vm); kvm_vm_free(sev_vm); @@ -392,10 +392,10 @@ static void test_sev_move_copy(void) mirror_vm = aux_vm_create(false); dst_mirror_vm = aux_vm_create(false); - sev_mirror_create(mirror_vm->fd, sev_vm->fd); + sev_mirror_create(mirror_vm, sev_vm); - sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); - sev_migrate_from(dst_vm->fd, sev_vm->fd); + sev_migrate_from(dst_mirror_vm, mirror_vm); + sev_migrate_from(dst_vm, sev_vm); kvm_vm_free(mirror_vm); kvm_vm_free(dst_mirror_vm); -- cgit v1.2.3 From a12c86c447f4bce6d2725c3fab426aba6630b376 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 13:19:09 -0700 Subject: KVM: selftests: Simplify KVM_ENABLE_CAP helper APIs Rework the KVM_ENABLE_CAP helpers to take the cap and arg0; literally every current user, and likely every future user, wants to set 0 or 1 arguments and nothing else. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 11 +------- tools/testing/selftests/kvm/dirty_log_perf_test.c | 9 +++---- tools/testing/selftests/kvm/dirty_log_test.c | 5 +--- .../testing/selftests/kvm/include/kvm_util_base.h | 18 ++++++++----- tools/testing/selftests/kvm/lib/kvm_util.c | 6 +---- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 8 ++---- .../selftests/kvm/x86_64/emulator_error_test.c | 6 +---- .../selftests/kvm/x86_64/fix_hypercall_test.c | 6 ++--- .../testing/selftests/kvm/x86_64/hyperv_features.c | 16 +++-------- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 5 +--- .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 12 +++------ .../selftests/kvm/x86_64/platform_info_test.c | 14 ++-------- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 5 +--- .../selftests/kvm/x86_64/sev_migrate_tests.c | 14 ++-------- .../selftests/kvm/x86_64/triple_fault_event_test.c | 7 +---- .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 31 ++++++++-------------- 16 files changed, 47 insertions(+), 126 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 024a84064f1f..1a351f3f443d 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -156,15 +156,6 @@ static void host_test_cpu_on(void) kvm_vm_free(vm); } -static void enable_system_suspend(struct kvm_vm *vm) -{ - struct kvm_enable_cap cap = { - .cap = KVM_CAP_ARM_SYSTEM_SUSPEND, - }; - - vm_enable_cap(vm, &cap); -} - static void guest_test_system_suspend(void) { uint64_t ret; @@ -183,7 +174,7 @@ static void host_test_system_suspend(void) struct kvm_vm *vm; vm = setup_vm(guest_test_system_suspend); - enable_system_suspend(vm); + vm_enable_cap(vm, 
KVM_CAP_ARM_SYSTEM_SUSPEND, 0); vcpu_power_off(vm, VCPU_ID_TARGET); run = vcpu_state(vm, VCPU_ID_SOURCE); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index d60a34cdfaee..8aaa0949707a 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -213,7 +213,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec get_dirty_log_total = (struct timespec){0}; struct timespec vcpu_dirty_total = (struct timespec){0}; struct timespec avg; - struct kvm_enable_cap cap = {}; struct timespec clear_dirty_log_total = (struct timespec){0}; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, @@ -229,11 +228,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) bitmaps = alloc_bitmaps(p->slots, pages_per_slot); - if (dirty_log_manual_caps) { - cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; - cap.args[0] = dirty_log_manual_caps; - vm_enable_cap(vm, &cap); - } + if (dirty_log_manual_caps) + vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, + dirty_log_manual_caps); arch_setup_vm(vm, nr_vcpus); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 5752486764c9..9dfc861a3cf3 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -217,16 +217,13 @@ static bool clear_log_supported(void) static void clear_log_create_vm_done(struct kvm_vm *vm) { - struct kvm_enable_cap cap = {}; u64 manual_caps; manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!"); manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | KVM_DIRTY_LOG_INITIALLY_SET); - cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; - cap.args[0] = manual_caps; - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, manual_caps); } static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot, diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index f0afc1dce8ba..c9d94c9f2031 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -231,13 +231,17 @@ static inline int vm_check_cap(struct kvm_vm *vm, long cap) return ret; } -static inline int __vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) { - return __vm_ioctl(vm, KVM_ENABLE_CAP, cap); + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); } -static inline void vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) { - vm_ioctl(vm, KVM_ENABLE_CAP, cap); + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); } void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); @@ -363,9 +367,11 @@ void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); static inline void vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_enable_cap *cap) + uint32_t cap, uint64_t arg0) { - vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, cap); + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + 
vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_cap); } static inline void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 2d82b5720737..8f670cef6faa 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -85,11 +85,7 @@ int kvm_check_cap(long cap) void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { - struct kvm_enable_cap cap = { 0 }; - - cap.cap = KVM_CAP_DIRTY_LOG_RING; - cap.args[0] = ring_size; - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); vm->dirty_ring_size = ring_size; } diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index dbfca8ed79aa..a12c0e1224af 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -46,12 +46,8 @@ int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) { uint16_t evmcs_ver; - struct kvm_enable_cap enable_evmcs_cap = { - .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - .args[0] = (unsigned long)&evmcs_ver - }; - - vcpu_enable_cap(vm, vcpu_id, &enable_evmcs_cap); + vcpu_enable_cap(vm, vcpu_id, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + (unsigned long)&evmcs_ver); /* KVM should return supported EVMCS version range */ TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index aeb3850f81bd..9c156f9cfa15 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -161,10 +161,6 @@ static uint64_t process_ucall(struct kvm_vm *vm) int main(int argc, char *argv[]) { - struct kvm_enable_cap emul_failure_cap = { - .cap = KVM_CAP_EXIT_ON_EMULATION_FAILURE, - .args[0] = 1, - }; struct kvm_cpuid_entry2 *entry; struct kvm_cpuid2 *cpuid; struct kvm_vm *vm; @@ -192,7 +188,7 @@ int main(int argc, char *argv[]) rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); - vm_enable_cap(vm, &emul_failure_cap); + vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, MEM_REGION_GPA, MEM_REGION_SLOT, diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index 1f5c32146f3d..81f9f5b1f655 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -140,15 +140,13 @@ static void test_fix_hypercall(void) static void test_fix_hypercall_disabled(void) { - struct kvm_enable_cap cap = {0}; struct kvm_vm *vm; vm = vm_create_default(VCPU_ID, 0, guest_main); setup_ud_vector(vm); - cap.cap = KVM_CAP_DISABLE_QUIRKS2; - cap.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN; - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_FIX_HYPERCALL_INSN); ud_expected = true; sync_global_to_guest(vm, ud_expected); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 672915ce73d8..7ff6e4d70333 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -182,10 +182,6 @@ static void guest_test_msrs_access(void) }; struct kvm_cpuid2 *best; vm_vaddr_t msr_gva; - 
struct kvm_enable_cap cap = { - .cap = KVM_CAP_HYPERV_ENFORCE_CPUID, - .args = {1} - }; struct msr_data *msr; while (true) { @@ -196,7 +192,7 @@ static void guest_test_msrs_access(void) msr = addr_gva2hva(vm, msr_gva); vcpu_args_set(vm, VCPU_ID, 1, msr_gva); - vcpu_enable_cap(vm, VCPU_ID, &cap); + vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); vcpu_set_hv_cpuid(vm, VCPU_ID); @@ -337,9 +333,7 @@ static void guest_test_msrs_access(void) * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 * capability enabled and guest visible CPUID bit unset. */ - cap.cap = KVM_CAP_HYPERV_SYNIC2; - cap.args[0] = 0; - vcpu_enable_cap(vm, VCPU_ID, &cap); + vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_HYPERV_SYNIC2, 0); break; case 22: feat.eax |= HV_MSR_SYNIC_AVAILABLE; @@ -518,10 +512,6 @@ static void guest_test_hcalls_access(void) struct kvm_cpuid_entry2 dbg = { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }; - struct kvm_enable_cap cap = { - .cap = KVM_CAP_HYPERV_ENFORCE_CPUID, - .args = {1} - }; vm_vaddr_t hcall_page, hcall_params; struct hcall_data *hcall; struct kvm_cpuid2 *best; @@ -542,7 +532,7 @@ static void guest_test_hcalls_access(void) memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params); - vcpu_enable_cap(vm, VCPU_ID, &cap); + vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); vcpu_set_hv_cpuid(vm, VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 04ed975662c9..5eea3ac7958e 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -206,7 +206,6 @@ static void enter_guest(struct kvm_vm *vm) int main(void) { - struct kvm_enable_cap cap = {0}; struct kvm_cpuid2 *best; struct kvm_vm *vm; @@ -217,9 +216,7 @@ int main(void) vm = vm_create_default(VCPU_ID, 0, guest_main); - cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID; - cap.args[0] = 1; - vcpu_enable_cap(vm, VCPU_ID, &cap); + vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); best = kvm_get_supported_cpuid(); clear_kvm_cpuid_features(best); diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c index c6fd36a31c8c..7211fd8d5d24 100644 --- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -14,7 +14,6 @@ int main(int argc, char *argv[]) { struct kvm_vm *vm; - struct kvm_enable_cap cap = { 0 }; int ret; vm = vm_create(0); @@ -23,21 +22,16 @@ int main(int argc, char *argv[]) ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ - cap.cap = KVM_CAP_MAX_VCPU_ID; - cap.args[0] = ret + 1; - ret = __vm_enable_cap(vm, &cap); + ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, ret + 1); TEST_ASSERT(ret < 0, "Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail"); /* Set KVM_CAP_MAX_VCPU_ID */ - cap.cap = KVM_CAP_MAX_VCPU_ID; - cap.args[0] = MAX_VCPU_ID; - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID); /* Try to set KVM_CAP_MAX_VCPU_ID again */ - cap.args[0] = MAX_VCPU_ID + 1; - ret = __vm_enable_cap(vm, &cap); + ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1); TEST_ASSERT(ret < 0, "Setting KVM_CAP_MAX_VCPU_ID multiple times should fail"); diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c 
index 1e89688cbbbf..e79c04581ca8 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -35,22 +35,12 @@ static void guest_code(void) } } -static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) -{ - struct kvm_enable_cap cap = {}; - - cap.cap = KVM_CAP_MSR_PLATFORM_INFO; - cap.flags = 0; - cap.args[0] = (int)enable; - vm_enable_cap(vm, &cap); -} - static void test_msr_platform_info_enabled(struct kvm_vm *vm) { struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; - set_msr_platform_info_enabled(vm, true); + vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, true); vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", @@ -69,7 +59,7 @@ static void test_msr_platform_info_disabled(struct kvm_vm *vm) { struct kvm_run *run = vcpu_state(vm, VCPU_ID); - set_msr_platform_info_enabled(vm, false); + vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, false); vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 84cd5e717fcf..9ec6ba2ad89b 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -360,7 +360,6 @@ static void test_pmu_config_disable(void (*guest_code)(void)) { int r; struct kvm_vm *vm; - struct kvm_enable_cap cap = { 0 }; r = kvm_check_cap(KVM_CAP_PMU_CAPABILITY); if (!(r & KVM_PMU_CAP_DISABLE)) @@ -368,9 +367,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vm = vm_create_without_vcpus(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); - cap.cap = KVM_CAP_PMU_CAPABILITY; - cap.args[0] = KVM_PMU_CAP_DISABLE; - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); vm_vcpu_add_default(vm, VCPU_ID, guest_code); vm_init_descriptor_tables(vm); diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index f127f2fccca6..e814748bf7ba 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -82,12 +82,7 @@ static struct kvm_vm *aux_vm_create(bool with_vcpus) static int __sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) { - struct kvm_enable_cap cap = { - .cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, - .args = { src->fd } - }; - - return __vm_enable_cap(dst, &cap); + return __vm_enable_cap(dst, KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, src->fd); } @@ -223,12 +218,7 @@ out: static int __sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) { - struct kvm_enable_cap cap = { - .cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, - .args = { src->fd } - }; - - return __vm_enable_cap(dst, &cap); + return __vm_enable_cap(dst, KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, src->fd); } diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 66378140764d..68e0f1c5ec5a 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -46,11 +46,6 @@ int main(void) vm_vaddr_t vmx_pages_gva; struct ucall uc; - struct kvm_enable_cap cap = { - .cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT, - .args = {1} - }; - if (!nested_vmx_supported()) { 
print_skip("Nested VMX not supported"); exit(KSFT_SKIP); @@ -62,7 +57,7 @@ int main(void) } vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1); run = vcpu_state(vm, VCPU_ID); vcpu_alloc_vmx(vm, &vmx_pages_gva); diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index e3e20e8848d0..23e9292580c9 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -550,11 +550,8 @@ static void run_guest_then_process_ucall_done(struct kvm_vm *vm) process_ucall_done(vm); } -static void test_msr_filter_allow(void) { - struct kvm_enable_cap cap = { - .cap = KVM_CAP_X86_USER_SPACE_MSR, - .args[0] = KVM_MSR_EXIT_REASON_FILTER, - }; +static void test_msr_filter_allow(void) +{ struct kvm_vm *vm; int rc; @@ -564,7 +561,7 @@ static void test_msr_filter_allow(void) { rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); @@ -673,13 +670,8 @@ static void handle_wrmsr(struct kvm_run *run) } } -static void test_msr_filter_deny(void) { - struct kvm_enable_cap cap = { - .cap = KVM_CAP_X86_USER_SPACE_MSR, - .args[0] = KVM_MSR_EXIT_REASON_INVAL | - KVM_MSR_EXIT_REASON_UNKNOWN | - KVM_MSR_EXIT_REASON_FILTER, - }; +static void test_msr_filter_deny(void) +{ struct kvm_vm *vm; struct kvm_run *run; int rc; @@ -691,7 +683,9 @@ static void test_msr_filter_deny(void) { rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER); rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); @@ -726,11 +720,8 @@ done: kvm_vm_free(vm); } -static void test_msr_permission_bitmap(void) { - struct kvm_enable_cap cap = { - .cap = KVM_CAP_X86_USER_SPACE_MSR, - .args[0] = KVM_MSR_EXIT_REASON_FILTER, - }; +static void test_msr_permission_bitmap(void) +{ struct kvm_vm *vm; int rc; @@ -740,7 +731,7 @@ static void test_msr_permission_bitmap(void) { rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); - vm_enable_cap(vm, &cap); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); -- cgit v1.2.3 From c095cb609b3aa56fd24e2907556c24fef88b9180 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 14:12:14 -0700 Subject: KVM: selftests: Cache list of MSRs to save/restore Cache the list of MSRs to save restore, mostly to justify not freeing the list in the caller, which simplifies consumption of the list. Opportunistically move the XSS test's so called is_supported_msr() to common code as kvm_msr_is_in_save_restore_list(). The XSS is "supported" by KVM, it's simply not in the save/restore list because KVM doesn't yet allow a non-zero value. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 4 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 81 ++++++++++------------ tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 27 ++------ 3 files changed, 45 insertions(+), 67 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 583754bf18c3..b8023c30bded 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -427,8 +427,10 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); void kvm_x86_state_cleanup(struct kvm_x86_state *state); -struct kvm_msr_list *kvm_get_msr_index_list(void); +const struct kvm_msr_list *kvm_get_msr_index_list(void); +bool kvm_msr_is_in_save_restore_list(uint32_t msr_index); uint64_t kvm_get_feature_msr(uint64_t msr_index); + struct kvm_cpuid2 *kvm_get_supported_cpuid(void); struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 02266b8123df..3f83857009a3 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -719,18 +719,6 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) return cpuid; } -/* - * KVM Get MSR - * - * Input Args: - * msr_index - Index of MSR - * - * Output Args: None - * - * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. - * - * Get value of MSR for VCPU. - */ uint64_t kvm_get_feature_msr(uint64_t msr_index) { struct { @@ -903,38 +891,47 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) sregs_dump(stream, &sregs, indent + 4); } -static int kvm_get_num_msrs_fd(int kvm_fd) +const struct kvm_msr_list *kvm_get_msr_index_list(void) { + static struct kvm_msr_list *list; struct kvm_msr_list nmsrs; - int r; + int kvm_fd, r; + + if (list) + return list; + + kvm_fd = open_kvm_dev_path_or_exit(); nmsrs.nmsrs = 0; r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); TEST_ASSERT(r == -1 && errno == E2BIG, - "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", r); + "Expected -E2BIG, got rc: %i errno: %i (%s)", + r, errno, strerror(errno)); - return nmsrs.nmsrs; -} + list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0])); + TEST_ASSERT(list, "-ENOMEM when allocating MSR index list"); + list->nmsrs = nmsrs.nmsrs; -static int kvm_get_num_msrs(struct kvm_vm *vm) -{ - return kvm_get_num_msrs_fd(vm->kvm_fd); + kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + close(kvm_fd); + + TEST_ASSERT(list->nmsrs == nmsrs.nmsrs, + "Number of save/restore MSRs changed, was %d, now %d", + nmsrs.nmsrs, list->nmsrs); + return list; } -struct kvm_msr_list *kvm_get_msr_index_list(void) +bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) { - struct kvm_msr_list *list; - int nmsrs, kvm_fd; - - kvm_fd = open_kvm_dev_path_or_exit(); + const struct kvm_msr_list *list = kvm_get_msr_index_list(); + int i; - nmsrs = kvm_get_num_msrs_fd(kvm_fd); - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - close(kvm_fd); + for (i = 0; i < list->nmsrs; ++i) { + if (list->indices[i] == msr_index) + return true; + } - return list; + return false; } static int 
vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu, @@ -955,10 +952,10 @@ static int vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu, struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) { + const struct kvm_msr_list *msr_list = kvm_get_msr_index_list(); struct vcpu *vcpu = vcpu_get(vm, vcpuid); - struct kvm_msr_list *list; struct kvm_x86_state *state; - int nmsrs, r, i; + int r, i; static int nested_size = -1; if (nested_size == -1) { @@ -976,12 +973,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) */ vcpu_run_complete_io(vm, vcpuid); - nmsrs = kvm_get_num_msrs(vm); - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - kvm_ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - - state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); + state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0])); r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", r); @@ -1019,18 +1011,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) } else state->nested.size = 0; - state->msrs.nmsrs = nmsrs; - for (i = 0; i < nmsrs; i++) - state->msrs.entries[i].index = list->indices[i]; + state->msrs.nmsrs = msr_list->nmsrs; + for (i = 0; i < msr_list->nmsrs; i++) + state->msrs.entries[i].index = msr_list->indices[i]; r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", - r, r == nmsrs ? -1 : list->indices[r]); + TEST_ASSERT(r == msr_list->nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", + r, r == msr_list->nmsrs ? -1 : msr_list->indices[r]); r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", r); - free(list); return state; } diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 3529376747c2..7bd15f8a805c 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -17,28 +17,11 @@ #define X86_FEATURE_XSAVES (1<<3) -bool is_supported_msr(u32 msr_index) -{ - struct kvm_msr_list *list; - bool found = false; - int i; - - list = kvm_get_msr_index_list(); - for (i = 0; i < list->nmsrs; ++i) { - if (list->indices[i] == msr_index) { - found = true; - break; - } - } - - free(list); - return found; -} - int main(int argc, char *argv[]) { struct kvm_cpuid_entry2 *entry; bool xss_supported = false; + bool xss_in_msr_list; struct kvm_vm *vm; uint64_t xss_val; int i, r; @@ -64,12 +47,14 @@ int main(int argc, char *argv[]) * At present, KVM only supports a guest IA32_XSS value of 0. Verify * that trying to set the guest IA32_XSS to an unsupported value fails. * Also, in the future when a non-zero value succeeds check that - * IA32_XSS is in the KVM_GET_MSR_INDEX_LIST. + * IA32_XSS is in the list of MSRs to save/restore. 
*/ + xss_in_msr_list = kvm_msr_is_in_save_restore_list(MSR_IA32_XSS); for (i = 0; i < MSR_BITS; ++i) { r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i); - TEST_ASSERT(r == 0 || is_supported_msr(MSR_IA32_XSS), - "IA32_XSS was able to be set, but was not found in KVM_GET_MSR_INDEX_LIST.\n"); + + TEST_ASSERT(r == 0 || xss_in_msr_list, + "IA32_XSS was able to be set, but was not in save/restore list"); } kvm_vm_free(vm); -- cgit v1.2.3 From 0ce74180f306981534d023199017a90b77a92004 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 14:12:22 -0700 Subject: KVM: selftests: Harden and comment XSS / KVM_SET_MSRS interaction Assert that KVM_SET_MSRS returns '0' or '1' when setting XSS to a non-zero value. The ioctl() itself should "succeed", it's only the setting of the XSS MSR that should fail/fault. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 7bd15f8a805c..a6abcb559e7c 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -53,7 +53,12 @@ int main(int argc, char *argv[]) for (i = 0; i < MSR_BITS; ++i) { r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i); - TEST_ASSERT(r == 0 || xss_in_msr_list, + /* + * Setting a list of MSRs returns the entry that "faulted", or + * the last entry +1 if all MSRs were successfully written. + */ + TEST_ASSERT(!r || r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); + TEST_ASSERT(r != 1 || xss_in_msr_list, "IA32_XSS was able to be set, but was not in save/restore list"); } -- cgit v1.2.3 From 2128e30b01867a4d5e41bbaba8e35bcca7537f3b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 12:24:18 -0700 Subject: KVM: selftests: Dedup MSR index list helpers, simplify dedicated test Consolidate the helper for retrieving the list of save/restore MSRs and the list of feature MSRs, and use the common helpers in the related get_msr_index_features test. Switching to the common helpers eliminates the testcase that KVM returns the same -E2BIG result if the input number of MSRs is '1' versus '0', but that testcase isn't very interesting, e.g. '0' and '1' are equally arbitrary, and certainly not worth the additional code.
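For reference, the size-probe idiom the consolidated helper is built around looks roughly like this (a sketch with error handling trimmed; variable names are illustrative):

	struct kvm_msr_list nmsrs = { .nmsrs = 0 };
	struct kvm_msr_list *list;
	int r;

	/* An undersized list "fails" with E2BIG and reports the real count. */
	r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
	TEST_ASSERT(r == -1 && errno == E2BIG, "expected E2BIG from probe");

	list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0]));
	list->nmsrs = nmsrs.nmsrs;
	kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);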
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/x86_64/processor.c | 39 +++++-- .../selftests/kvm/x86_64/get_msr_index_features.c | 112 +++------------------ 3 files changed, 46 insertions(+), 106 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index b8023c30bded..9145da0bc61e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -428,6 +428,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, void kvm_x86_state_cleanup(struct kvm_x86_state *state); const struct kvm_msr_list *kvm_get_msr_index_list(void); +const struct kvm_msr_list *kvm_get_feature_msr_index_list(void); bool kvm_msr_is_in_save_restore_list(uint32_t msr_index); uint64_t kvm_get_feature_msr(uint64_t msr_index); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 3f83857009a3..18fce5e8a5e5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -891,19 +891,20 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) sregs_dump(stream, &sregs, indent + 4); } -const struct kvm_msr_list *kvm_get_msr_index_list(void) +static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs) { - static struct kvm_msr_list *list; + struct kvm_msr_list *list; struct kvm_msr_list nmsrs; int kvm_fd, r; - if (list) - return list; - kvm_fd = open_kvm_dev_path_or_exit(); nmsrs.nmsrs = 0; - r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + if (!feature_msrs) + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + else + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs); + TEST_ASSERT(r == -1 && errno == E2BIG, "Expected -E2BIG, got rc: %i errno: %i (%s)", r, errno, strerror(errno)); @@ -912,15 +913,37 @@ const struct kvm_msr_list *kvm_get_msr_index_list(void) TEST_ASSERT(list, "-ENOMEM when allocating MSR index list"); list->nmsrs = nmsrs.nmsrs; - kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + if (!feature_msrs) + kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + else + kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); close(kvm_fd); TEST_ASSERT(list->nmsrs == nmsrs.nmsrs, - "Number of save/restore MSRs changed, was %d, now %d", + "Number of MSRs in list changed, was %d, now %d", nmsrs.nmsrs, list->nmsrs); return list; } +const struct kvm_msr_list *kvm_get_msr_index_list(void) +{ + static const struct kvm_msr_list *list; + + if (!list) + list = __kvm_get_msr_index_list(false); + return list; +} + + +const struct kvm_msr_list *kvm_get_feature_msr_index_list(void) +{ + static const struct kvm_msr_list *list; + + if (!list) + list = __kvm_get_msr_index_list(true); + return list; +} + bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) { const struct kvm_msr_list *list = kvm_get_msr_index_list(); diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c index 4ef60adbe108..1e366fdfe7be 100644 --- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -15,108 +15,24 @@ #include "kvm_util.h" #include "processor.h" -static int kvm_num_index_msrs(int kvm_fd, int nmsrs) -{ - struct kvm_msr_list 
*list; - int r; - - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - TEST_ASSERT(r == -1 && errno == E2BIG, - "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", - r); - - r = list->nmsrs; - free(list); - return r; -} - -static void test_get_msr_index(void) -{ - int old_res, res, kvm_fd; - struct kvm_msr_list *list; - - kvm_fd = open_kvm_dev_path_or_exit(); - - old_res = kvm_num_index_msrs(kvm_fd, 0); - TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); - - if (old_res != 1) { - res = kvm_num_index_msrs(kvm_fd, 1); - TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); - TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); - } - - list = malloc(sizeof(*list) + old_res * sizeof(list->indices[0])); - list->nmsrs = old_res; - kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - - TEST_ASSERT(list->nmsrs == old_res, "Expecting nmsrs to be identical"); - free(list); - - close(kvm_fd); -} - -static int kvm_num_feature_msrs(int kvm_fd, int nmsrs) -{ - struct kvm_msr_list *list; - int r; - - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); - TEST_ASSERT(r == -1 && errno == E2BIG, - "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST probe, r: %i", - r); - - r = list->nmsrs; - free(list); - return r; -} - -struct kvm_msr_list *kvm_get_msr_feature_list(int kvm_fd, int nmsrs) -{ - struct kvm_msr_list *list; - - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); - - return list; -} - -static void test_get_msr_feature(void) +int main(int argc, char *argv[]) { - int res, old_res, i, kvm_fd; - struct kvm_msr_list *feature_list; - - kvm_fd = open_kvm_dev_path_or_exit(); - - old_res = kvm_num_feature_msrs(kvm_fd, 0); - TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); - - if (old_res != 1) { - res = kvm_num_feature_msrs(kvm_fd, 1); - TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); - TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); + const struct kvm_msr_list *feature_list; + int i; + + /* + * Skip the entire test if MSR_FEATURES isn't supported, other tests + * will cover the "regular" list of MSRs, the coverage here is purely + * opportunistic and not interesting on its own. + */ + if (!kvm_check_cap(KVM_CAP_GET_MSR_FEATURES)) { + print_skip("KVM_CAP_GET_MSR_FEATURES not supported"); + exit(KSFT_SKIP); } - feature_list = kvm_get_msr_feature_list(kvm_fd, old_res); - TEST_ASSERT(old_res == feature_list->nmsrs, - "Unmatching number of msr indexes"); + (void)kvm_get_msr_index_list(); + feature_list = kvm_get_feature_msr_index_list(); for (i = 0; i < feature_list->nmsrs; i++) kvm_get_feature_msr(feature_list->indices[i]); - - free(feature_list); - close(kvm_fd); -} - -int main(int argc, char *argv[]) -{ - if (kvm_check_cap(KVM_CAP_GET_MSR_FEATURES)) - test_get_msr_feature(); - - test_get_msr_index(); } -- cgit v1.2.3 From 877bd3997c508691b30054524353d574a3597cd9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 10:25:56 -0700 Subject: KVM: selftests: Rename MP_STATE and GUEST_DEBUG helpers for consistency Move the get/set part of the MP_STATE and GUEST_DEBUG helpers to the end to align with the many other ioctl() wrappers/helpers. Note, this is not an endorsement of the predominant style, the goal is purely to provide consistency in the selftests. 
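Concretely, the convention is object first, get/set last, e.g. (before vs. after, illustrative):

	/* Old, verb-first names. */
	vcpu_set_mp_state(vm, vcpuid, &mp_state);
	vcpu_set_guest_debug(vm, vcpuid, &debug);

	/* New names, matching the other ioctl() wrappers. */
	vcpu_mp_state_set(vm, vcpuid, &mp_state);
	vcpu_guest_debug_set(vm, vcpuid, &debug);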
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 +- tools/testing/selftests/kvm/include/kvm_util_base.h | 9 +++++++-- tools/testing/selftests/kvm/lib/x86_64/processor.c | 2 +- tools/testing/selftests/kvm/x86_64/debug_regs.c | 2 +- tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 1a351f3f443d..1485d0b05b66 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -70,7 +70,7 @@ static void vcpu_power_off(struct kvm_vm *vm, uint32_t vcpuid) .mp_state = KVM_MP_STATE_STOPPED, }; - vcpu_set_mp_state(vm, vcpuid, &mp_state); + vcpu_mp_state_set(vm, vcpuid, &mp_state); } static struct kvm_vm *setup_vm(void *guest_code) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c9d94c9f2031..edbbbbe4cd5d 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -374,13 +374,18 @@ static inline void vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_cap); } -static inline void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_guest_debug_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_guest_debug *debug) { vcpu_ioctl(vm, vcpuid, KVM_SET_GUEST_DEBUG, debug); } -static inline void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_mp_state_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_MP_STATE, mp_state); +} +static inline void vcpu_mp_state_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_mp_state *mp_state) { vcpu_ioctl(vm, vcpuid, KVM_SET_MP_STATE, mp_state); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 18fce5e8a5e5..8dfe40eeceac 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -654,7 +654,7 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) /* Setup the MP state */ mp_state.mp_state = 0; - vcpu_set_mp_state(vm, vcpuid, &mp_state); + vcpu_mp_state_set(vm, vcpuid, &mp_state); } /* diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index 5f078db1bcba..f726645bb9c3 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -67,7 +67,7 @@ static void guest_code(void) } #define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug)) -#define APPLY_DEBUG() vcpu_set_guest_debug(vm, VCPU_ID, &debug) +#define APPLY_DEBUG() vcpu_guest_debug_set(vm, VCPU_ID, &debug) #define CAST_TO_RIP(v) ((unsigned long long)&(v)) #define SET_RIP(v) do { \ vcpu_regs_get(vm, VCPU_ID, ®s); \ diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 18061677154f..f834b9a1a7fa 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -166,7 +166,7 @@ static void run_test(bool is_nmi) vcpu_args_set(vm, VCPU_ID, 3, 
svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); memset(&debug, 0, sizeof(debug)); - vcpu_set_guest_debug(vm, VCPU_ID, &debug); + vcpu_guest_debug_set(vm, VCPU_ID, &debug); struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; -- cgit v1.2.3 From 6ebfef83f03f216b25b09a386bef16d05796cdaa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 10:30:06 -0700 Subject: KVM: selftest: Add proper helpers for x86-specific save/restore ioctls Add helpers for the various one-off ioctl()s used by x86's vCPU state save/restore helpers, and convert the other open coded ioctl()s to use existing helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 54 +++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 130 +++++++-------------- 2 files changed, 93 insertions(+), 91 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 9145da0bc61e..0bb9ba955d18 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -432,6 +432,60 @@ const struct kvm_msr_list *kvm_get_feature_msr_index_list(void); bool kvm_msr_is_in_save_restore_list(uint32_t msr_index); uint64_t kvm_get_feature_msr(uint64_t msr_index); +static inline void vcpu_msrs_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_msrs *msrs) +{ + int r = __vcpu_ioctl(vm, vcpuid, KVM_GET_MSRS, msrs); + + TEST_ASSERT(r == msrs->nmsrs, + "KVM_GET_MSRS failed, r: %i (failed on MSR %x)", + r, r < 0 || r >= msrs->nmsrs ? -1 : msrs->entries[r].index); +} +static inline void vcpu_msrs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_msrs *msrs) +{ + int r = __vcpu_ioctl(vm, vcpuid, KVM_SET_MSRS, msrs); + + TEST_ASSERT(r == msrs->nmsrs, + "KVM_SET_MSRS failed, r: %i (failed on MSR %x)", + r, r < 0 || r >= msrs->nmsrs ?
-1 : msrs->entries[r].index); +} +static inline void vcpu_debugregs_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_debugregs *debugregs) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_DEBUGREGS, debugregs); +} +static inline void vcpu_debugregs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_debugregs *debugregs) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_DEBUGREGS, debugregs); +} +static inline void vcpu_xsave_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_XSAVE, xsave); +} +static inline void vcpu_xsave2_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_XSAVE2, xsave); +} +static inline void vcpu_xsave_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_XSAVE, xsave); +} +static inline void vcpu_xcrs_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_xcrs *xcrs) +{ + vcpu_ioctl(vm, vcpuid, KVM_GET_XCRS, xcrs); +} +static inline void vcpu_xcrs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_xcrs *xcrs) +{ + vcpu_ioctl(vm, vcpuid, KVM_SET_XCRS, xcrs); +} + struct kvm_cpuid2 *kvm_get_supported_cpuid(void); struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 8dfe40eeceac..47bc84d1aeb0 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -814,13 +814,11 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) struct kvm_msrs header; struct kvm_msr_entry entry; } buffer = {}; - int r; buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - r = __vcpu_ioctl(vm, vcpuid, KVM_GET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r)); + vcpu_msrs_get(vm, vcpuid, &buffer.header); return buffer.entry.data; } @@ -957,28 +955,26 @@ bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) return false; } -static int vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu, - struct kvm_x86_state *state) +static void vcpu_save_xsave_state(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_x86_state *state) { - int size; - - size = vm_check_cap(vm, KVM_CAP_XSAVE2); - if (!size) - size = sizeof(struct kvm_xsave); + int size = vm_check_cap(vm, KVM_CAP_XSAVE2); - state->xsave = malloc(size); - if (size == sizeof(struct kvm_xsave)) - return ioctl(vcpu->fd, KVM_GET_XSAVE, state->xsave); - else - return ioctl(vcpu->fd, KVM_GET_XSAVE2, state->xsave); + if (size) { + state->xsave = malloc(size); + vcpu_xsave2_get(vm, vcpuid, state->xsave); + } else { + state->xsave = malloc(sizeof(struct kvm_xsave)); + vcpu_xsave_get(vm, vcpuid, state->xsave); + } } struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) { const struct kvm_msr_list *msr_list = kvm_get_msr_index_list(); - struct vcpu *vcpu = vcpu_get(vm, vcpuid); struct kvm_x86_state *state; - int r, i; + int i; + static int nested_size = -1; if (nested_size == -1) { @@ -997,102 +993,54 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) vcpu_run_complete_io(vm, vcpuid); state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0])); - r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from 
KVM_GET_MP_STATE, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", - r); - - r = vcpu_save_xsave_state(vm, vcpu, state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", - r); - - if (kvm_check_cap(KVM_CAP_XCRS)) { - r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", - r); - } - r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", - r); + vcpu_events_get(vm, vcpuid, &state->events); + vcpu_mp_state_get(vm, vcpuid, &state->mp_state); + vcpu_regs_get(vm, vcpuid, &state->regs); + vcpu_save_xsave_state(vm, vcpuid, state); + + if (kvm_check_cap(KVM_CAP_XCRS)) + vcpu_xcrs_get(vm, vcpuid, &state->xcrs); + + vcpu_sregs_get(vm, vcpuid, &state->sregs); if (nested_size) { state->nested.size = sizeof(state->nested_); - r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", - r); + + vcpu_nested_state_get(vm, vcpuid, &state->nested); TEST_ASSERT(state->nested.size <= nested_size, "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", state->nested.size, nested_size); - } else + } else { state->nested.size = 0; + } state->msrs.nmsrs = msr_list->nmsrs; for (i = 0; i < msr_list->nmsrs; i++) state->msrs.entries[i].index = msr_list->indices[i]; - r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == msr_list->nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", - r, r == msr_list->nmsrs ? -1 : msr_list->indices[r]); + vcpu_msrs_get(vm, vcpuid, &state->msrs); - r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", - r); + vcpu_debugregs_get(vm, vcpuid, &state->debugregs); return state; } void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); - int r; - - r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); - TEST_ASSERT(r == state->msrs.nmsrs, - "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", - r, r == state->msrs.nmsrs ? 
-1 : state->msrs.entries[r].index); - - if (kvm_check_cap(KVM_CAP_XCRS)) { - r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", - r); - } + vcpu_sregs_set(vm, vcpuid, &state->sregs); + vcpu_msrs_set(vm, vcpuid, &state->msrs); - r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", - r); + if (kvm_check_cap(KVM_CAP_XCRS)) + vcpu_xcrs_set(vm, vcpuid, &state->xcrs); - r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", - r); + vcpu_xsave_set(vm, vcpuid, state->xsave); + vcpu_events_set(vm, vcpuid, &state->events); + vcpu_mp_state_set(vm, vcpuid, &state->mp_state); + vcpu_debugregs_set(vm, vcpuid, &state->debugregs); + vcpu_regs_set(vm, vcpuid, &state->regs); - r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", - r); - - if (state->nested.size) { - r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", - r); - } + if (state->nested.size) + vcpu_nested_state_set(vm, vcpuid, &state->nested); } void kvm_x86_state_cleanup(struct kvm_x86_state *state) -- cgit v1.2.3 From f17686aac61fb23c08efba56ee3f3ceb89572d08 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:14:51 -0800 Subject: KVM: selftests: Add vm_create_*() variants to expose/return 'struct vcpu' Add VM creation helpers to expose/return 'struct vcpu' so that tests don't have to hardcode a VCPU_ID or make assumptions about what vCPU ID is used by the framework just to retrieve a vCPU the test created. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 16 ++++++++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index edbbbbe4cd5d..c46c03750043 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -628,6 +628,22 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, /* Create a default VM without any vcpus. */ struct kvm_vm *vm_create_without_vcpus(enum vm_guest_mode mode, uint64_t pages); +/* + * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages + * additional pages of guest memory. Returns the VM and vCPU (via out param). + */ +struct kvm_vm *__vm_create_with_one_vcpu(struct vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code); + +static inline struct kvm_vm *vm_create_with_one_vcpu(struct vcpu **vcpu, + void *guest_code) +{ + return __vm_create_with_one_vcpu(vcpu, 0, guest_code); +} + +struct vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); + /* * Adds a vCPU with reasonable defaults (e.g. 
a stack) * diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8f670cef6faa..1c5caf2ddca4 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -369,6 +369,16 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, (uint32_t []){ vcpuid }); } +struct kvm_vm *__vm_create_with_one_vcpu(struct vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + struct kvm_vm *vm = vm_create_default(0, extra_mem_pages, guest_code); + + *vcpu = vcpu_get(vm, 0); + return vm; +} + /* * VM Restart * @@ -403,6 +413,14 @@ void kvm_vm_restart(struct kvm_vm *vmp) } } +struct vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) +{ + kvm_vm_restart(vm); + + vm_vcpu_add(vm, 0); + return vcpu_get(vm, 0); +} + /* * Userspace Memory Region Find * -- cgit v1.2.3 From 0c276ff22c7eb1a6ebf889b25d631f4cd358a482 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 12:58:44 -0700 Subject: KVM: selftests: Push vm_adjust_num_guest_pages() into "w/o vCPUs" helper Move the call to vm_adjust_num_guest_pages() from vm_create_with_vcpus() down into vm_create_without_vcpus(). This will allow a future patch to make the "w/o vCPUs" variant the common inner helper, e.g. so that the "with_vcpus" helper calls the "without_vcpus" helper, instead of having them be separate paths. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 4 ++-- tools/testing/selftests/kvm/x86_64/monitor_mwait_test | Bin 0 -> 1485656 bytes 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/kvm/x86_64/monitor_mwait_test (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1c5caf2ddca4..8c0310094d73 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -282,6 +282,8 @@ struct kvm_vm *vm_create_without_vcpus(enum vm_guest_mode mode, uint64_t pages) { struct kvm_vm *vm; + pages = vm_adjust_num_guest_pages(mode, pages); + vm = __vm_create(mode, pages); kvm_vm_elf_load(vm, program_invocation_name); @@ -341,8 +343,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, "nr_vcpus = %d too large for host, max-vcpus = %d", nr_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); - pages = vm_adjust_num_guest_pages(mode, pages); - vm = vm_create_without_vcpus(mode, pages); for (i = 0; i < nr_vcpus; ++i) { diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test new file mode 100755 index 000000000000..598b597e1eec Binary files /dev/null and b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test differ -- cgit v1.2.3 From bb47ed8b71d06d7ede40374fd3064e2457eafb62 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 13:00:58 -0700 Subject: KVM: selftests: Use vm_create_without_vcpus() in set_boot_cpu_id Use vm_create_without_vcpus() in set_boot_cpu_id instead of open coding the equivalent now that the "without_vcpus" variant does vm_adjust_num_guest_pages().
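For reference, the open-coded sequence that vm_create_without_vcpus() now covers looks roughly like this (a condensed sketch of the per-test boilerplate, not a verbatim copy of any one test):

	/* Before: each test adjusted the page count and built the VM by hand. */
	pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages);
	vm = vm_create(pages);
	kvm_vm_elf_load(vm, program_invocation_name);
	vm_create_irqchip(vm);	/* x86 only */

	/* After: a single helper performs the same steps. */
	vm = vm_create_without_vcpus(VM_MODE_DEFAULT, pages);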
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index b4da92ddc1c6..4c5775a8de6a 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -82,18 +82,11 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) static struct kvm_vm *create_vm(void) { - struct kvm_vm *vm; uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * 2; uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU; uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; - pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); - vm = vm_create(pages); - - kvm_vm_elf_load(vm, program_invocation_name); - vm_create_irqchip(vm); - - return vm; + return vm_create_without_vcpus(VM_MODE_DEFAULT, pages); } static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) -- cgit v1.2.3 From 4acefa385c8233de756b450f87188ebadf3a3591 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 13:02:55 -0700 Subject: KVM: selftests: Use vm_create_without_vcpus() in dirty_log_test Use vm_create_without_vcpus() instead of open coding a rough equivalent. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 9dfc861a3cf3..13962d107948 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -674,11 +674,8 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = __vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); - kvm_vm_elf_load(vm, program_invocation_name); -#ifdef __x86_64__ - vm_create_irqchip(vm); -#endif + vm = vm_create_without_vcpus(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); + log_mode_create_vm_done(vm); vm_vcpu_add_default(vm, vcpuid, guest_code); return vm; -- cgit v1.2.3 From 3c16181b2652e4e99b478e2c0251aaa24a15e695 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 13:16:32 -0700 Subject: KVM: selftests: Use vm_create_without_vcpus() in hardware_disable_test Use vm_create_without_vcpus() instead of open coding a rough equivalent in hardware_disable_test. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/hardware_disable_test.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 81ba8645772a..76f51afa7ccc 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -104,9 +104,7 @@ static void run_test(uint32_t run) for (i = 0; i < VCPU_NUM; i++) CPU_SET(i, &cpu_set); - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); - kvm_vm_elf_load(vm, program_invocation_name); - vm_create_irqchip(vm); + vm = vm_create_without_vcpus(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { -- cgit v1.2.3 From 47b1e0ec2e1460b25674ba947723052919da7e67 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 13:16:39 -0700 Subject: KVM: selftests: Use vm_create_without_vcpus() in psci_test Use vm_create_without_vcpus() instead of open coding a rough equivalent in psci_test. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 1485d0b05b66..c9b82c0cc8d5 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -78,8 +78,7 @@ static struct kvm_vm *setup_vm(void *guest_code) struct kvm_vcpu_init init; struct kvm_vm *vm; - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); - kvm_vm_elf_load(vm, program_invocation_name); + vm = vm_create_without_vcpus(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); ucall_init(vm, NULL); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); -- cgit v1.2.3 From eb0adbc03aafdb3dd0740eaaeb26051fd5e35c20 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jun 2022 10:19:01 -0700 Subject: KVM: selftests: Avoid memory allocations when adding vCPU in get-reg-list Open code adding and doing setup for a vCPU in get-reg-list in order to avoid the stack allocation that comes with aarch64_vcpu_add_default(). get-reg-list doesn't need to run the vCPU, and so doesn't need the guest to be backed with memory. This will allow future cleanup to turn what is currently vm_create() into a barebones helper.
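For context, the default helper is assumed to have roughly the following shape (a sketch only; the stack_vaddr/stack_size/stack_vaddr_min names are illustrative). The guest stack allocation is the part get-reg-list cannot satisfy, since it creates the VM with no guest memory:

	/* aarch64_vcpu_add_default(), approximately: */
	stack_vaddr = vm_vaddr_alloc(vm, stack_size, stack_vaddr_min);	/* needs guest memory */
	vm_vcpu_add(vm, vcpuid);
	aarch64_vcpu_setup(vm, vcpuid, init);
	set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
	set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);

Open coding just vm_vcpu_add() + aarch64_vcpu_setup() skips the allocation entirely.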
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index ecfb773ec41e..b2d1d2eaf1d3 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -418,7 +418,8 @@ static void run_test(struct vcpu_config *c) vm = vm_create(DEFAULT_GUEST_PHY_PAGES); prepare_vcpu_init(c, &init); - aarch64_vcpu_add_default(vm, 0, &init, NULL); + vm_vcpu_add(vm, 0); + aarch64_vcpu_setup(vm, 0, &init); finalize_vcpu(vm, 0, c); reg_list = vcpu_get_reg_list(vm, 0); -- cgit v1.2.3 From 95fb0460719721962997d344bf9d63812c1dab67 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 16:26:06 -0700 Subject: KVM: selftests: Rename vm_create() => vm_create_barebones(), drop param Rename vm_create() to vm_create_barebones() and drop the @phys_pages param. Pass '0' for the number of pages even though some callers pass 'DEFAULT_GUEST_PHY_PAGES', as the intent behind creating truly barebones VMs is purely to create a VM, i.e. there aren't vCPUs, there's no guest code loaded, etc..., and so there is nothing that will ever need or consume guest memory. Freeing up the name vm_create() will allow using the name for an inner helper to the other VM creators, which need a "full" VM. Opportunistically rewrite the function comment for addr_gpa2alias() to focus on what the _function_ does, not what its _sole caller_ does. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 2 +- .../selftests/kvm/aarch64/vcpu_width_config.c | 6 ++--- .../testing/selftests/kvm/include/kvm_util_base.h | 6 ++++- .../testing/selftests/kvm/kvm_binary_stats_test.c | 2 +- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 29 +++------------------- .../testing/selftests/kvm/set_memory_region_test.c | 4 +-- .../selftests/kvm/x86_64/max_vcpuid_cap_test.c | 2 +- .../testing/selftests/kvm/x86_64/set_sregs_test.c | 2 +- .../selftests/kvm/x86_64/sev_migrate_tests.c | 8 +++--- 10 files changed, 23 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index b2d1d2eaf1d3..5476bb465b78 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -416,7 +416,7 @@ static void run_test(struct vcpu_config *c) check_supported(c); - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create_barebones(); prepare_vcpu_init(c, &init); vm_vcpu_add(vm, 0); aarch64_vcpu_setup(vm, 0, &init); diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index 4145c28a245a..1757f44dd3e2 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -24,7 +24,7 @@ static int add_init_2vcpus(struct kvm_vcpu_init *init1, struct kvm_vm *vm; int ret; - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create_barebones(); vm_vcpu_add(vm, 0); ret = __vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); @@ -49,7 +49,7 @@ static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init1, struct kvm_vm *vm; int ret; - vm =
vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create_barebones(); vm_vcpu_add(vm, 0); vm_vcpu_add(vm, 1); @@ -86,7 +86,7 @@ int main(void) } /* Get the preferred target type and copy that to init2 for later use */ - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create_barebones(); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init1); kvm_vm_free(vm); init2 = init1; diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c46c03750043..c119726ba018 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -248,7 +248,6 @@ void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages); -struct kvm_vm *vm_create(uint64_t phy_pages); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); @@ -596,6 +595,11 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, vm_paddr_t paddr_min, uint32_t memslot); vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); +static inline struct kvm_vm *vm_create_barebones(void) +{ + return __vm_create(VM_MODE_DEFAULT, 0); +} + /* * Create a VM with reasonable defaults * diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 0a27b0f85009..edeb08239036 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -221,7 +221,7 @@ int main(int argc, char *argv[]) vms = malloc(sizeof(vms[0]) * max_vm); TEST_ASSERT(vms, "Allocate memory for storing VM pointers"); for (i = 0; i < max_vm; ++i) { - vms[i] = vm_create(DEFAULT_GUEST_PHY_PAGES); + vms[i] = vm_create_barebones(); for (j = 0; j < max_vcpu; ++j) vm_vcpu_add(vms[i], j); } diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 9de5e1376c49..acc92703f563 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -28,7 +28,7 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus) pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n", num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create_barebones(); for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++) /* This asserts that the vCPU was created. */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8c0310094d73..50e50ed5f636 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -258,26 +258,6 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages) return vm; } -/* - * VM Create - * - * Input Args: - * phy_pages - Physical memory pages - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. - * - * Creates a VM with the default physical/virtual address widths and page size. - * When phy_pages is non-zero, a memory region of phy_pages physical pages - * is created and mapped starting at guest physical address 0. 
- */ -struct kvm_vm *vm_create(uint64_t phy_pages) -{ - return __vm_create(VM_MODE_DEFAULT, phy_pages); -} - struct kvm_vm *vm_create_without_vcpus(enum vm_guest_mode mode, uint64_t pages) { struct kvm_vm *vm; @@ -1421,11 +1401,10 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) * (without failing the test) if the guest memory is not shared (so * no alias exists). * - * When vm_create() and related functions are called with a shared memory - * src_type, we also create a writable, shared alias mapping of the - * underlying guest memory. This allows the host to manipulate guest memory - * without mapping that memory in the guest's address space. And, for - * userfaultfd-based demand paging, we can do so without triggering userfaults. + * Create a writable, shared virtual=>physical alias for the specific GPA. + * The primary use case is to allow the host selftest to manipulate guest + * memory without mapping said memory in the guest's address space. And, for + * userfaultfd-based demand paging, to do so without triggering userfaults. */ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) { diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index e66deb8ba7e0..c33402ba7587 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -314,7 +314,7 @@ static void test_zero_memory_regions(void) pr_info("Testing KVM_RUN with zero added memory regions\n"); - vm = vm_create(0); + vm = vm_create_barebones(); vm_vcpu_add(vm, VCPU_ID); vm_ioctl(vm, KVM_SET_NR_MMU_PAGES, (void *)64ul); @@ -353,7 +353,7 @@ static void test_add_max_memory_regions(void) "KVM_CAP_NR_MEMSLOTS should be greater than 0"); pr_info("Allowed number of memory slots: %i\n", max_mem_slots); - vm = vm_create(0); + vm = vm_create_barebones(); /* Check it can be added memory slots up to the maximum allowed */ pr_info("Adding slots 0..%i, each memory region with %dK size\n", diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c index 7211fd8d5d24..3cc4b86832fe 100644 --- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -16,7 +16,7 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; int ret; - vm = vm_create(0); + vm = vm_create_barebones(); /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 4dc7fd925023..f5e65db9f451 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -95,7 +95,7 @@ int main(int argc, char *argv[]) * use it to verify all supported CR4 bits can be set prior to defining * the vCPU model, i.e. without doing KVM_SET_CPUID2. 
*/ - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create_barebones(); vm_vcpu_add(vm, VCPU_ID); vcpu_sregs_get(vm, VCPU_ID, &sregs); diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index e814748bf7ba..245fd0755390 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -53,7 +53,7 @@ static struct kvm_vm *sev_vm_create(bool es) struct kvm_sev_launch_start start = { 0 }; int i; - vm = vm_create(0); + vm = vm_create_barebones(); sev_ioctl(vm->fd, es ? KVM_SEV_ES_INIT : KVM_SEV_INIT, NULL); for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) vm_vcpu_add(vm, i); @@ -70,7 +70,7 @@ static struct kvm_vm *aux_vm_create(bool with_vcpus) struct kvm_vm *vm; int i; - vm = vm_create(0); + vm = vm_create_barebones(); if (!with_vcpus) return vm; @@ -168,7 +168,7 @@ static void test_sev_migrate_parameters(void) *sev_es_vm_no_vmsa; int ret; - vm_no_vcpu = vm_create(0); + vm_no_vcpu = vm_create_barebones(); vm_no_sev = aux_vm_create(true); ret = __sev_migrate_from(vm_no_vcpu, vm_no_sev); TEST_ASSERT(ret == -1 && errno == EINVAL, @@ -180,7 +180,7 @@ static void test_sev_migrate_parameters(void) sev_vm = sev_vm_create(/* es= */ false); sev_es_vm = sev_vm_create(/* es= */ true); - sev_es_vm_no_vmsa = vm_create(0); + sev_es_vm_no_vmsa = vm_create_barebones(); sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); -- cgit v1.2.3 From cfe122db3ea6908d44f51c239fb1f1608e71522b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Apr 2022 14:16:34 -0700 Subject: KVM: selftests: Rename vm_create_without_vcpus() => vm_create() Rename vm_create_without_vcpus() to vm_create() so that it's not misconstrued as a helper that creates a VM that can never have vCPUs, as opposed to a helper that "just" creates a VM without vCPUs added at time zero.
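After the rename, the two "no vCPUs at creation time" flavors read as follows (a sketch based on the hunks below):

	/* VM with guest memory, vCPUs to be added later: */
	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES);

	/* VM that is never expected to run guest code: */
	vm = vm_create_barebones();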
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- tools/testing/selftests/kvm/hardware_disable_test.c | 2 +- tools/testing/selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 4 ++-- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 2 +- tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index c9b82c0cc8d5..ffa0cdc0ab3d 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -78,7 +78,7 @@ static struct kvm_vm *setup_vm(void *guest_code) struct kvm_vcpu_init init; struct kvm_vm *vm; - vm = vm_create_without_vcpus(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); ucall_init(vm, NULL); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 13962d107948..b921d0b45647 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -674,7 +674,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = vm_create_without_vcpus(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); + vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); log_mode_create_vm_done(vm); vm_vcpu_add_default(vm, vcpuid, guest_code); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 76f51afa7ccc..7e4df30dd1e8 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -104,7 +104,7 @@ static void run_test(uint32_t run) for (i = 0; i < VCPU_NUM; i++) CPU_SET(i, &cpu_set); - vm = vm_create_without_vcpus(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c119726ba018..b09ef551d61b 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -630,7 +630,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, uint32_t vcpuids[]); /* Create a default VM without any vcpus. 
*/ -struct kvm_vm *vm_create_without_vcpus(enum vm_guest_mode mode, uint64_t pages); +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t pages); /* * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 50e50ed5f636..d538811cf93d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -258,7 +258,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages) return vm; } -struct kvm_vm *vm_create_without_vcpus(enum vm_guest_mode mode, uint64_t pages) +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t pages) { struct kvm_vm *vm; @@ -323,7 +323,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, "nr_vcpus = %d too large for host, max-vcpus = %d", nr_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); - vm = vm_create_without_vcpus(mode, pages); + vm = vm_create(mode, pages); for (i = 0; i < nr_vcpus; ++i) { uint32_t vcpuid = vcpuids ? vcpuids[i] : i; diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 9ec6ba2ad89b..9146d62439fc 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -365,7 +365,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) if (!(r & KVM_PMU_CAP_DISABLE)) return; - vm = vm_create_without_vcpus(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 4c5775a8de6a..6bc13cf17220 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -86,7 +86,7 @@ static struct kvm_vm *create_vm(void) uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU; uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; - return vm_create_without_vcpus(VM_MODE_DEFAULT, pages); + return vm_create(VM_MODE_DEFAULT, pages); } static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) -- cgit v1.2.3 From 3f44e7fdca4eaeed3728d0b898ace5258f9ed474 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Apr 2022 14:21:38 -0700 Subject: KVM: selftests: Make vm_create() a wrapper that specifies VM_MODE_DEFAULT Add ____vm_create() to be the innermost helper, and turn vm_create() into a wrapper that specifies VM_MODE_DEFAULT. Most of the vm_create() callers just want the default mode, or more accurately, don't care about the mode.
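The resulting layering, from innermost to outermost (a sketch of the call chain established by the hunks below):

	vm = ____vm_create(mode, nr_pages);	/* KVM_CREATE_VM and little else */
	vm = __vm_create(mode, nr_pages);	/* + page adjust, ELF load, irqchip (x86) */
	vm = vm_create(nr_pages);		/* __vm_create(VM_MODE_DEFAULT, nr_pages) */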
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- tools/testing/selftests/kvm/hardware_disable_test.c | 2 +- tools/testing/selftests/kvm/include/kvm_util_base.h | 18 +++++++++++++----- tools/testing/selftests/kvm/lib/kvm_util.c | 16 ++++++++-------- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 2 +- tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 2 +- 7 files changed, 26 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index ffa0cdc0ab3d..fa4e6c3343d7 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -78,7 +78,7 @@ static struct kvm_vm *setup_vm(void *guest_code) struct kvm_vcpu_init init; struct kvm_vm *vm; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); ucall_init(vm, NULL); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index b921d0b45647..cf426a8ae816 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -674,7 +674,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); + vm = __vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); log_mode_create_vm_done(vm); vm_vcpu_add_default(vm, vcpuid, guest_code); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 7e4df30dd1e8..29f6ca51408f 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -104,7 +104,7 @@ static void run_test(uint32_t run) for (i = 0; i < VCPU_NUM; i++) CPU_SET(i, &cpu_set); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index b09ef551d61b..6418b1c04bc0 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -247,7 +247,6 @@ static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); @@ -595,9 +594,21 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, vm_paddr_t paddr_min, uint32_t memslot); vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); +/* + * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also + * loads the test binary into guest memory and creates an IRQ chip (x86 only). 
+ */ +struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages); +struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages); + static inline struct kvm_vm *vm_create_barebones(void) { - return __vm_create(VM_MODE_DEFAULT, 0); + return ____vm_create(VM_MODE_DEFAULT, 0); +} + +static inline struct kvm_vm *vm_create(uint64_t nr_pages) +{ + return __vm_create(VM_MODE_DEFAULT, nr_pages); } /* @@ -629,9 +640,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]); -/* Create a default VM without any vcpus. */ -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t pages); - /* * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages * additional pages of guest memory. Returns the VM and vCPU (via out param). diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index d538811cf93d..ea85e0e6688b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -149,12 +149,12 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages) +struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages) { struct kvm_vm *vm; pr_debug("%s: mode='%s' pages='%ld'\n", __func__, - vm_guest_mode_string(mode), phy_pages); + vm_guest_mode_string(mode), nr_pages); vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); @@ -251,20 +251,20 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t phy_pages) /* Allocate and setup memory for guest. */ vm->vpages_mapped = sparsebit_alloc(); - if (phy_pages != 0) + if (nr_pages != 0) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - 0, 0, phy_pages, 0); + 0, 0, nr_pages, 0); return vm; } -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t pages) +struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) { struct kvm_vm *vm; - pages = vm_adjust_num_guest_pages(mode, pages); + nr_pages = vm_adjust_num_guest_pages(mode, nr_pages); - vm = __vm_create(mode, pages); + vm = ____vm_create(mode, nr_pages); kvm_vm_elf_load(vm, program_invocation_name); @@ -323,7 +323,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, "nr_vcpus = %d too large for host, max-vcpus = %d", nr_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); - vm = vm_create(mode, pages); + vm = __vm_create(mode, pages); for (i = 0; i < nr_vcpus; ++i) { uint32_t vcpuid = vcpuids ? 
vcpuids[i] : i; diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 9146d62439fc..cdb9ce907c18 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -365,7 +365,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) if (!(r & KVM_PMU_CAP_DISABLE)) return; - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 6bc13cf17220..9ba3cd4e7f20 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -86,7 +86,7 @@ static struct kvm_vm *create_vm(void) uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU; uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; - return vm_create(VM_MODE_DEFAULT, pages); + return vm_create(pages); } static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) -- cgit v1.2.3 From 70ca149be61d9b09cc31c62e9fbe6faea4a79811 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 10:08:40 -0700 Subject: KVM: selftests: Rename xAPIC state test's vcpu struct Rename xapic_state_test's kvm_vcpu struct to xapic_vcpu to avoid a collision when the common 'struct vcpu' is renamed to 'struct kvm_vcpu' in a future patch. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/xapic_state_test.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 0792334ba243..9d8393b6ec75 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -11,7 +11,7 @@ #include "processor.h" #include "test_util.h" -struct kvm_vcpu { +struct xapic_vcpu { uint32_t id; bool is_x2apic; }; @@ -47,7 +47,7 @@ static void x2apic_guest_code(void) } while (1); } -static void ____test_icr(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t val) +static void ____test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu, uint64_t val) { struct kvm_lapic_state xapic; struct ucall uc; @@ -75,13 +75,13 @@ static void ____test_icr(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t val) ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } -static void __test_icr(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t val) +static void __test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu, uint64_t val) { ____test_icr(vm, vcpu, val | APIC_ICR_BUSY); ____test_icr(vm, vcpu, val & ~(u64)APIC_ICR_BUSY); } -static void test_icr(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +static void test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu) { uint64_t icr, i, j; @@ -116,7 +116,7 @@ static void test_icr(struct kvm_vm *vm, struct kvm_vcpu *vcpu) int main(int argc, char *argv[]) { - struct kvm_vcpu vcpu = { + struct xapic_vcpu vcpu = { .id = 0, .is_x2apic = true, }; -- cgit v1.2.3 From 1079c3d4e452a12f71f9ed076a37e5689f365153 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:26:01 -0800 Subject: KVM: selftests: Rename vcpu.state => vcpu.run Rename the "state" field of 'struct vcpu' to "run". 
KVM calls it "run", the struct name is "kvm_run", etc... Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 24 ++++++++-------------- tools/testing/selftests/kvm/lib/s390x/processor.c | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 6418b1c04bc0..b83c3327d0e4 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -48,7 +48,7 @@ struct vcpu { uint32_t id; int fd; struct kvm_vm *vm; - struct kvm_run *state; + struct kvm_run *run; struct kvm_dirty_gfn *dirty_gfns; uint32_t fetch_index; uint32_t dirty_gfns_count; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index ea85e0e6688b..74ab8f723288 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -514,7 +514,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu) vcpu->dirty_gfns = NULL; } - ret = munmap(vcpu->state, vcpu_mmap_sz()); + ret = munmap(vcpu->run, vcpu_mmap_sz()); TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); ret = close(vcpu->fd); @@ -1081,13 +1081,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) struct vcpu *vcpu; /* Confirm a vcpu with the specified id doesn't already exist. */ - vcpu = vcpu_find(vm, vcpuid); - if (vcpu != NULL) - TEST_FAIL("vcpu with the specified id " - "already exists,\n" - " requested vcpuid: %u\n" - " existing vcpuid: %u state: %p", - vcpuid, vcpu->id, vcpu->state); + TEST_ASSERT(!vcpu_find(vm, vcpuid), "vCPU%d already exists\n", vcpuid); /* Allocate and initialize new vcpu structure. */ vcpu = calloc(1, sizeof(*vcpu)); @@ -1098,12 +1092,12 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpuid); TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd)); - TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size " + TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", - vcpu_mmap_sz(), sizeof(*vcpu->state)); - vcpu->state = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), + vcpu_mmap_sz(), sizeof(*vcpu->run)); + vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); - TEST_ASSERT(vcpu->state != MAP_FAILED, + TEST_ASSERT(vcpu->run != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); /* Add to linked-list of VCPUs. 
*/ @@ -1460,7 +1454,7 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu = vcpu_get(vm, vcpuid); - return vcpu->state; + return vcpu->run; } /* @@ -1502,9 +1496,9 @@ void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) struct vcpu *vcpu = vcpu_get(vm, vcpuid); int ret; - vcpu->state->immediate_exit = 1; + vcpu->run->immediate_exit = 1; ret = __vcpu_run(vm, vcpuid); - vcpu->state->immediate_exit = 0; + vcpu->run->immediate_exit = 0; TEST_ASSERT(ret == -1 && errno == EINTR, "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 53c413932f64..df9d9650d916 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -210,7 +210,7 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) struct vcpu *vcpu = vcpu_get(vm, vcpuid); fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", - indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); + indent, "", vcpu->run->psw_mask, vcpu->run->psw_addr); } void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) -- cgit v1.2.3 From 0cc64b08096c71ba139e25759597c5df80ae422a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:24:21 -0800 Subject: KVM: selftests: Rename 'struct vcpu' to 'struct kvm_vcpu' Rename 'struct vcpu' to 'struct kvm_vcpu' to align with 'struct kvm_vm' in the selftest, and to give readers a hint that the struct is specific to KVM. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 11 ++++--- tools/testing/selftests/kvm/lib/kvm_util.c | 36 ++++++++++------------ tools/testing/selftests/kvm/lib/s390x/processor.c | 2 +- 3 files changed, 24 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index b83c3327d0e4..d2c7fb391fc7 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -16,6 +16,7 @@ #include #include "linux/rbtree.h" + #include #include "sparsebit.h" @@ -43,7 +44,7 @@ struct userspace_mem_region { struct hlist_node slot_node; }; -struct vcpu { +struct kvm_vcpu { struct list_head list; uint32_t id; int fd; @@ -92,7 +93,7 @@ struct kvm_vm { continue; \ else -struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); /* * Virtual Translation Tables Dump @@ -644,17 +645,17 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages * additional pages of guest memory. Returns the VM and vCPU (via out param). */ -struct kvm_vm *__vm_create_with_one_vcpu(struct vcpu **vcpu, +struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, uint64_t extra_mem_pages, void *guest_code); -static inline struct kvm_vm *vm_create_with_one_vcpu(struct vcpu **vcpu, +static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, void *guest_code) { return __vm_create_with_one_vcpu(vcpu, 0, guest_code); } -struct vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); /* * Adds a vCPU with reasonable defaults (e.g. 
a stack) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 74ab8f723288..bd1066f789df 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -349,7 +349,7 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, (uint32_t []){ vcpuid }); } -struct kvm_vm *__vm_create_with_one_vcpu(struct vcpu **vcpu, +struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, uint64_t extra_mem_pages, void *guest_code) { @@ -393,7 +393,7 @@ void kvm_vm_restart(struct kvm_vm *vmp) } } -struct vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) { kvm_vm_restart(vm); @@ -472,23 +472,23 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, return ®ion->region; } -static struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) +static struct kvm_vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpu_id) { - struct vcpu *vcpu; + struct kvm_vcpu *vcpu; list_for_each_entry(vcpu, &vm->vcpus, list) { - if (vcpu->id == vcpuid) + if (vcpu->id == vcpu_id) return vcpu; } return NULL; } -struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpu_id) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_vcpu *vcpu = vcpu_find(vm, vcpu_id); - TEST_ASSERT(vcpu, "vCPU %d does not exist", vcpuid); + TEST_ASSERT(vcpu, "vCPU %d does not exist", vcpu_id); return vcpu; } @@ -504,7 +504,7 @@ struct vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid) * * Removes a vCPU from a VM and frees its resources. */ -static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu) +static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { int ret; @@ -526,7 +526,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu) void kvm_vm_release(struct kvm_vm *vmp) { - struct vcpu *vcpu, *tmp; + struct kvm_vcpu *vcpu, *tmp; int ret; list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) @@ -1078,7 +1078,7 @@ static int vcpu_mmap_sz(void) */ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu; + struct kvm_vcpu *vcpu; /* Confirm a vcpu with the specified id doesn't already exist. 
*/ TEST_ASSERT(!vcpu_find(vm, vcpuid), "vCPU%d already exists\n", vcpuid); @@ -1452,7 +1452,7 @@ void vm_create_irqchip(struct kvm_vm *vm) */ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); + struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); return vcpu->run; } @@ -1493,7 +1493,7 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); + struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); int ret; vcpu->run->immediate_exit = 1; @@ -1537,7 +1537,7 @@ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, void *arg) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); + struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); return ioctl(vcpu->fd, cmd, arg); } @@ -1552,7 +1552,7 @@ void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); + struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); uint32_t size = vm->dirty_ring_size; TEST_ASSERT(size > 0, "Should enable dirty ring first"); @@ -1684,9 +1684,7 @@ void vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, uint64_t attr) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); - - return __kvm_has_device_attr(vcpu->fd, group, attr); + return __kvm_has_device_attr(vcpu_get(vm, vcpuid)->fd, group, attr); } /* @@ -1779,7 +1777,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int ctr; struct userspace_mem_region *region; - struct vcpu *vcpu; + struct kvm_vcpu *vcpu; fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode); fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index df9d9650d916..aec15ca9d887 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -207,7 +207,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { - struct vcpu *vcpu = vcpu_get(vm, vcpuid); + struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", indent, "", vcpu->run->psw_mask, vcpu->run->psw_addr); -- cgit v1.2.3 From e3763d3aebea261ac3eef24e94dc85be91209d0b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:16:45 -0800 Subject: KVM: selftests: Return the created vCPU from vm_vcpu_add() Return the created vCPU from vm_vcpu_add() so that callers don't need to manually retrieve the vCPU that was just added. Opportunistically drop the "heavy" function comment, it adds a lot of lines of "code" but not much value, e.g. it's pretty obvious that @vm is a virtual machine... 
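With the new return value, a caller can simply do (sketch):

	struct kvm_vcpu *vcpu = vm_vcpu_add(vm, 0);

	vcpu_run(vm, vcpu->id);

rather than calling vm_vcpu_add() and then fetching the same vCPU back via vcpu_get().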
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 27 ++++++++-------------- 2 files changed, 10 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index d2c7fb391fc7..fbc54e920383 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -324,7 +324,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bd1066f789df..f8274ca5fe5b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -397,8 +397,7 @@ struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) { kvm_vm_restart(vm); - vm_vcpu_add(vm, 0); - return vcpu_get(vm, 0); + return vm_vcpu_add(vm, 0); } /* @@ -1063,33 +1062,23 @@ static int vcpu_mmap_sz(void) } /* - * VM VCPU Add - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: None - * - * Adds a virtual CPU to the VM specified by vm with the ID given by vcpuid. - * No additional VCPU setup is done. + * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id. + * No additional vCPU setup is done. Returns the vCPU. */ -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) { struct kvm_vcpu *vcpu; /* Confirm a vcpu with the specified id doesn't already exist. */ - TEST_ASSERT(!vcpu_find(vm, vcpuid), "vCPU%d already exists\n", vcpuid); + TEST_ASSERT(!vcpu_find(vm, vcpu_id), "vCPU%d already exists\n", vcpu_id); /* Allocate and initialize new vcpu structure. */ vcpu = calloc(1, sizeof(*vcpu)); TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); vcpu->vm = vm; - vcpu->id = vcpuid; - vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpuid); + vcpu->id = vcpu_id; + vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id); TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd)); TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " @@ -1102,6 +1091,8 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) /* Add to linked-list of VCPUs. */ list_add(&vcpu->list, &vm->vcpus); + + return vcpu; } /* -- cgit v1.2.3 From e82e630ba965ad8a1278cf8bbe9759975a5a5109 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:34:31 -0800 Subject: KVM: selftests: Convert memslot_perf_test away from VCPU_ID Convert memslot_perf_test to use __vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. This is the first of many, many steps towards eliminating VCPU_ID from all KVM selftests, and towards eventually purging the VM+vcpu_id mess. 
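The conversion pattern, condensed (guest_code and the extra-page count stand in for whatever the individual test actually uses):

	/* Before: a global, arbitrary ID threaded through every call. */
	#define VCPU_ID 0
	vm = vm_create_default(VCPU_ID, extra_mem_pages, guest_code);
	vcpu_run(vm, VCPU_ID);

	/* After: the framework hands back the vCPU it created. */
	struct kvm_vcpu *vcpu;

	vm = __vm_create_with_one_vcpu(&vcpu, extra_mem_pages, guest_code);
	vcpu_run(vm, vcpu->id);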
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/memslot_perf_test.c | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 1727f75e0c2c..009eb19b28af 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -25,8 +25,6 @@ #include #include -#define VCPU_ID 0 - #define MEM_SIZE ((512U << 20) + 4096) #define MEM_SIZE_PAGES (MEM_SIZE / 4096) #define MEM_GPA 0x10000000UL @@ -90,6 +88,7 @@ static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, struct vm_data { struct kvm_vm *vm; + struct kvm_vcpu *vcpu; pthread_t vcpu_thread; uint32_t nslots; uint64_t npages; @@ -127,29 +126,29 @@ static bool verbose; pr_info(__VA_ARGS__); \ } while (0) -static void check_mmio_access(struct vm_data *vm, struct kvm_run *run) +static void check_mmio_access(struct vm_data *data, struct kvm_run *run) { - TEST_ASSERT(vm->mmio_ok, "Unexpected mmio exit"); + TEST_ASSERT(data->mmio_ok, "Unexpected mmio exit"); TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read"); TEST_ASSERT(run->mmio.len == 8, "Unexpected exit mmio size = %u", run->mmio.len); - TEST_ASSERT(run->mmio.phys_addr >= vm->mmio_gpa_min && - run->mmio.phys_addr <= vm->mmio_gpa_max, + TEST_ASSERT(run->mmio.phys_addr >= data->mmio_gpa_min && + run->mmio.phys_addr <= data->mmio_gpa_max, "Unexpected exit mmio address = 0x%llx", run->mmio.phys_addr); } -static void *vcpu_worker(void *data) +static void *vcpu_worker(void *__data) { - struct vm_data *vm = data; - struct kvm_run *run; + struct vm_data *data = __data; + struct kvm_vcpu *vcpu = data->vcpu; + struct kvm_run *run = vcpu->run; struct ucall uc; - run = vcpu_state(vm->vm, VCPU_ID); while (1) { - vcpu_run(vm->vm, VCPU_ID); + vcpu_run(data->vm, vcpu->id); - switch (get_ucall(vm->vm, VCPU_ID, &uc)) { + switch (get_ucall(data->vm, vcpu->id, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == 0, "Unexpected sync ucall, got %lx", @@ -158,7 +157,7 @@ static void *vcpu_worker(void *data) continue; case UCALL_NONE: if (run->exit_reason == KVM_EXIT_MMIO) - check_mmio_access(vm, run); + check_mmio_access(data, run); else goto done; break; @@ -238,6 +237,7 @@ static struct vm_data *alloc_vm(void) TEST_ASSERT(data, "malloc(vmdata) failed"); data->vm = NULL; + data->vcpu = NULL; data->hva_slots = NULL; return data; @@ -278,7 +278,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); - data->vm = vm_create_default(VCPU_ID, mempages, guest_code); + data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code); ucall_init(data->vm, NULL); pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", -- cgit v1.2.3 From 2494a6d80fb58c303b542331d3218ecd70cccea3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:36:46 -0800 Subject: KVM: selftests: Convert rseq_test away from VCPU_ID Convert rseq_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. 
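Since rseq_test needs no extra guest memory, it can use the wrapper that hardcodes zero extra pages (sketch):

	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
	/* equivalent to __vm_create_with_one_vcpu(&vcpu, 0, guest_code) */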
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/rseq_test.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 4158da0da2bb..fd754de0b74c 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -20,8 +20,6 @@ #include "processor.h" #include "test_util.h" -#define VCPU_ID 0 - static __thread volatile struct rseq __rseq = { .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, }; @@ -207,6 +205,7 @@ int main(int argc, char *argv[]) { int r, i, snapshot; struct kvm_vm *vm; + struct kvm_vcpu *vcpu; u32 cpu, rseq_cpu; /* Tell stdout not to buffer its content */ @@ -228,14 +227,14 @@ int main(int argc, char *argv[]) * GUEST_SYNC, while concurrently migrating the process by setting its * CPU affinity. */ - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); ucall_init(vm, NULL); pthread_create(&migration_thread, NULL, migration_worker, 0); for (i = 0; !done; i++) { - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC, + vcpu_run(vm, vcpu->id); + TEST_ASSERT(get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC, "Guest failed?"); /* -- cgit v1.2.3 From 58606e6025536a9ff3216df9cb2c6b7b7a1594be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:49:19 -0800 Subject: KVM: selftests: Convert xss_msr_test away from VCPU_ID Convert xss_msr_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index a6abcb559e7c..a89d49ae79a6 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -12,7 +12,6 @@ #include "kvm_util.h" #include "vmx.h" -#define VCPU_ID 1 #define MSR_BITS 64 #define X86_FEATURE_XSAVES (1<<3) @@ -23,11 +22,12 @@ int main(int argc, char *argv[]) bool xss_supported = false; bool xss_in_msr_list; struct kvm_vm *vm; + struct kvm_vcpu *vcpu; uint64_t xss_val; int i, r; /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, 0); + vm = vm_create_with_one_vcpu(&vcpu, NULL); if (kvm_get_cpuid_max_basic() >= 0xd) { entry = kvm_get_supported_cpuid_index(0xd, 1); @@ -38,11 +38,12 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - xss_val = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_XSS); + xss_val = vcpu_get_msr(vm, vcpu->id, MSR_IA32_XSS); TEST_ASSERT(xss_val == 0, "MSR_IA32_XSS should be initialized to zero\n"); - vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, xss_val); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_XSS, xss_val); + /* * At present, KVM only supports a guest IA32_XSS value of 0. Verify * that trying to set the guest IA32_XSS to an unsupported value fails. 
@@ -51,7 +52,7 @@ int main(int argc, char *argv[]) */ xss_in_msr_list = kvm_msr_is_in_save_restore_list(MSR_IA32_XSS); for (i = 0; i < MSR_BITS; ++i) { - r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i); + r = _vcpu_set_msr(vm, vcpu->id, MSR_IA32_XSS, 1ull << i); /* * Setting a list of MSRs returns the entry that "faulted", or -- cgit v1.2.3 From b1bc990406beab8690a20818e91d4ac522712bc7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:51:14 -0800 Subject: KVM: selftests: Convert vmx_preemption_timer_test away from VCPU_ID Convert vmx_preemption_timer_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run(), the test expects KVM_RUN to succeed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../kvm/x86_64/vmx_preemption_timer_test.c | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index f5b4ae914131..168adc5b2272 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -22,7 +22,6 @@ #include "processor.h" #include "vmx.h" -#define VCPU_ID 5 #define PREEMPTION_TIMER_VALUE 100000000ull #define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull @@ -159,6 +158,7 @@ int main(int argc, char *argv[]) struct kvm_regs regs1, regs2; struct kvm_vm *vm; struct kvm_run *run; + struct kvm_vcpu *vcpu; struct kvm_x86_state *state; struct ucall uc; int stage; @@ -175,22 +175,22 @@ int main(int argc, char *argv[]) } /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vm, vcpu->id, ®s1); vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); @@ -232,22 +232,22 @@ int main(int argc, char *argv[]) stage, uc.args[4], uc.args[5]); } - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vm, vcpu->id); memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vm, vcpu->id, ®s1); kvm_vm_release(vm); /* Restore state in a new VM. 
*/ - kvm_vm_restart(vm); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); + vcpu = vm_recreate_with_one_vcpu(vm); + + vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); + vcpu_load_state(vm, vcpu->id, state); + run = vcpu->run; kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); + vcpu_regs_get(vm, vcpu->id, ®s2); TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); -- cgit v1.2.3 From d8b5b5d1327175b0bd0d9c97434a307cded6b5dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:53:06 -0800 Subject: KVM: selftests: Convert vmx_pmu_msrs_test away from VCPU_ID Convert vmx_pmu_msrs_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 25 +++++++++++----------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index 97b7fd4a9a3d..63129ff5d003 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -17,8 +17,6 @@ #include "kvm_util.h" #include "vmx.h" -#define VCPU_ID 0 - #define X86_FEATURE_PDCM (1<<15) #define PMU_CAP_FW_WRITES (1ULL << 13) #define PMU_CAP_LBR_FMT 0x3f @@ -61,6 +59,7 @@ int main(int argc, char *argv[]) struct kvm_cpuid_entry2 *entry_a_0; bool pdcm_supported = false; struct kvm_vm *vm; + struct kvm_vcpu *vcpu; int ret; union cpuid10_eax eax; union perf_capabilities host_cap; @@ -69,7 +68,7 @@ int main(int argc, char *argv[]) host_cap.capabilities &= (PMU_CAP_FW_WRITES | PMU_CAP_LBR_FMT); /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); cpuid = kvm_get_supported_cpuid(); if (kvm_get_cpuid_max_basic() >= 0xa) { @@ -88,27 +87,27 @@ int main(int argc, char *argv[]) } /* testcase 1, set capabilities when we have PDCM bit */ - vcpu_set_cpuid(vm, VCPU_ID, cpuid); - vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); + vcpu_set_cpuid(vm, vcpu->id, cpuid); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); /* check capabilities can be retrieved with KVM_GET_MSR */ - ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); + ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); /* check whatever we write with KVM_SET_MSR is _not_ modified */ - vcpu_run(vm, VCPU_ID); - ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); + vcpu_run(vm, vcpu->id); + ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); /* testcase 2, check valid LBR formats are accepted */ - vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0); - ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, 0); + ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), 0); - vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format); - ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format); + 
vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format); + ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format); /* testcase 3, check invalid LBR format is rejected */ /* Note, on Arch LBR capable platforms, LBR_FMT in perf capability msr is 0x3f, * to avoid the failure, use a true invalid format 0x30 for the test. */ - ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0x30); + ret = _vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, 0x30); TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail."); printf("Completed perf capability tests.\n"); -- cgit v1.2.3 From 4bc87470858db1a329ad94e36f398fec97b54095 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 13:56:58 -0800 Subject: KVM: selftests: Convert vmx_set_nested_state_test away from VCPU_ID Convert vmx_set_nested_state_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../kvm/x86_64/vmx_set_nested_state_test.c | 86 +++++++++++----------- 1 file changed, 43 insertions(+), 43 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index af3b60eb35ec..de38f0e68153 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -23,38 +23,37 @@ * changes this should be updated. 
*/ #define VMCS12_REVISION 0x11e57ed0 -#define VCPU_ID 5 bool have_evmcs; -void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) +void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - vcpu_nested_state_set(vm, VCPU_ID, state); + vcpu_nested_state_set(vcpu->vm, vcpu->id, state); } -void test_nested_state_expect_errno(struct kvm_vm *vm, +void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, struct kvm_nested_state *state, int expected_errno) { int rv; - rv = __vcpu_nested_state_set(vm, VCPU_ID, state); + rv = __vcpu_nested_state_set(vcpu->vm, vcpu->id, state); TEST_ASSERT(rv == -1 && errno == expected_errno, "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", strerror(expected_errno), expected_errno, rv, strerror(errno), errno); } -void test_nested_state_expect_einval(struct kvm_vm *vm, +void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - test_nested_state_expect_errno(vm, state, EINVAL); + test_nested_state_expect_errno(vcpu, state, EINVAL); } -void test_nested_state_expect_efault(struct kvm_vm *vm, +void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - test_nested_state_expect_errno(vm, state, EFAULT); + test_nested_state_expect_errno(vcpu, state, EFAULT); } void set_revision_id_for_vmcs12(struct kvm_nested_state *state, @@ -86,7 +85,7 @@ void set_default_vmx_state(struct kvm_nested_state *state, int size) set_revision_id_for_vmcs12(state, VMCS12_REVISION); } -void test_vmx_nested_state(struct kvm_vm *vm) +void test_vmx_nested_state(struct kvm_vcpu *vcpu) { /* Add a page for VMCS12. */ const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); @@ -96,14 +95,14 @@ void test_vmx_nested_state(struct kvm_vm *vm) /* The format must be set to 0. 0 for VMX, 1 for SVM. */ set_default_vmx_state(state, state_sz); state->format = 1; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * We cannot virtualize anything if the guest does not have VMX * enabled. */ set_default_vmx_state(state, state_sz); - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * We cannot virtualize anything if the guest does not have VMX @@ -112,17 +111,17 @@ void test_vmx_nested_state(struct kvm_vm *vm) */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); state->hdr.vmx.vmcs12_pa = -1ull; state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); state->flags = 0; - test_nested_state(vm, state); + test_nested_state(vcpu, state); /* Enable VMX in the guest CPUID. 
*/ - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_set_cpuid(vcpu->vm, vcpu->id, kvm_get_supported_cpuid()); /* * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without @@ -133,34 +132,34 @@ void test_vmx_nested_state(struct kvm_vm *vm) set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; state->hdr.vmx.vmcs12_pa = -1ull; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); state->flags &= KVM_STATE_NESTED_EVMCS; if (have_evmcs) { - test_nested_state_expect_einval(vm, state); - vcpu_enable_evmcs(vm, VCPU_ID); + test_nested_state_expect_einval(vcpu, state); + vcpu_enable_evmcs(vcpu->vm, vcpu->id); } - test_nested_state(vm, state); + test_nested_state(vcpu, state); /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ state->hdr.vmx.smm.flags = 1; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* Invalid flags are rejected. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; state->flags = 0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* It is invalid to have vmxon_pa set to a non-page aligned address. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = 1; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and @@ -170,7 +169,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->flags = KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING; state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * It is invalid to have any of the SMM flags set besides: @@ -180,13 +179,13 @@ void test_vmx_nested_state(struct kvm_vm *vm) set_default_vmx_state(state, state_sz); state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON); - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* Outside SMM, SMM flags must be zero. */ set_default_vmx_state(state, state_sz); state->flags = 0; state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * Size must be large enough to fit kvm_nested_state and vmcs12 @@ -195,13 +194,13 @@ void test_vmx_nested_state(struct kvm_vm *vm) set_default_vmx_state(state, state_sz); state->size = sizeof(*state); state->flags = 0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); set_default_vmx_state(state, state_sz); state->size = sizeof(*state); state->flags = 0; state->hdr.vmx.vmcs12_pa = -1; - test_nested_state(vm, state); + test_nested_state(vcpu, state); /* * KVM_SET_NESTED_STATE succeeds with invalid VMCS @@ -209,7 +208,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) */ set_default_vmx_state(state, state_sz); state->flags = 0; - test_nested_state(vm, state); + test_nested_state(vcpu, state); /* Invalid flags are rejected, even if no VMCS loaded. 
*/ set_default_vmx_state(state, state_sz); @@ -217,13 +216,13 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->flags = 0; state->hdr.vmx.vmcs12_pa = -1; state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* vmxon_pa cannot be the same address as vmcs_pa. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = 0; state->hdr.vmx.vmcs12_pa = 0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * Test that if we leave nesting the state reflects that when we get @@ -233,8 +232,8 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->hdr.vmx.vmxon_pa = -1ull; state->hdr.vmx.vmcs12_pa = -1ull; state->flags = 0; - test_nested_state(vm, state); - vcpu_nested_state_get(vm, VCPU_ID, state); + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu->vm, vcpu->id, state); TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); @@ -244,7 +243,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) free(state); } -void disable_vmx(struct kvm_vm *vm) +void disable_vmx(struct kvm_vcpu *vcpu) { struct kvm_cpuid2 *cpuid = kvm_get_supported_cpuid(); int i; @@ -256,7 +255,7 @@ void disable_vmx(struct kvm_vm *vm) TEST_ASSERT(i != cpuid->nent, "CPUID function 1 not found"); cpuid->entries[i].ecx &= ~CPUID_VMX; - vcpu_set_cpuid(vm, VCPU_ID, cpuid); + vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); cpuid->entries[i].ecx |= CPUID_VMX; } @@ -264,6 +263,7 @@ int main(int argc, char *argv[]) { struct kvm_vm *vm; struct kvm_nested_state state; + struct kvm_vcpu *vcpu; have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); @@ -278,20 +278,20 @@ int main(int argc, char *argv[]) */ nested_vmx_check_supported(); - vm = vm_create_default(VCPU_ID, 0, 0); + vm = vm_create_with_one_vcpu(&vcpu, NULL); /* * First run tests with VMX disabled to check error handling. */ - disable_vmx(vm); + disable_vmx(vcpu); /* Passing a NULL kvm_nested_state causes a EFAULT. */ - test_nested_state_expect_efault(vm, NULL); + test_nested_state_expect_efault(vcpu, NULL); /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ set_default_state(&state); state.size = 0; - test_nested_state_expect_einval(vm, &state); + test_nested_state_expect_einval(vcpu, &state); /* * Setting the flags 0xf fails the flags check. The only flags that @@ -302,7 +302,7 @@ int main(int argc, char *argv[]) */ set_default_state(&state); state.flags = 0xf; - test_nested_state_expect_einval(vm, &state); + test_nested_state_expect_einval(vcpu, &state); /* * If KVM_STATE_NESTED_RUN_PENDING is set then @@ -310,9 +310,9 @@ int main(int argc, char *argv[]) */ set_default_state(&state); state.flags = KVM_STATE_NESTED_RUN_PENDING; - test_nested_state_expect_einval(vm, &state); + test_nested_state_expect_einval(vcpu, &state); - test_vmx_nested_state(vm); + test_vmx_nested_state(vcpu); kvm_vm_free(vm); return 0; -- cgit v1.2.3 From 5581ed8762fc9400047521be23844a27cf884b08 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:22:03 -0800 Subject: KVM: selftests: Convert vmx_tsc_adjust_test away from VCPU_ID Convert vmx_tsc_adjust_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. 
The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index 19b35c607dc6..29699d7c16c3 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -32,8 +32,6 @@ #define MSR_IA32_TSC_ADJUST 0x3b #endif -#define VCPU_ID 5 - #define TSC_ADJUST_VALUE (1ll << 32) #define TSC_OFFSET_VALUE -(1ll << 48) @@ -127,26 +125,27 @@ static void report(int64_t val) int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva; + struct kvm_vcpu *vcpu; nested_vmx_check_supported(); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); /* Allocate VMX pages and shared descriptors (vmx_pages). */ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ -- cgit v1.2.3 From 5478431f984ec593684f0dbb212e2c141ec20320 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:28:49 -0800 Subject: KVM: selftests: Convert mmu_role_test away from VCPU_ID Convert mmu_role_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run() plus an open coded assert that KVM_RUN succeeded. 
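The vcpu_run() cleanup is sketched here, per the hunks in the diff below: _vcpu_run() hands the raw KVM_RUN result back to callers that expect failure, whereas vcpu_run() asserts success itself.

    /* Before: open coded assert on the KVM_RUN return value. */
    r = _vcpu_run(vm, VCPU_ID);
    TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r);

    /* After: the helper asserts that KVM_RUN succeeded. */
    vcpu_run(vm, vcpu->id);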
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/mmu_role_test.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c index bdecd532f935..829a5cf94f5c 100644 --- a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c @@ -3,8 +3,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 1 - #define MMIO_GPA 0x100000000ull static void guest_code(void) @@ -25,22 +23,21 @@ static void guest_pf_handler(struct ex_regs *regs) static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) { u32 good_cpuid_val = *cpuid_reg; + struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; uint64_t cmd; - int r; /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; /* Map 1gb page without a backing memlot. */ __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, PG_LEVEL_1G); - r = _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); /* Guest access to the 1gb page should trigger MMIO. */ - TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r); TEST_ASSERT(run->exit_reason == KVM_EXIT_MMIO, "Unexpected exit reason: %u (%s), expected MMIO exit (1gb page w/o memslot)\n", run->exit_reason, exit_reason_str(run->exit_reason)); @@ -57,7 +54,7 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) * returns the struct that contains the entry being modified. Eww. */ *cpuid_reg = evil_cpuid_val; - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); /* * Add a dummy memslot to coerce KVM into bumping the MMIO generation. @@ -70,13 +67,12 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) /* Set up a #PF handler to eat the RSVD #PF and signal all done! */ vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, PF_VECTOR, guest_pf_handler); - r = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r); + vcpu_run(vm, vcpu->id); - cmd = get_ucall(vm, VCPU_ID, NULL); + cmd = get_ucall(vm, vcpu->id, NULL); TEST_ASSERT(cmd == UCALL_DONE, "Unexpected guest exit, exit_reason=%s, ucall.cmd = %lu\n", exit_reason_str(run->exit_reason), cmd); -- cgit v1.2.3 From a2d5d774919ed48680d17b6b82458d184f6519dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:39:45 -0800 Subject: KVM: selftests: Convert pmu_event_filter_test away from VCPU_ID Convert pmu_event_filter_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Rename run_vm_to_sync() to run_vcpu_to_sync() accordingly. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 75 +++++++++++----------- 1 file changed, 39 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index cdb9ce907c18..d2c571f20521 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -49,7 +49,6 @@ union cpuid10_ebx { /* Oddly, this isn't in perf_event.h. 
*/ #define ARCH_PERFMON_BRANCHES_RETIRED 5 -#define VCPU_ID 0 #define NUM_BRANCHES 42 /* @@ -173,17 +172,17 @@ static void amd_guest_code(void) * Run the VM to the next GUEST_SYNC(value), and return the value passed * to the sync. Any other exit from the guest is fatal. */ -static uint64_t run_vm_to_sync(struct kvm_vm *vm) +static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - get_ucall(vm, VCPU_ID, &uc); + get_ucall(vcpu->vm, vcpu->id, &uc); TEST_ASSERT(uc.cmd == UCALL_SYNC, "Received ucall other than UCALL_SYNC: %lu", uc.cmd); return uc.args[1]; @@ -197,13 +196,13 @@ static uint64_t run_vm_to_sync(struct kvm_vm *vm) * a sanity check and then GUEST_SYNC(success). In the case of failure, * the behavior of the guest on resumption is undefined. */ -static bool sanity_check_pmu(struct kvm_vm *vm) +static bool sanity_check_pmu(struct kvm_vcpu *vcpu) { bool success; - vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); - success = run_vm_to_sync(vm); - vm_install_exception_handler(vm, GP_VECTOR, NULL); + vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler); + success = run_vcpu_to_sync(vcpu); + vm_install_exception_handler(vcpu->vm, GP_VECTOR, NULL); return success; } @@ -264,9 +263,9 @@ static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f, return f; } -static void test_without_filter(struct kvm_vm *vm) +static void test_without_filter(struct kvm_vcpu *vcpu) { - uint64_t count = run_vm_to_sync(vm); + uint64_t count = run_vcpu_to_sync(vcpu); if (count != NUM_BRANCHES) pr_info("%s: Branch instructions retired = %lu (expected %u)\n", @@ -274,21 +273,21 @@ static void test_without_filter(struct kvm_vm *vm) TEST_ASSERT(count, "Allowed PMU event is not counting"); } -static uint64_t test_with_filter(struct kvm_vm *vm, +static uint64_t test_with_filter(struct kvm_vcpu *vcpu, struct kvm_pmu_event_filter *f) { - vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f); - return run_vm_to_sync(vm); + vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, (void *)f); + return run_vcpu_to_sync(vcpu); } -static void test_amd_deny_list(struct kvm_vm *vm) +static void test_amd_deny_list(struct kvm_vcpu *vcpu) { uint64_t event = EVENT(0x1C2, 0); struct kvm_pmu_event_filter *f; uint64_t count; f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY); - count = test_with_filter(vm, f); + count = test_with_filter(vcpu, f); free(f); if (count != NUM_BRANCHES) @@ -297,10 +296,10 @@ static void test_amd_deny_list(struct kvm_vm *vm) TEST_ASSERT(count, "Allowed PMU event is not counting"); } -static void test_member_deny_list(struct kvm_vm *vm) +static void test_member_deny_list(struct kvm_vcpu *vcpu) { struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); - uint64_t count = test_with_filter(vm, f); + uint64_t count = test_with_filter(vcpu, f); free(f); if (count) @@ -309,10 +308,10 @@ static void test_member_deny_list(struct kvm_vm *vm) TEST_ASSERT(!count, "Disallowed PMU Event is counting"); } -static void test_member_allow_list(struct kvm_vm *vm) +static void test_member_allow_list(struct kvm_vcpu *vcpu) { struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); - uint64_t count = test_with_filter(vm, f); + uint64_t count = test_with_filter(vcpu, 
f); free(f); if (count != NUM_BRANCHES) @@ -321,14 +320,14 @@ static void test_member_allow_list(struct kvm_vm *vm) TEST_ASSERT(count, "Allowed PMU event is not counting"); } -static void test_not_member_deny_list(struct kvm_vm *vm) +static void test_not_member_deny_list(struct kvm_vcpu *vcpu) { struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); uint64_t count; remove_event(f, INTEL_BR_RETIRED); remove_event(f, AMD_ZEN_BR_RETIRED); - count = test_with_filter(vm, f); + count = test_with_filter(vcpu, f); free(f); if (count != NUM_BRANCHES) pr_info("%s: Branch instructions retired = %lu (expected %u)\n", @@ -336,14 +335,14 @@ static void test_not_member_deny_list(struct kvm_vm *vm) TEST_ASSERT(count, "Allowed PMU event is not counting"); } -static void test_not_member_allow_list(struct kvm_vm *vm) +static void test_not_member_allow_list(struct kvm_vcpu *vcpu) { struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); uint64_t count; remove_event(f, INTEL_BR_RETIRED); remove_event(f, AMD_ZEN_BR_RETIRED); - count = test_with_filter(vm, f); + count = test_with_filter(vcpu, f); free(f); if (count) pr_info("%s: Branch instructions retired = %lu (expected 0)\n", @@ -358,6 +357,7 @@ static void test_not_member_allow_list(struct kvm_vm *vm) */ static void test_pmu_config_disable(void (*guest_code)(void)) { + struct kvm_vcpu *vcpu; int r; struct kvm_vm *vm; @@ -369,11 +369,13 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); - vm_vcpu_add_default(vm, VCPU_ID, guest_code); + vm_vcpu_add_default(vm, 0, guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); - TEST_ASSERT(!sanity_check_pmu(vm), + vcpu = vcpu_get(vm, 0); + vcpu_init_descriptor_tables(vm, vcpu->id); + + TEST_ASSERT(!sanity_check_pmu(vcpu), "Guest should not be able to use disabled PMU."); kvm_vm_free(vm); @@ -444,6 +446,7 @@ static bool use_amd_pmu(void) int main(int argc, char *argv[]) { void (*guest_code)(void) = NULL; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; int r; @@ -466,24 +469,24 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); - if (!sanity_check_pmu(vm)) { + if (!sanity_check_pmu(vcpu)) { print_skip("Guest PMU is not functional"); exit(KSFT_SKIP); } if (use_amd_pmu()) - test_amd_deny_list(vm); + test_amd_deny_list(vcpu); - test_without_filter(vm); - test_member_deny_list(vm); - test_member_allow_list(vm); - test_not_member_deny_list(vm); - test_not_member_allow_list(vm); + test_without_filter(vcpu); + test_member_deny_list(vcpu); + test_member_allow_list(vcpu); + test_not_member_deny_list(vcpu); + test_not_member_allow_list(vcpu); kvm_vm_free(vm); -- cgit v1.2.3 From 20092699759bd7f5861b9441eab5ea5c983c77a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:51:19 -0800 Subject: KVM: selftests: Convert smm_test away from VCPU_ID Convert smm_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. 
If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run(), the test expects KVM_RUN to succeed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/smm_test.c | 37 +++++++++++++-------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index dd2c1522ab90..36165b774a28 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -19,8 +19,6 @@ #include "vmx.h" #include "svm_util.h" -#define VCPU_ID 1 - #define SMRAM_SIZE 65536 #define SMRAM_MEMSLOT ((1 << 16) | 1) #define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE) @@ -116,22 +114,23 @@ static void guest_code(void *arg) sync_with_host(DONE); } -void inject_smi(struct kvm_vm *vm) +void inject_smi(struct kvm_vcpu *vcpu) { struct kvm_vcpu_events events; - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vcpu->vm, vcpu->id, &events); events.smi.pending = 1; events.flags |= KVM_VCPUEVENT_VALID_SMM; - vcpu_events_set(vm, VCPU_ID, &events); + vcpu_events_set(vcpu->vm, vcpu->id, &events); } int main(int argc, char *argv[]) { vm_vaddr_t nested_gva = 0; + struct kvm_vcpu *vcpu; struct kvm_regs regs; struct kvm_vm *vm; struct kvm_run *run; @@ -139,9 +138,9 @@ int main(int argc, char *argv[]) int stage, stage_reported; /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA, SMRAM_MEMSLOT, SMRAM_PAGES, 0); @@ -152,7 +151,7 @@ int main(int argc, char *argv[]) memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler, sizeof(smi_handler)); - vcpu_set_msr(vm, VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_SMBASE, SMRAM_GPA); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { if (nested_svm_supported()) @@ -164,17 +163,17 @@ int main(int argc, char *argv[]) if (!nested_gva) pr_info("will skip SMM test with VMX enabled\n"); - vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + vcpu_args_set(vm, vcpu->id, 1, nested_gva); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); memset(®s, 0, sizeof(regs)); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vm, vcpu->id, ®s); stage_reported = regs.rax & 0xff; @@ -191,7 +190,7 @@ int main(int argc, char *argv[]) * return from it. Do not perform save/restore while in SMM yet. */ if (stage == 8) { - inject_smi(vm); + inject_smi(vcpu); continue; } @@ -200,15 +199,15 @@ int main(int argc, char *argv[]) * during L2 execution. 
*/ if (stage == 10) - inject_smi(vm); + inject_smi(vcpu); - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vm, vcpu->id); kvm_vm_release(vm); - kvm_vm_restart(vm); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); + + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); + vcpu_load_state(vm, vcpu->id, state); + run = vcpu->run; kvm_x86_state_cleanup(state); } -- cgit v1.2.3 From 90b13cdde1fa6b71b4d6f8a5f6298f9e54233e98 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:53:16 -0800 Subject: KVM: selftests: Convert state_test away from VCPU_ID Convert state_test to use vm_create_with_one_vcpu() and vm_recreate_with_one_vcpu(), and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run(), the test expects KVM_RUN to succeed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/state_test.c | 29 ++++++++++++------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 41f7faaef2ac..b7869efad22a 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -20,7 +20,6 @@ #include "vmx.h" #include "svm_util.h" -#define VCPU_ID 5 #define L2_GUEST_STACK_SIZE 256 void svm_l2_guest_code(void) @@ -157,6 +156,7 @@ int main(int argc, char *argv[]) vm_vaddr_t nested_gva = 0; struct kvm_regs regs1, regs2; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct kvm_x86_state *state; @@ -164,10 +164,10 @@ int main(int argc, char *argv[]) int stage; /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vm, vcpu->id, ®s1); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { if (nested_svm_supported()) @@ -179,16 +179,16 @@ int main(int argc, char *argv[]) if (!nested_gva) pr_info("will skip nested state checks\n"); - vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + vcpu_args_set(vm, vcpu->id, 1, nested_gva); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); @@ -206,22 +206,21 @@ int main(int argc, char *argv[]) uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vm, vcpu->id); memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vm, vcpu->id, ®s1); kvm_vm_release(vm); /* Restore state in a new VM. 
*/ - kvm_vm_restart(vm); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); + vcpu_load_state(vm, vcpu->id, state); + run = vcpu->run; kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); + vcpu_regs_get(vm, vcpu->id, ®s2); TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); -- cgit v1.2.3 From cb4d9608af03ef289b57ceb925c6d8c2066d45d7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:55:26 -0800 Subject: KVM: selftests: Convert svm_int_ctl_test away from VCPU_ID Convert svm_int_ctl_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically make the "vm" variable a local function variable, there are no users outside of main(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/svm_int_ctl_test.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c index 30a81038df46..8e90e463895a 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -13,10 +13,6 @@ #include "svm_util.h" #include "apic.h" -#define VCPU_ID 0 - -static struct kvm_vm *vm; - bool vintr_irq_called; bool intr_irq_called; @@ -88,31 +84,34 @@ static void l1_guest_code(struct svm_test_data *svm) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; + struct kvm_run *run; vm_vaddr_t svm_gva; + struct kvm_vm *vm; + struct ucall uc; nested_svm_check_supported(); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + vcpu_args_set(vm, vcpu->id, 1, svm_gva); - struct kvm_run *run = vcpu_state(vm, VCPU_ID); - struct ucall uc; + run = vcpu->run; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); break; -- cgit v1.2.3 From 91520c5121561fd33eba8e381764ec64d2748eca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 15:56:54 -0800 Subject: KVM: selftests: Convert svm_vmcall_test away from VCPU_ID Convert svm_vmcall_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. 
If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c index be2ca157485b..15e389a7cd31 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -12,10 +12,6 @@ #include "processor.h" #include "svm_util.h" -#define VCPU_ID 5 - -static struct kvm_vm *vm; - static void l2_guest_code(struct svm_test_data *svm) { __asm__ __volatile__("vmcall"); @@ -39,26 +35,28 @@ static void l1_guest_code(struct svm_test_data *svm) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; vm_vaddr_t svm_gva; + struct kvm_vm *vm; nested_svm_check_supported(); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + vcpu_args_set(vm, vcpu->id, 1, svm_gva); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ -- cgit v1.2.3 From 0184323acbc460893025871ccfd1db04223a7477 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:04:25 -0800 Subject: KVM: selftests: Convert sync_regs_test away from VCPU_ID Convert sync_regs_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. 
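One recurring simplification, sketched from the diff below: the run structure is cached once from the vCPU object, so the error paths reset the sync-regs fields through that pointer instead of re-fetching vcpu_state().

    struct kvm_run *run = vcpu->run;    /* formerly vcpu_state(vm, VCPU_ID) */

    run->kvm_valid_regs = INVALID_SYNC_FIELD;
    rv = _vcpu_run(vm, vcpu->id);       /* expected to fail with EINVAL */
    run->kvm_valid_regs = 0;            /* reset via the cached pointer */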
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/sync_regs_test.c | 52 +++++++++++----------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index fc03a150278d..c971706b49f5 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -20,8 +20,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 5 - #define UCALL_PIO_PORT ((uint16_t)0x1000) struct ucall uc_none = { @@ -84,6 +82,7 @@ static void compare_vcpu_events(struct kvm_vcpu_events *left, int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct kvm_regs regs; @@ -104,57 +103,56 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; /* Request reading invalid register set from VCPU. */ run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; /* Request setting invalid register set into VCPU. */ run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; /* Request and verify all valid register sets. */ /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */ run->kvm_valid_regs = TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vm, vcpu->id, ®s); compare_regs(®s, &run->s.regs.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vm, vcpu->id, &sregs); compare_sregs(&sregs, &run->s.regs.sregs); - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vm, vcpu->id, &events); compare_vcpu_events(&events, &run->s.regs.events); /* Set and verify various register values. 
*/ @@ -164,7 +162,7 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -176,13 +174,13 @@ int main(int argc, char *argv[]) "apic_base sync regs value incorrect 0x%llx.", run->s.regs.sregs.apic_base); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vm, vcpu->id, ®s); compare_regs(®s, &run->s.regs.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vm, vcpu->id, &sregs); compare_sregs(&sregs, &run->s.regs.sregs); - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vm, vcpu->id, &events); compare_vcpu_events(&events, &run->s.regs.events); /* Clear kvm_dirty_regs bits, verify new s.regs values are @@ -191,7 +189,7 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = 0; run->s.regs.regs.rbx = 0xDEADBEEF; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -208,8 +206,8 @@ int main(int argc, char *argv[]) run->kvm_dirty_regs = 0; run->s.regs.regs.rbx = 0xAAAA; regs.rbx = 0xBAC0; - vcpu_regs_set(vm, VCPU_ID, ®s); - rv = _vcpu_run(vm, VCPU_ID); + vcpu_regs_set(vm, vcpu->id, ®s); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -217,7 +215,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vm, vcpu->id, ®s); TEST_ASSERT(regs.rbx == 0xBAC0 + 1, "rbx guest value incorrect 0x%llx.", regs.rbx); @@ -229,7 +227,7 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = 0; run->kvm_dirty_regs = TEST_SYNC_FIELDS; run->s.regs.regs.rbx = 0xBBBB; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -237,7 +235,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vm, vcpu->id, ®s); TEST_ASSERT(regs.rbx == 0xBBBB + 1, "rbx guest value incorrect 0x%llx.", regs.rbx); -- cgit v1.2.3 From 5c6e31b3bc4b526f872cca35a86169b3aa968259 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:08:48 -0800 Subject: KVM: selftests: Convert hyperv_cpuid away from VCPU_ID Convert hyperv_cpuid to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. 
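As the diff below shows, the old 'bool system' parameter is replaced by passing a NULL vCPU to select the system-scoped ioctl; roughly:

    /* vcpu == NULL requests the VM-wide flavor of the ioctl. */
    if (vcpu)
        ret = __vcpu_ioctl(vm, vcpu->id, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
    else
        ret = __kvm_ioctl(vm_get_kvm_fd(vm), KVM_GET_SUPPORTED_HV_CPUID, &cpuid);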
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 896e1e7c1df7..d1a22ee98cf3 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -20,8 +20,6 @@ #include "processor.h" #include "vmx.h" -#define VCPU_ID 0 - static void guest_code(void) { } @@ -115,25 +113,26 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, } } -void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system) +void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 cpuid = {.nent = 0}; int ret; - if (!system) - ret = __vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + if (vcpu) + ret = __vcpu_ioctl(vm, vcpu->id, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); else ret = __kvm_ioctl(vm_get_kvm_fd(vm), KVM_GET_SUPPORTED_HV_CPUID, &cpuid); TEST_ASSERT(ret == -1 && errno == E2BIG, "%s KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" - " it should have: %d %d", system ? "KVM" : "vCPU", ret, errno); + " it should have: %d %d", !vcpu ? "KVM" : "vCPU", ret, errno); } int main(int argc, char *argv[]) { struct kvm_vm *vm; struct kvm_cpuid2 *hv_cpuid_entries; + struct kvm_vcpu *vcpu; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); @@ -143,12 +142,12 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); /* Test vCPU ioctl version */ - test_hv_cpuid_e2big(vm, false); + test_hv_cpuid_e2big(vm, vcpu); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, vcpu->id); test_hv_cpuid(hv_cpuid_entries, false); free(hv_cpuid_entries); @@ -157,8 +156,8 @@ int main(int argc, char *argv[]) print_skip("Enlightened VMCS is unsupported"); goto do_sys; } - vcpu_enable_evmcs(vm, VCPU_ID); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID); + vcpu_enable_evmcs(vm, vcpu->id); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, vcpu->id); test_hv_cpuid(hv_cpuid_entries, true); free(hv_cpuid_entries); @@ -169,7 +168,7 @@ do_sys: goto out; } - test_hv_cpuid_e2big(vm, true); + test_hv_cpuid_e2big(vm, NULL); hv_cpuid_entries = kvm_get_supported_hv_cpuid(); test_hv_cpuid(hv_cpuid_entries, nested_vmx_supported()); -- cgit v1.2.3 From f323dbce3ba126f7b1dfeb7042c3b3037155aac2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:10:04 -0800 Subject: KVM: selftests: Convert kvm_pv_test away from VCPU_ID Convert kvm_pv_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded.
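A sketch of the reworked guest-entry loop (abridged from the diff below; the ucall dispatch is trimmed to one case):

    static void enter_guest(struct kvm_vcpu *vcpu)
    {
        struct kvm_run *run = vcpu->run;
        struct ucall uc;

        while (true) {
            vcpu_run(vcpu->vm, vcpu->id);    /* asserts KVM_RUN success */

            switch (get_ucall(vcpu->vm, vcpu->id, &uc)) {
            case UCALL_DONE:
                return;
            /* ... other ucall cases elided ... */
            }
        }
    }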
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 25 ++++++++++-------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 5eea3ac7958e..734e71739d33 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -171,24 +171,18 @@ static void handle_abort(struct ucall *uc) __FILE__, uc->args[1]); } -#define VCPU_ID 0 - -static void enter_guest(struct kvm_vm *vm) +static void enter_guest(struct kvm_vcpu *vcpu) { - struct kvm_run *run; + struct kvm_run *run = vcpu->run; struct ucall uc; - int r; - - run = vcpu_state(vm, VCPU_ID); while (true) { - r = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_PR_MSR: pr_msr(&uc); break; @@ -207,6 +201,7 @@ static void enter_guest(struct kvm_vm *vm) int main(void) { struct kvm_cpuid2 *best; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) { @@ -214,18 +209,18 @@ int main(void) exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, 0, guest_main); + vm = vm_create_with_one_vcpu(&vcpu, guest_main); - vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); + vcpu_enable_cap(vm, vcpu->id, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); best = kvm_get_supported_cpuid(); clear_kvm_cpuid_features(best); - vcpu_set_cpuid(vm, VCPU_ID, best); + vcpu_set_cpuid(vm, vcpu->id, best); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); - enter_guest(vm); + enter_guest(vcpu); kvm_vm_free(vm); } -- cgit v1.2.3 From 1cc1a9f38da4d6ce650eed03721d470924559b0f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:11:59 -0800 Subject: KVM: selftests: Convert platform_info_test away from VCPU_ID Convert platform_info_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. 
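The helpers now take only the vCPU and reach the VM through vcpu->vm; a trimmed sketch of one of them, per the diff below:

    static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu)
    {
        struct kvm_run *run = vcpu->run;

        /* The VM is reachable from the vCPU, so no 'vm' parameter. */
        vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, true);
        vcpu_run(vcpu->vm, vcpu->id);
        /* ... exit reason and ucall checks elided ... */
    }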
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/platform_info_test.c | 32 +++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index e79c04581ca8..eb5e1f972d76 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -21,7 +21,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 0 #define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 static void guest_code(void) @@ -35,18 +34,18 @@ static void guest_code(void) } } -static void test_msr_platform_info_enabled(struct kvm_vm *vm) +static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct ucall uc; - vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, true); - vcpu_run(vm, VCPU_ID); + vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, true); + vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - get_ucall(vm, VCPU_ID, &uc); + get_ucall(vcpu->vm, vcpu->id, &uc); TEST_ASSERT(uc.cmd == UCALL_SYNC, "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd); TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == @@ -55,12 +54,12 @@ static void test_msr_platform_info_enabled(struct kvm_vm *vm) MSR_PLATFORM_INFO_MAX_TURBO_RATIO); } -static void test_msr_platform_info_disabled(struct kvm_vm *vm) +static void test_msr_platform_info_disabled(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; - vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, false); - vcpu_run(vm, VCPU_ID); + vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, false); + vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", run->exit_reason, @@ -69,6 +68,7 @@ static void test_msr_platform_info_disabled(struct kvm_vm *vm) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; int rv; uint64_t msr_platform_info; @@ -82,14 +82,14 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); - msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); - vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, + msr_platform_info = vcpu_get_msr(vm, vcpu->id, MSR_PLATFORM_INFO); + vcpu_set_msr(vm, vcpu->id, MSR_PLATFORM_INFO, msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - test_msr_platform_info_enabled(vm); - test_msr_platform_info_disabled(vm); - vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); + test_msr_platform_info_enabled(vcpu); + test_msr_platform_info_disabled(vcpu); + vcpu_set_msr(vm, vcpu->id, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); -- cgit v1.2.3 From 6f96628f8290d2b634ba0dc5b83bd7201e099c52 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:23:43 -0800 Subject: KVM: selftests: Convert vmx_nested_tsc_scaling_test away from VCPU_ID Convert vmx_nested_tsc_scaling_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. 
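The TSC frequency plumbing picks up the same s/VCPU_ID/vcpu->id/ treatment; the relevant calls, as they appear in the diff below:

    tsc_khz = __vcpu_ioctl(vm, vcpu->id, KVM_GET_TSC_KHZ, NULL);
    TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed");

    /* Scale down L1's TSC frequency. */
    vcpu_ioctl(vm, vcpu->id, KVM_SET_TSC_KHZ,
               (void *) (tsc_khz / l1_scale_factor));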
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index c35ada9f7f9c..c9cb29f06244 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -15,9 +15,6 @@ #include "vmx.h" #include "kselftest.h" - -#define VCPU_ID 0 - /* L2 is scaled up (from L1's perspective) by this factor */ #define L2_SCALE_FACTOR 4ULL @@ -150,6 +147,7 @@ skip_test: int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; vm_vaddr_t vmx_pages_gva; @@ -182,28 +180,28 @@ int main(int argc, char *argv[]) l0_tsc_freq = tsc_end - tsc_start; printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); - tsc_khz = __vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); + tsc_khz = __vcpu_ioctl(vm, vcpu->id, KVM_GET_TSC_KHZ, NULL); TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); /* scale down L1's TSC frequency */ - vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ, + vcpu_ioctl(vm, vcpu->id, KVM_SET_TSC_KHZ, (void *) (tsc_khz / l1_scale_factor)); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *) uc.args[0]); case UCALL_SYNC: -- cgit v1.2.3 From d31e15005dde30f5e25308ee01b6f518b5cadecb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:24:50 -0800 Subject: KVM: selftests: Convert set_sregs_test away from VCPU_ID Convert set_sregs_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. 
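On the barebones path the vCPU object now comes straight out of vm_vcpu_add(); a rough fragment of the resulting flow (matching the hunks below):

      vm = vm_create_barebones();
      vcpu = vm_vcpu_add(vm, 0);     /* vcpu_id == 0, previously 5 */

      vcpu_sregs_get(vm, vcpu->id, &sregs);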
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/set_sregs_test.c | 45 +++++++++++----------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index f5e65db9f451..8a5c1f76287c 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -22,9 +22,7 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 5 - -static void test_cr4_feature_bit(struct kvm_vm *vm, struct kvm_sregs *orig, +static void test_cr4_feature_bit(struct kvm_vcpu *vcpu, struct kvm_sregs *orig, uint64_t feature_bit) { struct kvm_sregs sregs; @@ -37,11 +35,11 @@ static void test_cr4_feature_bit(struct kvm_vm *vm, struct kvm_sregs *orig, memcpy(&sregs, orig, sizeof(sregs)); sregs.cr4 |= feature_bit; - rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + rc = _vcpu_sregs_set(vcpu->vm, vcpu->id, &sregs); TEST_ASSERT(rc, "KVM allowed unsupported CR4 bit (0x%lx)", feature_bit); /* Sanity check that KVM didn't change anything. */ - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); TEST_ASSERT(!memcmp(&sregs, orig, sizeof(sregs)), "KVM modified sregs"); } @@ -83,6 +81,7 @@ static uint64_t calc_cr4_feature_bits(struct kvm_vm *vm) int main(int argc, char *argv[]) { struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t cr4; int rc; @@ -96,43 +95,43 @@ int main(int argc, char *argv[]) * the vCPU model, i.e. without doing KVM_SET_CPUID2. */ vm = vm_create_barebones(); - vm_vcpu_add(vm, VCPU_ID); + vcpu = vm_vcpu_add(vm, 0); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vm, vcpu->id, &sregs); sregs.cr4 |= calc_cr4_feature_bits(vm); cr4 = sregs.cr4; - rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + rc = _vcpu_sregs_set(vm, vcpu->id, &sregs); TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vm, vcpu->id, &sregs); TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", sregs.cr4, cr4); /* Verify all unsupported features are rejected by KVM. */ - test_cr4_feature_bit(vm, &sregs, X86_CR4_UMIP); - test_cr4_feature_bit(vm, &sregs, X86_CR4_LA57); - test_cr4_feature_bit(vm, &sregs, X86_CR4_VMXE); - test_cr4_feature_bit(vm, &sregs, X86_CR4_SMXE); - test_cr4_feature_bit(vm, &sregs, X86_CR4_FSGSBASE); - test_cr4_feature_bit(vm, &sregs, X86_CR4_PCIDE); - test_cr4_feature_bit(vm, &sregs, X86_CR4_OSXSAVE); - test_cr4_feature_bit(vm, &sregs, X86_CR4_SMEP); - test_cr4_feature_bit(vm, &sregs, X86_CR4_SMAP); - test_cr4_feature_bit(vm, &sregs, X86_CR4_PKE); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_UMIP); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_LA57); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_VMXE); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMXE); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_FSGSBASE); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_PCIDE); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_OSXSAVE); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMEP); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMAP); + test_cr4_feature_bit(vcpu, &sregs, X86_CR4_PKE); kvm_vm_free(vm); /* Create a "real" VM and verify APIC_BASE can be set. 
*/ - vm = vm_create_default(VCPU_ID, 0, NULL); + vm = vm_create_with_one_vcpu(&vcpu, NULL); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vm, vcpu->id, &sregs); sregs.apic_base = 1 << 10; - rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + rc = _vcpu_sregs_set(vm, vcpu->id, &sregs); TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)", sregs.apic_base); sregs.apic_base = 1 << 11; - rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + rc = _vcpu_sregs_set(vm, vcpu->id, &sregs); TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)", sregs.apic_base); -- cgit v1.2.3 From ec7b769a732052e6c8f73db35fd8ee35d1368a4b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:26:03 -0800 Subject: KVM: selftests: Convert vmx_dirty_log_test away from VCPU_ID Convert vmx_dirty_log_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 68f26a8b4f42..fb8c7f7236f7 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -17,8 +17,6 @@ #include "processor.h" #include "vmx.h" -#define VCPU_ID 1 - /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 #define TEST_MEM_PAGES 3 @@ -73,6 +71,7 @@ int main(int argc, char *argv[]) unsigned long *bmap; uint64_t *host_test_mem; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct ucall uc; @@ -81,10 +80,10 @@ int main(int argc, char *argv[]) nested_vmx_check_supported(); /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); - run = vcpu_state(vm, VCPU_ID); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + run = vcpu->run; /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -116,13 +115,13 @@ int main(int argc, char *argv[]) while (!done) { memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); - _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); -- cgit v1.2.3 From 706aaa4fedd9b028a3238221a8aba499b74d8f93 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:26:58 -0800 Subject: KVM: selftests: Convert vmx_close_while_nested_test away from VCPU_ID Convert vmx_close_while_nested_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. 
Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically make the "vm" variable local, it is unused outside of main(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/vmx_close_while_nested_test.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index edac8839e717..da0363076fba 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -18,15 +18,10 @@ #include "kselftest.h" -#define VCPU_ID 5 - enum { PORT_L0_EXIT = 0x2000, }; -/* The virtual machine object. */ -static struct kvm_vm *vm; - static void l2_guest_code(void) { /* Exit to L0 */ @@ -53,20 +48,22 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; nested_vmx_check_supported(); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); /* Allocate VMX pages and shared descriptors (vmx_pages). */ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, @@ -75,7 +72,7 @@ int main(int argc, char *argv[]) if (run->io.port == PORT_L0_EXIT) break; - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ -- cgit v1.2.3 From 21c602e671755765cde92dd5f07125c6ba8b8d03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:27:47 -0800 Subject: KVM: selftests: Convert vmx_apic_access_test away from VCPU_ID Convert vmx_apic_access_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically make the "vm" variable local, it is unused outside of main(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/vmx_apic_access_test.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index d438c4d3228a..10f9c86029e6 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -28,11 +28,6 @@ #include "kselftest.h" -#define VCPU_ID 0 - -/* The virtual machine object. 
*/ -static struct kvm_vm *vm; - static void l2_guest_code(void) { /* Exit to L1 */ @@ -84,9 +79,12 @@ int main(int argc, char *argv[]) struct vmx_pages *vmx; bool done = false; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + nested_vmx_check_supported(); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); kvm_get_cpu_address_width(&paddr_width, &vaddr_width); high_gpa = (1ul << paddr_width) - getpagesize(); @@ -97,13 +95,13 @@ int main(int argc, char *argv[]) vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); prepare_virtualize_apic_accesses(vmx, vm); - vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa); + vcpu_args_set(vm, vcpu->id, 2, vmx_pages_gva, high_gpa); while (!done) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); if (apic_access_addr == high_gpa) { TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, @@ -121,7 +119,7 @@ int main(int argc, char *argv[]) run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); -- cgit v1.2.3 From b4694260299ac4312a5a2d94a47957c1b6d55d00 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:38:01 -0800 Subject: KVM: selftests: Convert userspace_msr_exit_test away from VCPU_ID Convert userspace_msr_exit_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded. Fix minor coding style violations too. 
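The open-coded KVM_RUN check collapses into the wrapper; sketched below (this assumes vcpu_run() asserts success internally, which is what makes the explicit TEST_ASSERT redundant):

      /* Before */
      rc = _vcpu_run(vm, VCPU_ID);
      TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);

      /* After */
      vcpu_run(vcpu->vm, vcpu->id);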
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 156 ++++++++++----------- 1 file changed, 72 insertions(+), 84 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index 23e9292580c9..a0d35e578b25 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -17,7 +17,6 @@ #define KVM_FEP_LENGTH 5 static int fep_available = 1; -#define VCPU_ID 1 #define MSR_NON_EXISTENT 0x474f4f00 static u64 deny_bits = 0; @@ -395,31 +394,22 @@ static void guest_ud_handler(struct ex_regs *regs) regs->rip += KVM_FEP_LENGTH; } -static void run_guest(struct kvm_vm *vm) +static void check_for_guest_assert(struct kvm_vcpu *vcpu) { - int rc; - - rc = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); -} - -static void check_for_guest_assert(struct kvm_vm *vm) -{ - struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; - if (run->exit_reason == KVM_EXIT_IO && - get_ucall(vm, VCPU_ID, &uc) == UCALL_ABORT) { - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + if (vcpu->run->exit_reason == KVM_EXIT_IO && + get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) { + TEST_FAIL("%s at %s:%ld", + (const char *)uc.args[0], __FILE__, uc.args[1]); } } -static void process_rdmsr(struct kvm_vm *vm, uint32_t msr_index) +static void process_rdmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; - check_for_guest_assert(vm); + check_for_guest_assert(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_X86_RDMSR, "Unexpected exit reason: %u (%s),\n", @@ -450,11 +440,11 @@ static void process_rdmsr(struct kvm_vm *vm, uint32_t msr_index) } } -static void process_wrmsr(struct kvm_vm *vm, uint32_t msr_index) +static void process_wrmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; - check_for_guest_assert(vm); + check_for_guest_assert(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_X86_WRMSR, "Unexpected exit reason: %u (%s),\n", @@ -481,43 +471,43 @@ static void process_wrmsr(struct kvm_vm *vm, uint32_t msr_index) } } -static void process_ucall_done(struct kvm_vm *vm) +static void process_ucall_done(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct ucall uc; - check_for_guest_assert(vm); + check_for_guest_assert(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - TEST_ASSERT(get_ucall(vm, VCPU_ID, &uc) == UCALL_DONE, + TEST_ASSERT(get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_DONE, "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", uc.cmd, UCALL_DONE); } -static uint64_t process_ucall(struct kvm_vm *vm) +static uint64_t process_ucall(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct ucall uc = {}; - check_for_guest_assert(vm); + check_for_guest_assert(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: break; case UCALL_ABORT: - 
check_for_guest_assert(vm); + check_for_guest_assert(vcpu); break; case UCALL_DONE: - process_ucall_done(vm); + process_ucall_done(vcpu); break; default: TEST_ASSERT(false, "Unexpected ucall"); @@ -526,38 +516,39 @@ static uint64_t process_ucall(struct kvm_vm *vm) return uc.cmd; } -static void run_guest_then_process_rdmsr(struct kvm_vm *vm, uint32_t msr_index) +static void run_guest_then_process_rdmsr(struct kvm_vcpu *vcpu, + uint32_t msr_index) { - run_guest(vm); - process_rdmsr(vm, msr_index); + vcpu_run(vcpu->vm, vcpu->id); + process_rdmsr(vcpu, msr_index); } -static void run_guest_then_process_wrmsr(struct kvm_vm *vm, uint32_t msr_index) +static void run_guest_then_process_wrmsr(struct kvm_vcpu *vcpu, + uint32_t msr_index) { - run_guest(vm); - process_wrmsr(vm, msr_index); + vcpu_run(vcpu->vm, vcpu->id); + process_wrmsr(vcpu, msr_index); } -static uint64_t run_guest_then_process_ucall(struct kvm_vm *vm) +static uint64_t run_guest_then_process_ucall(struct kvm_vcpu *vcpu) { - run_guest(vm); - return process_ucall(vm); + vcpu_run(vcpu->vm, vcpu->id); + return process_ucall(vcpu); } -static void run_guest_then_process_ucall_done(struct kvm_vm *vm) +static void run_guest_then_process_ucall_done(struct kvm_vcpu *vcpu) { - run_guest(vm); - process_ucall_done(vm); + vcpu_run(vcpu->vm, vcpu->id); + process_ucall_done(vcpu); } static void test_msr_filter_allow(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; int rc; - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code_filter_allow); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(&vcpu, guest_code_filter_allow); rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); @@ -569,43 +560,43 @@ static void test_msr_filter_allow(void) vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); /* Process guest code userspace exits. */ - run_guest_then_process_rdmsr(vm, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vm, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vm, MSR_IA32_XSS); + run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); - run_guest_then_process_rdmsr(vm, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vm, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vm, MSR_IA32_FLUSH_CMD); + run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vm, MSR_NON_EXISTENT); - run_guest_then_process_rdmsr(vm, MSR_NON_EXISTENT); + run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); + run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); - run_guest(vm); + vcpu_run(vm, vcpu->id); vm_install_exception_handler(vm, UD_VECTOR, NULL); - if (process_ucall(vm) != UCALL_DONE) { + if (process_ucall(vcpu) != UCALL_DONE) { vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); /* Process emulated rdmsr and wrmsr instructions. 
*/ - run_guest_then_process_rdmsr(vm, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vm, MSR_IA32_XSS); - run_guest_then_process_wrmsr(vm, MSR_IA32_XSS); + run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); - run_guest_then_process_rdmsr(vm, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vm, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vm, MSR_IA32_FLUSH_CMD); + run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); - run_guest_then_process_wrmsr(vm, MSR_NON_EXISTENT); - run_guest_then_process_rdmsr(vm, MSR_NON_EXISTENT); + run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); + run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); /* Confirm the guest completed without issues. */ - run_guest_then_process_ucall_done(vm); + run_guest_then_process_ucall_done(vcpu); } else { printf("To run the instruction emulated tests set the module parameter 'kvm.force_emulation_prefix=1'\n"); } @@ -613,16 +604,16 @@ static void test_msr_filter_allow(void) kvm_vm_free(vm); } -static int handle_ucall(struct kvm_vm *vm) +static int handle_ucall(struct kvm_vcpu *vcpu) { struct ucall uc; - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("Guest assertion not met"); break; case UCALL_SYNC: - vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter_deny); + vm_ioctl(vcpu->vm, KVM_X86_SET_MSR_FILTER, &no_filter_deny); break; case UCALL_DONE: return 1; @@ -672,14 +663,13 @@ static void handle_wrmsr(struct kvm_run *run) static void test_msr_filter_deny(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; int rc; - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code_filter_deny); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code_filter_deny); + run = vcpu->run; rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); @@ -694,9 +684,7 @@ static void test_msr_filter_deny(void) vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_deny); while (1) { - rc = _vcpu_run(vm, VCPU_ID); - - TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); + vcpu_run(vm, vcpu->id); switch (run->exit_reason) { case KVM_EXIT_X86_RDMSR: @@ -706,7 +694,7 @@ static void test_msr_filter_deny(void) handle_wrmsr(run); break; case KVM_EXIT_IO: - if (handle_ucall(vm)) + if (handle_ucall(vcpu)) goto done; break; } @@ -722,12 +710,11 @@ done: static void test_msr_permission_bitmap(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; int rc; - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code_permission_bitmap); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(&vcpu, guest_code_permission_bitmap); rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); @@ -737,11 +724,12 @@ static void test_msr_permission_bitmap(void) TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_fs); - run_guest_then_process_rdmsr(vm, MSR_FS_BASE); - TEST_ASSERT(run_guest_then_process_ucall(vm) == UCALL_SYNC, "Expected ucall state to be UCALL_SYNC."); + run_guest_then_process_rdmsr(vcpu, MSR_FS_BASE); + TEST_ASSERT(run_guest_then_process_ucall(vcpu) == UCALL_SYNC, + "Expected ucall 
state to be UCALL_SYNC."); vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs); - run_guest_then_process_rdmsr(vm, MSR_GS_BASE); - run_guest_then_process_ucall_done(vm); + run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE); + run_guest_then_process_ucall_done(vcpu); kvm_vm_free(vm); } -- cgit v1.2.3 From 709fd88491a819382c0f47e813628f2650497b4d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:44:36 -0800 Subject: KVM: selftests: Convert vmx_exception_with_invalid_guest_state away from VCPU_ID Convert vmx_exception_with_invalid_guest_state to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../vmx_exception_with_invalid_guest_state.c | 62 +++++++++++++--------- 1 file changed, 36 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c index 27a850f3d7ce..70b30583e50d 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -10,10 +10,6 @@ #include "kselftest.h" -#define VCPU_ID 0 - -static struct kvm_vm *vm; - static void guest_ud_handler(struct ex_regs *regs) { /* Loop on the ud2 until guest state is made invalid. */ @@ -24,11 +20,11 @@ static void guest_code(void) asm volatile("ud2"); } -static void __run_vcpu_with_invalid_state(void) +static void __run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; - vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, "Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n", @@ -38,15 +34,15 @@ static void __run_vcpu_with_invalid_state(void) run->emulation_failure.suberror); } -static void run_vcpu_with_invalid_state(void) +static void run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) { /* * Always run twice to verify KVM handles the case where _KVM_ queues * an exception with invalid state and then exits to userspace, i.e. * that KVM doesn't explode if userspace ignores the initial error. 
*/ - __run_vcpu_with_invalid_state(); - __run_vcpu_with_invalid_state(); + __run_vcpu_with_invalid_state(vcpu); + __run_vcpu_with_invalid_state(vcpu); } static void set_timer(void) @@ -59,33 +55,43 @@ static void set_timer(void) ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0); } -static void set_or_clear_invalid_guest_state(bool set) +static void set_or_clear_invalid_guest_state(struct kvm_vcpu *vcpu, bool set) { static struct kvm_sregs sregs; if (!sregs.cr0) - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); sregs.tr.unusable = !!set; - vcpu_sregs_set(vm, VCPU_ID, &sregs); + vcpu_sregs_set(vcpu->vm, vcpu->id, &sregs); } -static void set_invalid_guest_state(void) +static void set_invalid_guest_state(struct kvm_vcpu *vcpu) { - set_or_clear_invalid_guest_state(true); + set_or_clear_invalid_guest_state(vcpu, true); } -static void clear_invalid_guest_state(void) +static void clear_invalid_guest_state(struct kvm_vcpu *vcpu) { - set_or_clear_invalid_guest_state(false); + set_or_clear_invalid_guest_state(vcpu, false); +} + +static struct kvm_vcpu *get_set_sigalrm_vcpu(struct kvm_vcpu *__vcpu) +{ + static struct kvm_vcpu *vcpu = NULL; + + if (__vcpu) + vcpu = __vcpu; + return vcpu; } static void sigalrm_handler(int sig) { + struct kvm_vcpu *vcpu = get_set_sigalrm_vcpu(NULL); struct kvm_vcpu_events events; TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vcpu->vm, vcpu->id, &events); /* * If an exception is pending, attempt KVM_RUN with invalid guest, @@ -93,8 +99,8 @@ static void sigalrm_handler(int sig) * between KVM queueing an exception and re-entering the guest. */ if (events.exception.pending) { - set_invalid_guest_state(); - run_vcpu_with_invalid_state(); + set_invalid_guest_state(vcpu); + run_vcpu_with_invalid_state(vcpu); } else { set_timer(); } @@ -102,15 +108,19 @@ static void sigalrm_handler(int sig) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) { print_skip("Must be run with kvm_intel.unrestricted_guest=0"); exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, 0, (void *)guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + get_set_sigalrm_vcpu(vcpu); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); @@ -119,8 +129,8 @@ int main(int argc, char *argv[]) * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support * emulating invalid guest state for L2. */ - set_invalid_guest_state(); - run_vcpu_with_invalid_state(); + set_invalid_guest_state(vcpu); + run_vcpu_with_invalid_state(vcpu); /* * Verify KVM also handles the case where userspace gains control while @@ -129,11 +139,11 @@ int main(int argc, char *argv[]) * guest with invalid state when the handler interrupts KVM with an * exception pending. 
*/ - clear_invalid_guest_state(); + clear_invalid_guest_state(vcpu); TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR, "Failed to register SIGALRM handler, errno = %d (%s)", errno, strerror(errno)); set_timer(); - run_vcpu_with_invalid_state(); + run_vcpu_with_invalid_state(vcpu); } -- cgit v1.2.3 From f7024348d7ea2bea0be3dd82703b15f3dac9590b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:45:15 -0800 Subject: KVM: selftests: Convert tsc_msrs_test away from VCPU_ID Convert tsc_msrs_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 35 ++++++++++------------ 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c index a426078b16a3..3b7bf660eced 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -9,14 +9,12 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 0 - #define UNITY (1ull << 30) #define HOST_ADJUST (UNITY * 64) #define GUEST_STEP (UNITY * 4) #define ROUND(x) ((x + UNITY / 2) & -UNITY) #define rounded_rdmsr(x) ROUND(rdmsr(x)) -#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x)) +#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, vcpu->id, x)) static void guest_code(void) { @@ -66,15 +64,13 @@ static void guest_code(void) GUEST_DONE(); } -static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) +static void run_vcpu(struct kvm_vcpu *vcpu, int stage) { struct ucall uc; - vcpu_args_set(vm, vcpuid, 1, vcpuid); - - vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + vcpu_run(vcpu->vm, vcpu->id); - switch (get_ucall(vm, vcpuid, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", @@ -88,29 +84,30 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) __FILE__, uc.args[1], uc.args[2], uc.args[3]); default: TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + exit_reason_str(vcpu->run->exit_reason)); } } int main(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t val; - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); val = 0; ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ - run_vcpu(vm, VCPU_ID, 1); + run_vcpu(vcpu, 1); val = 1ull * GUEST_STEP; ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ - run_vcpu(vm, VCPU_ID, 2); + run_vcpu(vcpu, 2); val = 2ull * GUEST_STEP; ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); @@ -119,18 +116,18 @@ int main(void) * Host: writes to MSR_IA32_TSC set the host-side offset * and therefore do not change MSR_IA32_TSC_ADJUST. 
*/ - vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC, HOST_ADJUST + val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); - run_vcpu(vm, VCPU_ID, 3); + run_vcpu(vcpu, 3); /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */ - vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC_ADJUST, UNITY * 123456); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456); + ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_TSC_ADJUST), UNITY * 123456); /* Restore previous value. */ - vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC_ADJUST, val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); @@ -138,7 +135,7 @@ int main(void) * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the * host-side offset and affect both MSRs. */ - run_vcpu(vm, VCPU_ID, 4); + run_vcpu(vcpu, 4); val = 3ull * GUEST_STEP; ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); @@ -147,7 +144,7 @@ int main(void) * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side * offset is now visible in MSR_IA32_TSC_ADJUST. */ - run_vcpu(vm, VCPU_ID, 5); + run_vcpu(vcpu, 5); val = 4ull * GUEST_STEP; ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); -- cgit v1.2.3 From 5e7cb71570b99eec5e4bb76ea5184ded87497e3f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:46:56 -0800 Subject: KVM: selftests: Convert kvm_clock_test away from VCPU_ID Convert kvm_clock_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded. 
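Helpers now take the vCPU and derive the VM from it, not the other way around; a trimmed sketch of the new enter_guest() (the real function, in the diff below, also loops over the test cases and dispatches ucalls):

      static void enter_guest(struct kvm_vcpu *vcpu)
      {
              struct kvm_run *run = vcpu->run;
              struct kvm_vm *vm = vcpu->vm;  /* VM is reachable from the vCPU */

              vcpu_run(vm, vcpu->id);
              TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
                          "unexpected exit reason: %u (%s)",
                          run->exit_reason, exit_reason_str(run->exit_reason));
      }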
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/kvm_clock_test.c | 23 ++++++++++------------ 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 97731454f3f3..2c1f850c4053 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -16,8 +16,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 0 - struct test_case { uint64_t kvmclock_base; int64_t realtime_offset; @@ -105,29 +103,27 @@ static void setup_clock(struct kvm_vm *vm, struct test_case *test_case) vm_ioctl(vm, KVM_SET_CLOCK, &data); } -static void enter_guest(struct kvm_vm *vm) +static void enter_guest(struct kvm_vcpu *vcpu) { struct kvm_clock_data start, end; - struct kvm_run *run; + struct kvm_run *run = vcpu->run; + struct kvm_vm *vm = vcpu->vm; struct ucall uc; - int i, r; - - run = vcpu_state(vm, VCPU_ID); + int i; for (i = 0; i < ARRAY_SIZE(test_cases); i++) { setup_clock(vm, &test_cases[i]); vm_ioctl(vm, KVM_GET_CLOCK, &start); - r = _vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu->vm, vcpu->id); vm_ioctl(vm, KVM_GET_CLOCK, &end); - TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: handle_sync(&uc, &start, &end); break; @@ -178,6 +174,7 @@ out: int main(void) { + struct kvm_vcpu *vcpu; vm_vaddr_t pvti_gva; vm_paddr_t pvti_gpa; struct kvm_vm *vm; @@ -192,12 +189,12 @@ int main(void) check_clocksource(); - vm = vm_create_default(VCPU_ID, 0, guest_main); + vm = vm_create_with_one_vcpu(&vcpu, guest_main); pvti_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000); pvti_gpa = addr_gva2gpa(vm, pvti_gva); - vcpu_args_set(vm, VCPU_ID, 2, pvti_gpa, pvti_gva); + vcpu_args_set(vm, vcpu->id, 2, pvti_gpa, pvti_gva); - enter_guest(vm); + enter_guest(vcpu); kvm_vm_free(vm); } -- cgit v1.2.3 From a1918c0fbeea59ccc629d83e491e798fc657fe41 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:47:48 -0800 Subject: KVM: selftests: Convert hyperv_svm_test away from VCPU_ID Convert hyperv_svm_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run(), the test expects KVM_RUN to succeed. 
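The per-stage loop keeps its structure, with the run/ucall plumbing keyed off the vCPU object; roughly (fragment, with the exit-reason assert and the sync/done cases elided):

      for (stage = 1;; stage++) {
              vcpu_run(vm, vcpu->id);
              switch (get_ucall(vm, vcpu->id, &uc)) {
              case UCALL_ABORT:
                      TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
                                __FILE__, uc.args[1]);
              default:
                      break;  /* UCALL_SYNC/UCALL_DONE handling elided */
              }
      }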
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 994b33fd8724..b6a749f5c766 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -21,7 +21,6 @@ #include "svm_util.h" #include "hyperv.h" -#define VCPU_ID 1 #define L2_GUEST_STACK_SIZE 256 struct hv_enlightenments { @@ -122,6 +121,7 @@ int main(int argc, char *argv[]) { vm_vaddr_t nested_gva = 0; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct ucall uc; @@ -132,20 +132,20 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - vcpu_set_hv_cpuid(vm, VCPU_ID); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_set_hv_cpuid(vm, vcpu->id); + run = vcpu->run; vcpu_alloc_svm(vm, &nested_gva); - vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + vcpu_args_set(vm, vcpu->id, 1, nested_gva); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); -- cgit v1.2.3 From d96b959600e540337fc489564dd93fc9a5ee9fe7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:50:11 -0800 Subject: KVM: selftests: Convert hyperv_features away from VCPU_ID Convert hyperv_features to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded. 
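Since this test builds a fresh VM for every stage, the vCPU pointer is simply refilled on each iteration; sketched (fragment; the per-stage MSR setup, the run itself and the loop's exit condition are elided):

      while (true) {
              vm = vm_create_with_one_vcpu(&vcpu, guest_msr);
              vcpu_enable_cap(vm, vcpu->id, KVM_CAP_HYPERV_ENFORCE_CPUID, 1);
              vcpu_set_hv_cpuid(vm, vcpu->id);
              /* ... set up the stage's MSR, vcpu_run(), check ucalls ... */
              kvm_vm_free(vm);
      }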
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/hyperv_features.c | 51 +++++++++++----------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 7ff6e4d70333..d0bd9d5e8a99 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -13,7 +13,6 @@ #include "processor.h" #include "hyperv.h" -#define VCPU_ID 0 #define LINUX_OS_ID ((u64)0x8100 << 48) extern unsigned char rdmsr_start; @@ -151,7 +150,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) GUEST_DONE(); } -static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid, +static void hv_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *feat, struct kvm_cpuid_entry2 *recomm, struct kvm_cpuid_entry2 *dbg) @@ -162,15 +161,16 @@ static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid, "failed to set HYPERV_CPUID_ENLIGHTMENT_INFO leaf"); TEST_ASSERT(set_cpuid(cpuid, dbg), "failed to set HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES leaf"); - vcpu_set_cpuid(vm, VCPU_ID, cpuid); + vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); } static void guest_test_msrs_access(void) { + struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; - int stage = 0, r; + int stage = 0; struct kvm_cpuid_entry2 feat = { .function = HYPERV_CPUID_FEATURES }; @@ -185,24 +185,24 @@ static void guest_test_msrs_access(void) struct msr_data *msr; while (true) { - vm = vm_create_default(VCPU_ID, 0, guest_msr); + vm = vm_create_with_one_vcpu(&vcpu, guest_msr); msr_gva = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); msr = addr_gva2hva(vm, msr_gva); - vcpu_args_set(vm, VCPU_ID, 1, msr_gva); - vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + vcpu_args_set(vm, vcpu->id, 1, msr_gva); + vcpu_enable_cap(vm, vcpu->id, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - vcpu_set_hv_cpuid(vm, VCPU_ID); + vcpu_set_hv_cpuid(vm, vcpu->id); best = kvm_get_supported_hv_cpuid(); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; switch (stage) { case 0: @@ -333,7 +333,7 @@ static void guest_test_msrs_access(void) * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 * capability enabled and guest visible CPUID bit unset. 
*/ - vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_HYPERV_SYNIC2, 0); + vcpu_enable_cap(vm, vcpu->id, KVM_CAP_HYPERV_SYNIC2, 0); break; case 22: feat.eax |= HV_MSR_SYNIC_AVAILABLE; @@ -463,7 +463,7 @@ static void guest_test_msrs_access(void) break; } - hv_set_cpuid(vm, best, &feat, &recomm, &dbg); + hv_set_cpuid(vcpu, best, &feat, &recomm, &dbg); if (msr->idx) pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, @@ -471,13 +471,12 @@ static void guest_test_msrs_access(void) else pr_debug("Stage %d: finish\n", stage); - r = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == 0, "Unexpected stage: %ld (0 expected)\n", @@ -498,10 +497,11 @@ static void guest_test_msrs_access(void) static void guest_test_hcalls_access(void) { + struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; - int stage = 0, r; + int stage = 0; struct kvm_cpuid_entry2 feat = { .function = HYPERV_CPUID_FEATURES, .eax = HV_MSR_HYPERCALL_AVAILABLE @@ -517,10 +517,10 @@ static void guest_test_hcalls_access(void) struct kvm_cpuid2 *best; while (true) { - vm = vm_create_default(VCPU_ID, 0, guest_hcall); + vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); /* Hypercall input/output */ @@ -531,14 +531,14 @@ static void guest_test_hcalls_access(void) hcall_params = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); - vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params); - vcpu_enable_cap(vm, VCPU_ID, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + vcpu_args_set(vm, vcpu->id, 2, addr_gva2gpa(vm, hcall_page), hcall_params); + vcpu_enable_cap(vm, vcpu->id, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - vcpu_set_hv_cpuid(vm, VCPU_ID); + vcpu_set_hv_cpuid(vm, vcpu->id); best = kvm_get_supported_hv_cpuid(); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; switch (stage) { case 0: @@ -633,7 +633,7 @@ static void guest_test_hcalls_access(void) break; } - hv_set_cpuid(vm, best, &feat, &recomm, &dbg); + hv_set_cpuid(vcpu, best, &feat, &recomm, &dbg); if (hcall->control) pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, @@ -641,13 +641,12 @@ static void guest_test_hcalls_access(void) else pr_debug("Stage %d: finish\n", stage); - r = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == 0, "Unexpected stage: %ld (0 expected)\n", -- cgit v1.2.3 From a858163711751b1caba7415c473a0668de1ef2bf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:54:04 -0800 Subject: KVM: selftests: Convert hyperv_clock away from VCPU_ID Convert hyperv_clock to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 25 +++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index 3330fb183c68..4f8f3999de2d 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -173,23 +173,21 @@ static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_ GUEST_DONE(); } -#define VCPU_ID 0 - -static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm) +static void host_check_tsc_msr_rdtsc(struct kvm_vcpu *vcpu) { u64 tsc_freq, r1, r2, t1, t2; s64 delta_ns; - tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY); + tsc_freq = vcpu_get_msr(vcpu->vm, vcpu->id, HV_X64_MSR_TSC_FREQUENCY); TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); /* For increased accuracy, take mean rdtsc() before and afrer ioctl */ r1 = rdtsc(); - t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + t1 = vcpu_get_msr(vcpu->vm, vcpu->id, HV_X64_MSR_TIME_REF_COUNT); r1 = (r1 + rdtsc()) / 2; nop_loop(); r2 = rdtsc(); - t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + t2 = vcpu_get_msr(vcpu->vm, vcpu->id, HV_X64_MSR_TIME_REF_COUNT); r2 = (r2 + rdtsc()) / 2; TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); @@ -207,33 +205,34 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm) int main(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct ucall uc; vm_vaddr_t tsc_page_gva; int stage; - vm = vm_create_default(VCPU_ID, 0, guest_main); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + run = vcpu->run; - vcpu_set_hv_cpuid(vm, VCPU_ID); + vcpu_set_hv_cpuid(vm, vcpu->id); tsc_page_gva = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, "TSC page has to be page aligned\n"); - vcpu_args_set(vm, VCPU_ID, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); + vcpu_args_set(vm, vcpu->id, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); - host_check_tsc_msr_rdtsc(vm); + host_check_tsc_msr_rdtsc(vcpu); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); -- cgit v1.2.3 From be0dff8610b15976151934d25f433391880f88a9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 16:57:49 -0800 Subject: KVM: selftests: Convert evmcs_test away from VCPU_ID Convert evmcs_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run(), the test expects KVM_RUN to succeed. 
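The subtle part is that the vCPU object dies with the released VM, so the save/restore helper has to hand back a new one; the core of the new flow, trimmed from the diff below (register sanity checks elided):

      state = vcpu_save_state(vm, vcpu->id);
      kvm_vm_release(vm);

      /* Restore state in a new VM; the helper returns the new vCPU. */
      vcpu = vm_recreate_with_one_vcpu(vm);
      vcpu_set_hv_cpuid(vm, vcpu->id);
      vcpu_enable_evmcs(vm, vcpu->id);
      vcpu_load_state(vm, vcpu->id, state);
      kvm_x86_state_cleanup(state);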
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 52 ++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 78668605f673..ba39042a5d96 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -18,8 +18,6 @@ #include "vmx.h" -#define VCPU_ID 5 - static int ud_count; static void guest_ud_handler(struct ex_regs *regs) @@ -159,55 +157,56 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } -void inject_nmi(struct kvm_vm *vm) +void inject_nmi(struct kvm_vcpu *vcpu) { struct kvm_vcpu_events events; - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vcpu->vm, vcpu->id, &events); events.nmi.pending = 1; events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; - vcpu_events_set(vm, VCPU_ID, &events); + vcpu_events_set(vcpu->vm, vcpu->id, &events); } -static void save_restore_vm(struct kvm_vm *vm) +static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, + struct kvm_vcpu *vcpu) { struct kvm_regs regs1, regs2; struct kvm_x86_state *state; - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vm, vcpu->id); memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vm, vcpu->id, ®s1); kvm_vm_release(vm); /* Restore state in a new VM. */ - kvm_vm_restart(vm); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_hv_cpuid(vm, VCPU_ID); - vcpu_enable_evmcs(vm, VCPU_ID); - vcpu_load_state(vm, VCPU_ID, state); + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_set_hv_cpuid(vm, vcpu->id); + vcpu_enable_evmcs(vm, vcpu->id); + vcpu_load_state(vm, vcpu->id, state); kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); + vcpu_regs_get(vm, vcpu->id, ®s2); TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); + return vcpu; } int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva = 0; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct ucall uc; int stage; - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); if (!nested_vmx_supported() || !kvm_check_cap(KVM_CAP_NESTED_STATE) || @@ -216,28 +215,29 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vcpu_set_hv_cpuid(vm, VCPU_ID); - vcpu_enable_evmcs(vm, VCPU_ID); + vcpu_set_hv_cpuid(vm, vcpu->id); + vcpu_enable_evmcs(vm, vcpu->id); vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); pr_info("Running L1 which uses EVMCS to run L2\n"); for (stage = 1;; stage++) { - run = vcpu_state(vm, VCPU_ID); - _vcpu_run(vm, VCPU_ID); + run = vcpu->run; + + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); @@ 
-255,12 +255,12 @@ int main(int argc, char *argv[]) uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); - save_restore_vm(vm); + vcpu = save_restore_vm(vm, vcpu); /* Force immediate L2->L1 exit before resuming */ if (stage == 8) { pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); - inject_nmi(vm); + inject_nmi(vcpu); } /* @@ -270,7 +270,7 @@ int main(int argc, char *argv[]) */ if (stage == 9) { pr_info("Trying extra KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE cycle\n"); - save_restore_vm(vm); + vcpu = save_restore_vm(vm, vcpu); } } -- cgit v1.2.3 From 42975c219975f5df0d686868e8b57698b2d09561 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:02:44 -0800 Subject: KVM: selftests: Convert emulator_error_test away from VCPU_ID Convert emulator_error_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/emulator_error_test.c | 65 ++++++++++------------ 1 file changed, 28 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index 9c156f9cfa15..08a95dab3a6b 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -11,7 +11,6 @@ #include "kvm_util.h" #include "vmx.h" -#define VCPU_ID 1 #define MAXPHYADDR 36 #define MEM_REGION_GVA 0x0000123456789000 @@ -27,14 +26,6 @@ static void guest_code(void) GUEST_DONE(); } -static void run_guest(struct kvm_vm *vm) -{ - int rc; - - rc = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); -} - /* * Accessors to get R/M, REG, and Mod bits described in the SDM vol 2, * figure 2-2 "Table Interpretation of ModR/M Byte (C8H)". @@ -56,9 +47,9 @@ static bool is_flds(uint8_t *insn_bytes, uint8_t insn_size) GET_RM(insn_bytes[1]) != 0x5; } -static void process_exit_on_emulation_error(struct kvm_vm *vm) +static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct kvm_regs regs; uint8_t *insn_bytes; uint8_t insn_size; @@ -92,50 +83,49 @@ static void process_exit_on_emulation_error(struct kvm_vm *vm) * contained an flds instruction that is 2-bytes in * length (ie: no prefix, no SIB, no displacement). 
*/ - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu->vm, vcpu->id, &regs); regs.rip += 2; - vcpu_regs_set(vm, VCPU_ID, &regs); + vcpu_regs_set(vcpu->vm, vcpu->id, &regs); } } } -static void do_guest_assert(struct kvm_vm *vm, struct ucall *uc) +static void do_guest_assert(struct ucall *uc) { TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], __FILE__, uc->args[1]); } -static void check_for_guest_assert(struct kvm_vm *vm) +static void check_for_guest_assert(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; - if (run->exit_reason == KVM_EXIT_IO && - get_ucall(vm, VCPU_ID, &uc) == UCALL_ABORT) { - do_guest_assert(vm, &uc); + if (vcpu->run->exit_reason == KVM_EXIT_IO && + get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) { + do_guest_assert(&uc); } } -static void process_ucall_done(struct kvm_vm *vm) +static void process_ucall_done(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct ucall uc; - check_for_guest_assert(vm); + check_for_guest_assert(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - TEST_ASSERT(get_ucall(vm, VCPU_ID, &uc) == UCALL_DONE, + TEST_ASSERT(get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_DONE, "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", uc.cmd, UCALL_DONE); } -static uint64_t process_ucall(struct kvm_vm *vm) +static uint64_t process_ucall(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct ucall uc; TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, @@ -143,14 +133,14 @@ static uint64_t process_ucall(struct kvm_vm *vm) run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: break; case UCALL_ABORT: - do_guest_assert(vm, &uc); + do_guest_assert(&uc); break; case UCALL_DONE: - process_ucall_done(vm); + process_ucall_done(vcpu); break; default: TEST_ASSERT(false, "Unexpected ucall"); @@ -163,6 +153,7 @@ int main(int argc, char *argv[]) { struct kvm_cpuid_entry2 *entry; struct kvm_cpuid2 *cpuid; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t gpa, pte; uint64_t *hva; @@ -171,20 +162,20 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - vm = vm_create_default(VCPU_ID, 0, guest_code); - if (!kvm_check_cap(KVM_CAP_SMALLER_MAXPHYADDR)) { printf("module parameter 'allow_smaller_maxphyaddr' is not set.
Skipping test.\n"); return 0; } + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + cpuid = kvm_get_supported_cpuid(); entry = kvm_get_supported_cpuid_index(0x80000008, 0); entry->eax = (entry->eax & 0xffffff00) | MAXPHYADDR; set_cpuid(cpuid, entry); - vcpu_set_cpuid(vm, VCPU_ID, cpuid); + vcpu_set_cpuid(vm, vcpu->id, cpuid); rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); @@ -199,14 +190,14 @@ int main(int argc, char *argv[]) virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); - pte = vm_get_page_table_entry(vm, VCPU_ID, MEM_REGION_GVA); - vm_set_page_table_entry(vm, VCPU_ID, MEM_REGION_GVA, pte | (1ull << 36)); + pte = vm_get_page_table_entry(vm, vcpu->id, MEM_REGION_GVA); + vm_set_page_table_entry(vm, vcpu->id, MEM_REGION_GVA, pte | (1ull << 36)); - run_guest(vm); - process_exit_on_emulation_error(vm); - run_guest(vm); + vcpu_run(vm, vcpu->id); + process_exit_on_emulation_error(vcpu); + vcpu_run(vm, vcpu->id); - TEST_ASSERT(process_ucall(vm) == UCALL_DONE, "Expected UCALL_DONE"); + TEST_ASSERT(process_ucall(vcpu) == UCALL_DONE, "Expected UCALL_DONE"); kvm_vm_free(vm); -- cgit v1.2.3 From 28039449b83e21f741937e84d4fd6485ad4fb9f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:06:02 -0800 Subject: KVM: selftests: Convert debug_regs away from VCPU_ID Convert debug_regs to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically drop the CLEAR_DEBUG/APPLY_DEBUG macros as they only obfuscate the code, e.g. operating on local variables not "passed" to the macro is all kinds of confusing. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/debug_regs.c | 53 ++++++++++++------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index f726645bb9c3..182d71c6d13a 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -10,8 +10,6 @@ #include "processor.h" #include "apic.h" -#define VCPU_ID 0 - #define DR6_BD (1 << 13) #define DR7_GD (1 << 13) @@ -66,13 +64,11 @@ static void guest_code(void) GUEST_DONE(); } -#define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug)) -#define APPLY_DEBUG() vcpu_guest_debug_set(vm, VCPU_ID, &debug) #define CAST_TO_RIP(v) ((unsigned long long)&(v)) #define SET_RIP(v) do { \ - vcpu_regs_get(vm, VCPU_ID, &regs); \ + vcpu_regs_get(vm, vcpu->id, &regs); \ regs.rip = (v); \ - vcpu_regs_set(vm, VCPU_ID, &regs); \ + vcpu_regs_set(vm, vcpu->id, &regs); \ } while (0) #define MOVE_RIP(v) SET_RIP(regs.rip + (v)); @@ -80,6 +76,7 @@ int main(void) { struct kvm_guest_debug debug; unsigned long long target_dr6, target_rip; + struct kvm_vcpu *vcpu; struct kvm_regs regs; struct kvm_run *run; struct kvm_vm *vm; @@ -101,14 +98,14 @@ int main(void) return 0; } - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; /* Test software BPs - int3 */ - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vm, vcpu->id, &debug); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
run->debug.arch.exception == BP_VECTOR && run->debug.arch.pc == CAST_TO_RIP(sw_bp), @@ -119,12 +116,12 @@ int main(void) /* Test instruction HW BP over DR[0-3] */ for (i = 0; i < 4; i++) { - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp); debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1)); - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vm, vcpu->id, &debug); + vcpu_run(vm, vcpu->id); target_dr6 = 0xffff0ff0 | (1UL << i); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -141,13 +138,13 @@ int main(void) /* Test data access HW BP over DR[0-3] */ for (i = 0; i < 4; i++) { - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[i] = CAST_TO_RIP(guest_value); debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) | (0x000d0000UL << (4*i)); - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vm, vcpu->id, &debug); + vcpu_run(vm, vcpu->id); target_dr6 = 0xffff0ff0 | (1UL << i); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -167,15 +164,15 @@ int main(void) /* Test single step */ target_rip = CAST_TO_RIP(ss_start); target_dr6 = 0xffff4ff0ULL; - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vm, vcpu->id, &regs); for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { target_rip += ss_size[i]; - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_BLOCKIRQ; debug.arch.debugreg[7] = 0x00000400; - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vm, vcpu->id, &debug); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && run->debug.arch.pc == target_rip && @@ -188,11 +185,11 @@ int main(void) } /* Finally test global disable */ - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[7] = 0x400 | DR7_GD; - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vm, vcpu->id, &debug); + vcpu_run(vm, vcpu->id); target_dr6 = 0xffff0ff0 | DR6_BD; TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -205,12 +202,12 @@ int main(void) target_dr6); /* Disable all debug controls, run to the end */ - CLEAR_DEBUG(); - APPLY_DEBUG(); + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vm, vcpu->id, &debug); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO"); - cmd = get_ucall(vm, VCPU_ID, &uc); + cmd = get_ucall(vm, vcpu->id, &uc); TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE"); kvm_vm_free(vm); -- cgit v1.2.3 From 2571bcdb136a3daf59df677585f32b89615eea47 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:09:25 -0800 Subject: KVM: selftests: Add proper helper for advancing RIP in debug_regs Replace MOVE_RIP+SET_RIP with a proper helper, vcpu_skip_insn(), that is more descriptive, doesn't subtly access local variables, and provides type safety.
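For illustration, the before/after shape of that change, condensed from the diff below (the macro reads and writes the caller's local 'regs' and 'vcpu' variables, which is the subtle access being removed):

  /* Before: the macro silently operates on the caller's locals. */
  #define SET_RIP(v) do {					\
  		vcpu_regs_get(vm, vcpu->id, &regs);	\
  		regs.rip = (v);				\
  		vcpu_regs_set(vm, vcpu->id, &regs);	\
  	} while (0)
  #define MOVE_RIP(v) SET_RIP(regs.rip + (v));

  /* After: a real function with its own 'regs' and type-checked arguments. */
  static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len)
  {
  	struct kvm_regs regs;

  	vcpu_regs_get(vcpu->vm, vcpu->id, &regs);
  	regs.rip += insn_len;
  	vcpu_regs_set(vcpu->vm, vcpu->id, &regs);
  }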
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/debug_regs.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index 182d71c6d13a..3cc25714d703 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -65,19 +65,21 @@ static void guest_code(void) } #define CAST_TO_RIP(v) ((unsigned long long)&(v)) -#define SET_RIP(v) do { \ - vcpu_regs_get(vm, vcpu->id, &regs); \ - regs.rip = (v); \ - vcpu_regs_set(vm, vcpu->id, &regs); \ - } while (0) -#define MOVE_RIP(v) SET_RIP(regs.rip + (v)); + +static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len) +{ + struct kvm_regs regs; + + vcpu_regs_get(vcpu->vm, vcpu->id, &regs); + regs.rip += insn_len; + vcpu_regs_set(vcpu->vm, vcpu->id, &regs); +} int main(void) { struct kvm_guest_debug debug; unsigned long long target_dr6, target_rip; struct kvm_vcpu *vcpu; - struct kvm_regs regs; struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; @@ -112,7 +114,7 @@ int main(void) "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)", run->exit_reason, run->debug.arch.exception, run->debug.arch.pc, CAST_TO_RIP(sw_bp)); - MOVE_RIP(1); + vcpu_skip_insn(vcpu, 1); /* Test instruction HW BP over DR[0-3] */ for (i = 0; i < 4; i++) { @@ -134,7 +136,7 @@ int main(void) run->debug.arch.dr6, target_dr6); } /* Skip "nop" */ - MOVE_RIP(1); + vcpu_skip_insn(vcpu, 1); /* Test data access HW BP over DR[0-3] */ for (i = 0; i < 4; i++) { @@ -156,15 +158,14 @@ int main(void) run->debug.arch.pc, CAST_TO_RIP(write_data), run->debug.arch.dr6, target_dr6); /* Rollback the 4-bytes "mov" */ - MOVE_RIP(-7); + vcpu_skip_insn(vcpu, -7); } /* Skip the 4-bytes "mov" */ - MOVE_RIP(7); + vcpu_skip_insn(vcpu, 7); /* Test single step */ target_rip = CAST_TO_RIP(ss_start); target_dr6 = 0xffff4ff0ULL; - vcpu_regs_get(vm, vcpu->id, &regs); for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { target_rip += ss_size[i]; memset(&debug, 0, sizeof(debug)); -- cgit v1.2.3 From 39839c1a68ce28ea38ca3f789c9981b0d7310def Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:11:25 -0800 Subject: KVM: selftests: Convert amx_test away from VCPU_ID Convert amx_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically use vcpu_run() instead of _vcpu_run(), the test expects KVM_RUN to succeed.
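The vcpu_run()/_vcpu_run() swap that several of these conversions mention, in miniature (the assert message is illustrative; both helpers take the VM plus vCPU ID at this point in the series):

  /* Before: the underscore variant returns the raw KVM_RUN result. */
  rc = _vcpu_run(vm, VCPU_ID);
  TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);

  /* After: vcpu_run() asserts success itself, shrinking the call site. */
  vcpu_run(vm, vcpu->id);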
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/amx_test.c | 33 +++++++++++++-------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 2f01247da0b5..7755fe8fcffb 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -25,7 +25,6 @@ # error This test is 64-bit only #endif -#define VCPU_ID 0 #define X86_FEATURE_XSAVE (1 << 26) #define X86_FEATURE_OSXSAVE (1 << 27) @@ -319,6 +318,7 @@ int main(int argc, char *argv[]) struct kvm_cpuid_entry2 *entry; struct kvm_regs regs1, regs2; bool amx_supported = false; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct kvm_x86_state *state; @@ -331,7 +331,7 @@ int main(int argc, char *argv[]) vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT); /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); entry = kvm_get_supported_cpuid_entry(1); if (!(entry->ecx & X86_FEATURE_XSAVE)) { @@ -350,12 +350,12 @@ int main(int argc, char *argv[]) xsave_restore_size = entry->ecx; } - run = vcpu_state(vm, VCPU_ID); - vcpu_regs_get(vm, VCPU_ID, &regs1); + run = vcpu->run; + vcpu_regs_get(vm, vcpu->id, &regs1); /* Register #NM handler */ vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); /* amx cfg for guest_code */ @@ -369,16 +369,16 @@ int main(int argc, char *argv[]) /* xsave data for guest_code */ xsavedata = vm_vaddr_alloc_pages(vm, 3); memset(addr_gva2hva(vm, xsavedata), 0, 3 * getpagesize()); - vcpu_args_set(vm, VCPU_ID, 3, amx_cfg, tiledata, xsavedata); + vcpu_args_set(vm, vcpu->id, 3, amx_cfg, tiledata, xsavedata); for (stage = 1; ; stage++) { - _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); @@ -403,7 +403,7 @@ int main(int argc, char *argv[]) * size subtract 8K amx size. */ amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vm, vcpu->id); void *amx_start = (void *)state->xsave + amx_offset; void *tiles_data = (void *)addr_gva2hva(vm, tiledata); /* Only check TMM0 register, 1 tile */ @@ -424,22 +424,21 @@ int main(int argc, char *argv[]) TEST_FAIL("Unknown ucall %lu", uc.cmd); } - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vm, vcpu->id); memset(&regs1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, &regs1); + vcpu_regs_get(vm, vcpu->id, &regs1); kvm_vm_release(vm); /* Restore state in a new VM.
*/ - kvm_vm_restart(vm); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); + vcpu_load_state(vm, vcpu->id, state); + run = vcpu->run; kvm_x86_state_cleanup(state); memset(&regs2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, &regs2); + vcpu_regs_get(vm, vcpu->id, &regs2); TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); -- cgit v1.2.3 From 50630b80eb8f8779f301e12c1328e200b004df61 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:12:37 -0800 Subject: KVM: selftests: Convert cr4_cpuid_sync_test away from VCPU_ID Convert cr4_cpuid_sync_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 6f6fd189dda3..d5615cd0b81b 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -21,7 +21,6 @@ #define X86_FEATURE_XSAVE (1<<26) #define X86_FEATURE_OSXSAVE (1<<27) -#define VCPU_ID 1 static inline bool cr4_cpuid_is_sync(void) { @@ -63,12 +62,12 @@ static void guest_code(void) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; struct kvm_sregs sregs; struct kvm_cpuid_entry2 *entry; struct ucall uc; - int rc; entry = kvm_get_supported_cpuid_entry(1); if (!(entry->ecx & X86_FEATURE_XSAVE)) { @@ -79,25 +78,23 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; while (1) { - rc = _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_SYNC: /* emulate hypervisor clearing CR4.OSXSAVE */ - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vm, vcpu->id, &sregs); sregs.cr4 &= ~X86_CR4_OSXSAVE; - vcpu_sregs_set(vm, VCPU_ID, &sregs); + vcpu_sregs_set(vm, vcpu->id, &sregs); break; case UCALL_ABORT: TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); -- cgit v1.2.3 From 87f1b5b3c0cda3feed7de624285966dcf3a1c7ae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:13:53 -0800 Subject: KVM:
selftests: Convert cpuid_test away from VCPU_ID Convert cpuid_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Opportunistically use vcpu_run() instead of _vcpu_run(), the test expects KVM_RUN to succeed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 29 ++++++++++++------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 16d2465c5634..76cdd0d10757 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -12,8 +12,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 0 - /* CPUIDs known to differ */ struct { u32 function; @@ -118,13 +116,13 @@ static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2) check_cpuid(cpuid1, &cpuid2->entries[i]); } -static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) +static void run_vcpu(struct kvm_vcpu *vcpu, int stage) { struct ucall uc; - _vcpu_run(vm, vcpuid); + vcpu_run(vcpu->vm, vcpu->id); - switch (get_ucall(vm, vcpuid, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && uc.args[1] == stage + 1, @@ -138,7 +136,7 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) __FILE__, uc.args[1], uc.args[2], uc.args[3]); default: TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + exit_reason_str(vcpu->run->exit_reason)); } } @@ -154,21 +152,21 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct return guest_cpuids; } -static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid) +static void set_cpuid_after_run(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) { struct kvm_cpuid_entry2 *ent; int rc; u32 eax, ebx, x; /* Setting unmodified CPUID is allowed */ - rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + rc = __vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); /* Changing CPU features is forbidden */ ent = get_cpuid(cpuid, 0x7, 0); ebx = ent->ebx; ent->ebx--; - rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + rc = __vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); TEST_ASSERT(rc, "Changing CPU features should fail"); ent->ebx = ebx; @@ -177,7 +175,7 @@ static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid) eax = ent->eax; x = eax & 0xff; ent->eax = (eax & ~0xffu) | (x - 1); - rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + rc = __vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); ent->eax = eax; } @@ -185,25 +183,26 @@ static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid) int main(void) { struct kvm_cpuid2 *supp_cpuid, *cpuid2; + struct kvm_vcpu *vcpu; vm_vaddr_t cpuid_gva; struct kvm_vm *vm; int stage; - vm = vm_create_default(VCPU_ID, 0, guest_main); + vm = vm_create_with_one_vcpu(&vcpu, guest_main); supp_cpuid = kvm_get_supported_cpuid(); - cpuid2 = vcpu_get_cpuid(vm, VCPU_ID); + cpuid2 = vcpu_get_cpuid(vm, vcpu->id); compare_cpuids(supp_cpuid, cpuid2); vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2); - vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva); + vcpu_args_set(vm, vcpu->id, 1, cpuid_gva); for (stage = 0; stage < 3; stage++) - run_vcpu(vm, VCPU_ID, 
stage); + run_vcpu(vcpu, stage); - set_cpuid_after_run(vm, cpuid2); + set_cpuid_after_run(vcpu, cpuid2); kvm_vm_free(vm); } -- cgit v1.2.3 From ada1bf4d653168ee209d7825ed3f19a8fe418d07 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:15:15 -0800 Subject: KVM: selftests: Convert userspace_io_test away from VCPU_ID Convert userspace_io_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically use vcpu_run() instead of _vcpu_run() with an open coded assert that KVM_RUN succeeded. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/userspace_io_test.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c index e4bef2e05686..0ba774ed6476 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c @@ -10,8 +10,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 1 - static void guest_ins_port80(uint8_t *buffer, unsigned int count) { unsigned long end; @@ -52,31 +50,29 @@ static void guest_code(void) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_regs regs; struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; - int rc; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; memset(&regs, 0, sizeof(regs)); while (1) { - rc = _vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - if (get_ucall(vm, VCPU_ID, &uc)) + if (get_ucall(vm, vcpu->id, &uc)) break; TEST_ASSERT(run->io.port == 0x80, @@ -89,13 +85,13 @@ int main(int argc, char *argv[]) * scope from a testing perspective as it's not ABI in any way, * i.e. it really is abusing internal KVM knowledge. */ - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vm, vcpu->id, &regs); if (regs.rcx == 2) regs.rcx = 1; if (regs.rcx == 3) regs.rcx = 8192; memset((void *)run + run->io.data_offset, 0xaa, 4096); - vcpu_regs_set(vm, VCPU_ID, &regs); + vcpu_regs_set(vm, vcpu->id, &regs); } switch (uc.cmd) { -- cgit v1.2.3 From 35b6cb825abdd15a6b2f3da22ffcfb076ecb75db Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:30:35 -0800 Subject: KVM: selftests: Convert vmx_invalid_nested_guest_state away from VCPU_ID Convert vmx_invalid_nested_guest_state to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID.
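Each of these conversions is mechanically the same; a minimal sketch of the pattern (the helper name run_to_io_exit() is hypothetical, the API calls are the ones used throughout the diffs):

  static void run_to_io_exit(void *guest_code)
  {
  	struct kvm_vcpu *vcpu;
  	struct kvm_vm *vm;

  	/* Old: vm = vm_create_default(VCPU_ID, 0, guest_code); */
  	vm = vm_create_with_one_vcpu(&vcpu, guest_code);

  	vcpu_run(vm, vcpu->id);	/* old: vcpu_run(vm, VCPU_ID) */

  	/* vcpu->run replaces vcpu_state(vm, VCPU_ID). */
  	TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_IO,
  		    "Unexpected exit reason: %u", vcpu->run->exit_reason);

  	kvm_vm_free(vm);
  }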
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../kvm/x86_64/vmx_invalid_nested_guest_state.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c index 489fbed4ca6f..ba534be498f9 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c @@ -9,7 +9,6 @@ #include "kselftest.h" -#define VCPU_ID 0 #define ARBITRARY_IO_PORT 0x2000 static struct kvm_vm *vm; @@ -55,20 +54,21 @@ int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva; struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; struct kvm_run *run; struct ucall uc; nested_vmx_check_supported(); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); /* Allocate VMX pages and shared descriptors (vmx_pages). */ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; /* * The first exit to L0 userspace should be an I/O access from L2. @@ -88,13 +88,13 @@ int main(int argc, char *argv[]) * emulating invalid guest state for L2. */ memset(&sregs, 0, sizeof(sregs)); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vm, vcpu->id, &sregs); sregs.tr.unusable = 1; - vcpu_sregs_set(vm, VCPU_ID, &sregs); + vcpu_sregs_set(vm, vcpu->id, &sregs); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_DONE: break; case UCALL_ABORT: -- cgit v1.2.3 From 92897016697754a2709e42d56117824b0401c7dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:33:28 -0800 Subject: KVM: selftests: Convert xen_vmcall_test away from VCPU_ID Convert xen_vmcall_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. Opportunistically make the "vm" variable local, it is unused outside of main(). 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index b30fe9de1d4f..1411ead620fe 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -11,13 +11,9 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 5 - #define HCALL_REGION_GPA 0xc0000000ULL #define HCALL_REGION_SLOT 10 -static struct kvm_vm *vm; - #define INPUTVALUE 17 #define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x) #define RETVALUE 0xcafef00dfbfbffffUL @@ -84,14 +80,17 @@ static void guest_code(void) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + if (!(kvm_check_cap(KVM_CAP_XEN_HVM) & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) ) { print_skip("KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL not available"); exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, 0, (void *) guest_code); - vcpu_set_hv_cpuid(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_set_hv_cpuid(vm, vcpu->id); struct kvm_xen_hvm_config hvmc = { .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, @@ -105,10 +104,10 @@ int main(int argc, char *argv[]) virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); if (run->exit_reason == KVM_EXIT_XEN) { ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL); @@ -130,7 +129,7 @@ int main(int argc, char *argv[]) run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ -- cgit v1.2.3 From 0037727b3989c3fe1929c89a9a1dfe289ad86f58 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:34:32 -0800 Subject: KVM: selftests: Convert xen_shinfo_test away from VCPU_ID Convert xen_shinfo_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. 
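One test-specific wrinkle here: the SIGALRM handler needs the vCPU for vcpu_dump(), so the pointer becomes a file-scope static instead of a main() local, roughly (trimmed from the diff below):

  static struct kvm_vcpu *vcpu;

  static void handle_alrm(int sig)
  {
  	/* vcpu->vm replaces the old global 'vm' + VCPU_ID pair. */
  	vcpu_dump(stdout, vcpu->vm, vcpu->id, 0);
  	TEST_FAIL("IRQ delivery timed out");
  }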
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 62 +++++++++++----------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 7a51bb648fbb..5c0abaf0eb60 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -18,8 +18,6 @@ #include -#define VCPU_ID 5 - #define SHINFO_REGION_GVA 0xc0000000ULL #define SHINFO_REGION_GPA 0xc0000000ULL #define SHINFO_REGION_SLOT 10 @@ -42,8 +40,6 @@ #define EVTCHN_TEST2 66 #define EVTCHN_TIMER 13 -static struct kvm_vm *vm; - #define XEN_HYPERCALL_MSR 0x40000000 #define MIN_STEAL_TIME 50000 @@ -344,19 +340,22 @@ static int cmp_timespec(struct timespec *a, struct timespec *b) else return 0; } -struct vcpu_info *vinfo; + +static struct vcpu_info *vinfo; +static struct kvm_vcpu *vcpu; static void handle_alrm(int sig) { if (vinfo) printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending); - vcpu_dump(stdout, vm, VCPU_ID, 0); + vcpu_dump(stdout, vcpu->vm, vcpu->id, 0); TEST_FAIL("IRQ delivery timed out"); } int main(int argc, char *argv[]) { struct timespec min_ts, max_ts, vm_ts; + struct kvm_vm *vm; bool verbose; verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) || @@ -374,8 +373,7 @@ int main(int argc, char *argv[]) clock_gettime(CLOCK_REALTIME, &min_ts); - vm = vm_create_default(VCPU_ID, 0, (void *) guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); /* Map a region for the shared_info page */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -425,13 +423,13 @@ int main(int argc, char *argv[]) .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, .u.gpa = VCPU_INFO_ADDR, }; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &vi); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &vi); struct kvm_xen_vcpu_attr pvclock = { .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, .u.gpa = PVTIME_ADDR, }; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &pvclock); struct kvm_xen_hvm_attr vec = { .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR, @@ -440,7 +438,7 @@ int main(int argc, char *argv[]) vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); if (do_runstate_tests) { @@ -448,7 +446,7 @@ int main(int argc, char *argv[]) .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, .u.gpa = RUNSTATE_ADDR, }; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &st); } int irq_fd[2] = { -1, -1 }; @@ -468,13 +466,13 @@ int main(int argc, char *argv[]) irq_routes.entries[0].gsi = 32; irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1; - irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID; + irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id; irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; irq_routes.entries[1].gsi = 33; irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2; - irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID; + irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id; 
irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes); @@ -508,14 +506,14 @@ int main(int argc, char *argv[]) .u.evtchn.type = EVTCHNSTAT_interdomain, .u.evtchn.flags = 0, .u.evtchn.deliver.port.port = EVTCHN_TEST1, - .u.evtchn.deliver.port.vcpu = VCPU_ID + 1, + .u.evtchn.deliver.port.vcpu = vcpu->id + 1, .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, }; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); /* Test migration to a different vCPU */ inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE; - inj.u.evtchn.deliver.port.vcpu = VCPU_ID; + inj.u.evtchn.deliver.port.vcpu = vcpu->id; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); inj.u.evtchn.send_port = 197; @@ -524,7 +522,7 @@ int main(int argc, char *argv[]) inj.u.evtchn.flags = 0; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); } vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); vinfo->evtchn_upcall_pending = 0; @@ -535,17 +533,17 @@ int main(int argc, char *argv[]) bool evtchn_irq_expected = false; for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ @@ -574,7 +572,7 @@ int main(int argc, char *argv[]) printf("Testing runstate %s\n", runstate_names[uc.args[1]]); rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; rst.u.runstate.state = uc.args[1]; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &rst); break; case 4: @@ -589,7 +587,7 @@ int main(int argc, char *argv[]) 0x6b6b - rs->time[RUNSTATE_offline]; rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked - rst.u.runstate.time_offline; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &rst); break; case 5: @@ -601,7 +599,7 @@ int main(int argc, char *argv[]) rst.u.runstate.state_entry_time = 0x6b6b + 0x5a; rst.u.runstate.time_blocked = 0x6b6b; rst.u.runstate.time_offline = 0x5a; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &rst); break; case 6: @@ -660,7 +658,7 @@ int main(int argc, char *argv[]) struct kvm_irq_routing_xen_evtchn e; e.port = EVTCHN_TEST2; - e.vcpu = VCPU_ID; + e.vcpu = vcpu->id; e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e); @@ -702,7 +700,7 @@ int main(int argc, char *argv[]) case 14: memset(&tmr, 0, sizeof(tmr)); tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &tmr); TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER, "Timer port not returned"); TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, @@ -722,7 +720,7 @@ int main(int argc, char *argv[]) printf("Testing restored oneshot timer\n"); tmr.u.timer.expires_ns = rs->state_entry_time + 100000000, - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); evtchn_irq_expected = true; alarm(1); break; @@ 
-749,7 +747,7 @@ int main(int argc, char *argv[]) printf("Testing SCHEDOP_poll wake on masked event\n"); tmr.u.timer.expires_ns = rs->state_entry_time + 100000000, - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); alarm(1); break; @@ -760,11 +758,11 @@ int main(int argc, char *argv[]) evtchn_irq_expected = true; tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); /* Read it back and check the pending time is reported correctly */ tmr.u.timer.expires_ns = 0; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &tmr); TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000, "Timer not reported pending"); alarm(1); @@ -774,7 +772,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(!evtchn_irq_expected, "Expected event channel IRQ but it didn't happen"); /* Read timer and check it is no longer pending */ - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &tmr); TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending"); shinfo->evtchn_pending[0] = 0; @@ -783,7 +781,7 @@ int main(int argc, char *argv[]) evtchn_irq_expected = true; tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); alarm(1); break; @@ -853,7 +851,7 @@ int main(int argc, char *argv[]) struct kvm_xen_vcpu_attr rst = { .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA, }; - vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &rst); + vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &rst); if (verbose) { printf("Runstate: %s(%d), entry %" PRIu64 " ns\n", -- cgit v1.2.3 From c09aee348495af4cc15f823ff7256b77728d53c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:40:19 -0800 Subject: KVM: selftests: Convert dirty_log_test away from VCPU_ID Convert dirty_log_test to pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. The test still hardcodes usage of vcpu_id==0, but only for a few lines. That wart will be removed in the not-too-distant future. 
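The residual hardcoding the message refers to amounts to a couple of lines in the diff below: the VM-creation helper still creates the vCPU itself, so the object is looked up afterwards.

  vm = create_vm(mode, 0,	/* was: create_vm(mode, VCPU_ID, ...) */
  	       2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code);
  vcpu = vcpu_get(vm, 0);	/* vcpu_id==0 still hardcoded, for now */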
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 59 ++++++++++++++-------------- 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index cf426a8ae816..23e0c727e375 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -23,8 +23,6 @@ #include "guest_modes.h" #include "processor.h" -#define VCPU_ID 1 - /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 @@ -226,17 +224,17 @@ static void clear_log_create_vm_done(struct kvm_vm *vm) vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, manual_caps); } -static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot, +static void dirty_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, void *bitmap, uint32_t num_pages) { - kvm_vm_get_dirty_log(vm, slot, bitmap); + kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap); } -static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot, +static void clear_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, void *bitmap, uint32_t num_pages) { - kvm_vm_get_dirty_log(vm, slot, bitmap); - kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages); + kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap); + kvm_vm_clear_dirty_log(vcpu->vm, slot, bitmap, 0, num_pages); } /* Should only be called after a GUEST_SYNC */ @@ -250,14 +248,14 @@ static void vcpu_handle_sync_stop(void) } } -static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err) +static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR), "vcpu run failed: errno=%d", err); - TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC, + TEST_ASSERT(get_ucall(vcpu->vm, vcpu->id, NULL) == UCALL_SYNC, "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); @@ -328,7 +326,7 @@ static void dirty_ring_continue_vcpu(void) sem_post(&sem_vcpu_cont); } -static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot, +static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, void *bitmap, uint32_t num_pages) { /* We only have one vcpu */ @@ -348,10 +346,10 @@ static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot, } /* Only have one vcpu */ - count = dirty_ring_collect_one(vcpu_map_dirty_ring(vm, VCPU_ID), + count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu->vm, vcpu->id), slot, bitmap, num_pages, &fetch_index); - cleared = kvm_vm_reset_dirty_ring(vm); + cleared = kvm_vm_reset_dirty_ring(vcpu->vm); /* Cleared pages should be the same as collected */ TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch " @@ -366,12 +364,12 @@ static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot, pr_info("Iteration %ld collected %u pages\n", iteration, count); } -static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err) +static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) { - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; /* A ucall-sync or ring-full event is allowed */ - if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) { + if (get_ucall(vcpu->vm, vcpu->id, NULL) == UCALL_SYNC) { /* We should allow this to continue */ ; } else if (run->exit_reason == 
KVM_EXIT_DIRTY_RING_FULL || @@ -405,10 +403,10 @@ struct log_mode { /* Hook when the vm creation is done (before vcpu creation) */ void (*create_vm_done)(struct kvm_vm *vm); /* Hook to collect the dirty pages into the bitmap provided */ - void (*collect_dirty_pages) (struct kvm_vm *vm, int slot, + void (*collect_dirty_pages) (struct kvm_vcpu *vcpu, int slot, void *bitmap, uint32_t num_pages); /* Hook to call when after each vcpu run */ - void (*after_vcpu_run)(struct kvm_vm *vm, int ret, int err); + void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err); void (*before_vcpu_join) (void); } log_modes[LOG_MODE_NUM] = { { @@ -470,22 +468,22 @@ static void log_mode_create_vm_done(struct kvm_vm *vm) mode->create_vm_done(vm); } -static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot, +static void log_mode_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, void *bitmap, uint32_t num_pages) { struct log_mode *mode = &log_modes[host_log_mode]; TEST_ASSERT(mode->collect_dirty_pages != NULL, "collect_dirty_pages() is required for any log mode!"); - mode->collect_dirty_pages(vm, slot, bitmap, num_pages); + mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages); } -static void log_mode_after_vcpu_run(struct kvm_vm *vm, int ret, int err) +static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) { struct log_mode *mode = &log_modes[host_log_mode]; if (mode->after_vcpu_run) - mode->after_vcpu_run(vm, ret, err); + mode->after_vcpu_run(vcpu, ret, err); } static void log_mode_before_vcpu_join(void) @@ -507,7 +505,8 @@ static void generate_random_array(uint64_t *guest_array, uint64_t size) static void *vcpu_worker(void *data) { int ret; - struct kvm_vm *vm = data; + struct kvm_vcpu *vcpu = data; + struct kvm_vm *vm = vcpu->vm; uint64_t *guest_array; uint64_t pages_count = 0; struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset) @@ -522,7 +521,7 @@ static void *vcpu_worker(void *data) sigmask->len = 8; pthread_sigmask(0, NULL, sigset); sigdelset(sigset, SIG_IPI); - vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask); + vcpu_ioctl(vm, vcpu->id, KVM_SET_SIGNAL_MASK, sigmask); sigemptyset(sigset); sigaddset(sigset, SIG_IPI); @@ -534,13 +533,13 @@ static void *vcpu_worker(void *data) generate_random_array(guest_array, TEST_PAGES_PER_LOOP); pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the random pages */ - ret = __vcpu_run(vm, VCPU_ID); + ret = __vcpu_run(vm, vcpu->id); if (ret == -1 && errno == EINTR) { int sig = -1; sigwait(sigset, &sig); assert(sig == SIG_IPI); } - log_mode_after_vcpu_run(vm, ret, errno); + log_mode_after_vcpu_run(vcpu, ret, errno); } pr_info("Dirtied %"PRIu64" pages\n", pages_count); @@ -693,6 +692,7 @@ struct test_params { static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; unsigned long *bmap; @@ -710,9 +710,10 @@ static void run_test(enum vm_guest_mode mode, void *arg) * (e.g., 64K page size guest will need even less memory for * page tables). 
*/ - vm = create_vm(mode, VCPU_ID, + vm = create_vm(mode, 0, 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code); + vcpu = vcpu_get(vm, 0); guest_page_size = vm_get_page_size(vm); /* @@ -773,12 +774,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) host_clear_count = 0; host_track_next_count = 0; - pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); + pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu); while (iteration < p->iterations) { /* Give the vcpu thread some time to dirty some pages */ usleep(p->interval * 1000); - log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX, + log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX, bmap, host_num_pages); /* -- cgit v1.2.3 From d7828144d4651efc063d0889c9498176da2aed8c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Feb 2022 17:44:50 -0800 Subject: KVM: selftests: Convert set_memory_region_test away from VCPU_ID Convert set_memory_region_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/set_memory_region_test.c | 36 +++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index c33402ba7587..1274bbb0e30b 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -17,8 +17,6 @@ #include #include -#define VCPU_ID 0 - /* * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a * 2MB sized and aligned region so that the initial region corresponds to @@ -54,8 +52,8 @@ static inline uint64_t guest_spin_on_val(uint64_t spin_val) static void *vcpu_worker(void *data) { - struct kvm_vm *vm = data; - struct kvm_run *run; + struct kvm_vcpu *vcpu = data; + struct kvm_run *run = vcpu->run; struct ucall uc; uint64_t cmd; @@ -64,13 +62,11 @@ static void *vcpu_worker(void *data) * which will occur if the guest attempts to access a memslot after it * has been deleted or while it is being moved . */ - run = vcpu_state(vm, VCPU_ID); - while (1) { - vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu->vm, vcpu->id); if (run->exit_reason == KVM_EXIT_IO) { - cmd = get_ucall(vm, VCPU_ID, &uc); + cmd = get_ucall(vcpu->vm, vcpu->id, &uc); if (cmd != UCALL_SYNC) break; @@ -113,13 +109,14 @@ static void wait_for_vcpu(void) usleep(100000); } -static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) +static struct kvm_vm *spawn_vm(struct kvm_vcpu **vcpu, pthread_t *vcpu_thread, + void *guest_code) { struct kvm_vm *vm; uint64_t *hva; uint64_t gpa; - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(vcpu, guest_code); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, MEM_REGION_GPA, MEM_REGION_SLOT, @@ -138,7 +135,7 @@ static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, 2 * 4096); - pthread_create(vcpu_thread, NULL, vcpu_worker, vm); + pthread_create(vcpu_thread, NULL, vcpu_worker, *vcpu); /* Ensure the guest thread is spun up. 
*/ wait_for_vcpu(); return vm; } @@ -180,10 +177,11 @@ static void guest_code_move_memory_region(void) static void test_move_memory_region(void) { pthread_t vcpu_thread; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t *hva; - vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region); + vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region); hva = addr_gpa2hva(vm, MEM_REGION_GPA); @@ -258,11 +256,12 @@ static void guest_code_delete_memory_region(void) static void test_delete_memory_region(void) { pthread_t vcpu_thread; + struct kvm_vcpu *vcpu; struct kvm_regs regs; struct kvm_run *run; struct kvm_vm *vm; - vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region); + vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region); /* Delete the memory region, the guest should not die. */ vm_mem_region_delete(vm, MEM_REGION_SLOT); @@ -286,13 +285,13 @@ static void test_delete_memory_region(void) pthread_join(vcpu_thread, NULL); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN || run->exit_reason == KVM_EXIT_INTERNAL_ERROR, "Unexpected exit reason = %d", run->exit_reason); - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vm, vcpu->id, &regs); /* * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already, @@ -309,18 +308,19 @@ static void test_delete_memory_region(void) static void test_zero_memory_regions(void) { + struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; pr_info("Testing KVM_RUN with zero added memory regions\n"); vm = vm_create_barebones(); - vm_vcpu_add(vm, VCPU_ID); + vcpu = vm_vcpu_add(vm, 0); vm_ioctl(vm, KVM_SET_NR_MMU_PAGES, (void *)64ul); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, "Unexpected exit_reason = %u\n", run->exit_reason); -- cgit v1.2.3 From 10f0b222ea7e0ab90e7c72a255ce8e4f284b12e3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 11:28:15 -0700 Subject: KVM: selftests: Convert system_counter_offset_test away from VCPU_ID Convert system_counter_offset_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID.
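A pattern worth noting in this diff: once a helper receives the 'struct kvm_vcpu', the VM argument can be dropped entirely because vcpu->vm links back to it, e.g. (taken from the patch):

  static void setup_system_counter(struct kvm_vcpu *vcpu, struct test_case *test)
  {
  	/* The vcpu->vm back-pointer makes a separate 'vm' parameter redundant. */
  	vcpu_device_attr_set(vcpu->vm, vcpu->id, KVM_VCPU_TSC_CTRL,
  			     KVM_VCPU_TSC_OFFSET, &test->tsc_offset);
  }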
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/system_counter_offset_test.c | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index 5dd9d28efb97..0690ce0ae4fa 100644 --- a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -14,8 +14,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 0 - #ifdef __x86_64__ struct test_case { @@ -28,18 +26,19 @@ static struct test_case test_cases[] = { { -180 * NSEC_PER_SEC }, }; -static void check_preconditions(struct kvm_vm *vm) +static void check_preconditions(struct kvm_vcpu *vcpu) { - if (!__vcpu_has_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET)) + if (!__vcpu_has_device_attr(vcpu->vm, vcpu->id, KVM_VCPU_TSC_CTRL, + KVM_VCPU_TSC_OFFSET)) return; print_skip("KVM_VCPU_TSC_OFFSET not supported; skipping test"); exit(KSFT_SKIP); } -static void setup_system_counter(struct kvm_vm *vm, struct test_case *test) +static void setup_system_counter(struct kvm_vcpu *vcpu, struct test_case *test) { - vcpu_device_attr_set(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, + vcpu_device_attr_set(vcpu->vm, vcpu->id, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET, &test->tsc_offset); } @@ -91,7 +90,7 @@ static void handle_abort(struct ucall *uc) __FILE__, uc->args[1]); } -static void enter_guest(struct kvm_vm *vm) +static void enter_guest(struct kvm_vcpu *vcpu) { uint64_t start, end; struct ucall uc; @@ -100,12 +99,12 @@ static void enter_guest(struct kvm_vm *vm) for (i = 0; i < ARRAY_SIZE(test_cases); i++) { struct test_case *test = &test_cases[i]; - setup_system_counter(vm, test); + setup_system_counter(vcpu, test); start = host_read_guest_system_counter(test); - vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu->vm, vcpu->id); end = host_read_guest_system_counter(test); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: handle_sync(&uc, start, end); break; @@ -114,19 +113,20 @@ static void enter_guest(struct kvm_vm *vm) return; default: TEST_ASSERT(0, "unhandled ucall %ld\n", - get_ucall(vm, VCPU_ID, &uc)); + get_ucall(vcpu->vm, vcpu->id, &uc)); } } } int main(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; - vm = vm_create_default(VCPU_ID, 0, guest_main); - check_preconditions(vm); + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + check_preconditions(vcpu); ucall_init(vm, NULL); - enter_guest(vm); + enter_guest(vcpu); kvm_vm_free(vm); } -- cgit v1.2.3 From ee7f7d9e988e137f20a34b8c02dd28dd5312e3f1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 11:28:21 -0700 Subject: KVM: selftests: Track kvm_vcpu object in tsc_scaling_sync Track the added 'struct kvm_vcpu' object in tsc_scaling_sync instead of relying purely on the VM + vcpu_id combination. Ideally, the test wouldn't need to manually manage vCPUs, but the need to invoke a per-VM ioctl before creating vCPUs is not handled by the selftests framework, at least not yet... 
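Since the per-VM ioctl must land before any vCPU exists, the test keeps manual vCPU creation and merely fetches the object afterwards; the relevant two lines from the diff below:

  vm_vcpu_add_default(vm, vcpu_id, guest_code);
  vcpu = vcpu_get(vm, vcpu_id);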
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index f0083d8cfe98..b7cd5c47fc53 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -46,38 +46,41 @@ static void guest_code(void) static void *run_vcpu(void *_cpu_nr) { - unsigned long cpu = (unsigned long)_cpu_nr; + unsigned long vcpu_id = (unsigned long)_cpu_nr; unsigned long failures = 0; static bool first_cpu_done; + struct kvm_vcpu *vcpu; /* The kernel is fine, but vm_vcpu_add_default() needs locking */ pthread_spin_lock(&create_lock); - vm_vcpu_add_default(vm, cpu, guest_code); + vm_vcpu_add_default(vm, vcpu_id, guest_code); + vcpu = vcpu_get(vm, vcpu_id); if (!first_cpu_done) { first_cpu_done = true; - vcpu_set_msr(vm, cpu, MSR_IA32_TSC, TEST_TSC_OFFSET); + vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC, TEST_TSC_OFFSET); } pthread_spin_unlock(&create_lock); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, cpu); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, cpu); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, cpu, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_DONE: goto out; case UCALL_SYNC: - printf("Guest %ld sync %lx %lx %ld\n", cpu, uc.args[2], uc.args[3], uc.args[2] - uc.args[3]); + printf("Guest %d sync %lx %lx %ld\n", vcpu->id, + uc.args[2], uc.args[3], uc.args[2] - uc.args[3]); failures++; break; -- cgit v1.2.3 From 20a7eb990ae8eeb33e072e97dfb05042603b0d81 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 12:11:54 -0700 Subject: KVM: selftests: Convert xapic_state_test away from hardcoded vCPU ID Convert xapic_state_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of the raw vCPU ID. 
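Here the vCPU handle is carried inside the test's own wrapper so the xAPIC/x2APIC flag travels with it; the new field, per the diff below:

  struct xapic_vcpu {
  	struct kvm_vcpu *vcpu;	/* was: uint32_t id */
  	bool is_x2apic;
  };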
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/xapic_state_test.c | 48 +++++++++++----------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 9d8393b6ec75..56301ee1adee 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -12,7 +12,7 @@ #include "test_util.h" struct xapic_vcpu { - uint32_t id; + struct kvm_vcpu *vcpu; bool is_x2apic; }; @@ -47,8 +47,9 @@ static void x2apic_guest_code(void) } while (1); } -static void ____test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu, uint64_t val) +static void ____test_icr(struct kvm_vm *vm, struct xapic_vcpu *x, uint64_t val) { + struct kvm_vcpu *vcpu = x->vcpu; struct kvm_lapic_state xapic; struct ucall uc; uint64_t icr; @@ -70,28 +71,29 @@ static void ____test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu, uint64_t va vcpu_ioctl(vm, vcpu->id, KVM_GET_LAPIC, &xapic); icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; - if (!vcpu->is_x2apic) + if (!x->is_x2apic) val &= (-1u | (0xffull << (32 + 24))); ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } -static void __test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu, uint64_t val) +static void __test_icr(struct kvm_vm *vm, struct xapic_vcpu *x, uint64_t val) { - ____test_icr(vm, vcpu, val | APIC_ICR_BUSY); - ____test_icr(vm, vcpu, val & ~(u64)APIC_ICR_BUSY); + ____test_icr(vm, x, val | APIC_ICR_BUSY); + ____test_icr(vm, x, val & ~(u64)APIC_ICR_BUSY); } -static void test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu) +static void test_icr(struct kvm_vm *vm, struct xapic_vcpu *x) { + struct kvm_vcpu *vcpu = x->vcpu; uint64_t icr, i, j; icr = APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_FIXED; for (i = 0; i <= 0xff; i++) - __test_icr(vm, vcpu, icr | i); + __test_icr(vm, x, icr | i); icr = APIC_INT_ASSERT | APIC_DM_FIXED; for (i = 0; i <= 0xff; i++) - __test_icr(vm, vcpu, icr | i); + __test_icr(vm, x, icr | i); /* * Send all flavors of IPIs to non-existent vCPUs. TODO: use number of @@ -100,32 +102,32 @@ static void test_icr(struct kvm_vm *vm, struct xapic_vcpu *vcpu) icr = APIC_INT_ASSERT | 0xff; for (i = vcpu->id + 1; i < 0xff; i++) { for (j = 0; j < 8; j++) - __test_icr(vm, vcpu, i << (32 + 24) | APIC_INT_ASSERT | (j << 8)); + __test_icr(vm, x, i << (32 + 24) | APIC_INT_ASSERT | (j << 8)); } /* And again with a shorthand destination for all types of IPIs. */ icr = APIC_DEST_ALLBUT | APIC_INT_ASSERT; for (i = 0; i < 8; i++) - __test_icr(vm, vcpu, icr | (i << 8)); + __test_icr(vm, x, icr | (i << 8)); /* And a few garbage value, just make sure it's an IRQ (blocked). 
*/ - __test_icr(vm, vcpu, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK); - __test_icr(vm, vcpu, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK); - __test_icr(vm, vcpu, -1ull & ~APIC_DM_FIXED_MASK); + __test_icr(vm, x, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK); + __test_icr(vm, x, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK); + __test_icr(vm, x, -1ull & ~APIC_DM_FIXED_MASK); } int main(int argc, char *argv[]) { - struct xapic_vcpu vcpu = { - .id = 0, + struct xapic_vcpu x = { + .vcpu = NULL, .is_x2apic = true, }; struct kvm_cpuid2 *cpuid; struct kvm_vm *vm; int i; - vm = vm_create_default(vcpu.id, 0, x2apic_guest_code); - test_icr(vm, &vcpu); + vm = vm_create_with_one_vcpu(&x.vcpu, x2apic_guest_code); + test_icr(vm, &x); kvm_vm_free(vm); /* @@ -133,18 +135,18 @@ int main(int argc, char *argv[]) * the guest in order to test AVIC. KVM disallows changing CPUID after * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC. */ - vm = vm_create_default(vcpu.id, 0, xapic_guest_code); - vcpu.is_x2apic = false; + vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); + x.is_x2apic = false; - cpuid = vcpu_get_cpuid(vm, vcpu.id); + cpuid = vcpu_get_cpuid(vm, x.vcpu->id); for (i = 0; i < cpuid->nent; i++) { if (cpuid->entries[i].function == 1) break; } cpuid->entries[i].ecx &= ~BIT(21); - vcpu_set_cpuid(vm, vcpu.id, cpuid); + vcpu_set_cpuid(vm, x.vcpu->id, cpuid); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - test_icr(vm, &vcpu); + test_icr(vm, &x); kvm_vm_free(vm); } -- cgit v1.2.3 From e5d86c7a032362b0051aaa75d8a1ee2291d16b42 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 08:35:01 -0800 Subject: KVM: selftests: Convert debug-exceptions away from VCPU_ID Convert debug-exceptions to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. 
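The essence of this and the next few conversions, as seen in the diff below: a 'struct kvm_vcpu' carries a back-pointer to its VM, so a helper that used to need a VM handle plus a global ID now needs a single argument:

    /* before: caller passes the VM and relies on a global VCPU_ID */
    static int debug_version(struct kvm_vm *vm);

    /* after: the vCPU knows both handles */
    static int debug_version(struct kvm_vcpu *vcpu)
    {
        uint64_t id_aa64dfr0;

        get_reg(vcpu->vm, vcpu->id,
                KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0);
        return id_aa64dfr0 & 0xf;    /* debug architecture version */
    }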
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/debug-exceptions.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 63b2178210c4..b69db0942169 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -3,8 +3,6 @@ #include #include -#define VCPU_ID 0 - #define MDSCR_KDE (1 << 13) #define MDSCR_MDE (1 << 15) #define MDSCR_SS (1 << 0) @@ -240,27 +238,28 @@ static void guest_svc_handler(struct ex_regs *regs) svc_addr = regs->pc; } -static int debug_version(struct kvm_vm *vm) +static int debug_version(struct kvm_vcpu *vcpu) { uint64_t id_aa64dfr0; - get_reg(vm, VCPU_ID, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); + get_reg(vcpu->vm, vcpu->id, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); return id_aa64dfr0 & 0xf; } int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; int stage; - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); ucall_init(vm, NULL); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); - if (debug_version(vm) < 6) { + if (debug_version(vcpu) < 6) { print_skip("Armv8 debug architecture not supported."); kvm_vm_free(vm); exit(KSFT_SKIP); @@ -278,9 +277,9 @@ int main(int argc, char *argv[]) ESR_EC_SVC64, guest_svc_handler); for (stage = 0; stage < 11; stage++) { - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == stage, "Stage %d: Unexpected sync ucall, got %lx", -- cgit v1.2.3 From afcda3dcb3787d100d6e9e3ee97ebf0ff3e67dbb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 11:50:15 -0700 Subject: KVM: selftests: Convert fix_hypercall_test away from VCPU_ID Convert fix_hypercall_test to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of using a global VCPU_ID. 
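A small but typical win visible in the diff that follows: the run structure hangs directly off the vCPU object, so the vcpu_state() lookup disappears:

    /* before */
    struct kvm_run *run = vcpu_state(vm, VCPU_ID);

    /* after */
    struct kvm_run *run = vcpu->run;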
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/fix_hypercall_test.c | 34 ++++++++++------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index 81f9f5b1f655..108c3f75361d 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -14,8 +14,6 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 0 - static bool ud_expected; static void guest_ud_handler(struct ex_regs *regs) @@ -94,22 +92,20 @@ static void guest_main(void) GUEST_DONE(); } -static void setup_ud_vector(struct kvm_vm *vm) +static void setup_ud_vector(struct kvm_vcpu *vcpu) { - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + vm_init_descriptor_tables(vcpu->vm); + vcpu_init_descriptor_tables(vcpu->vm, vcpu->id); + vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); } -static void enter_guest(struct kvm_vm *vm) +static void enter_guest(struct kvm_vcpu *vcpu) { - struct kvm_run *run; + struct kvm_run *run = vcpu->run; struct ucall uc; - run = vcpu_state(vm, VCPU_ID); - - vcpu_run(vm, VCPU_ID); - switch (get_ucall(vm, VCPU_ID, &uc)) { + vcpu_run(vcpu->vm, vcpu->id); + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]); break; @@ -125,25 +121,27 @@ static void enter_guest(struct kvm_vm *vm) static void test_fix_hypercall(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; - vm = vm_create_default(VCPU_ID, 0, guest_main); - setup_ud_vector(vm); + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + setup_ud_vector(vcpu); ud_expected = false; sync_global_to_guest(vm, ud_expected); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - enter_guest(vm); + enter_guest(vcpu); } static void test_fix_hypercall_disabled(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; - vm = vm_create_default(VCPU_ID, 0, guest_main); - setup_ud_vector(vm); + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + setup_ud_vector(vcpu); vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_FIX_HYPERCALL_INSN); @@ -153,7 +151,7 @@ static void test_fix_hypercall_disabled(void) virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - enter_guest(vm); + enter_guest(vcpu); } int main(void) -- cgit v1.2.3 From fd04edc3560c1be3321c50da1bb504ebc002e676 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 08:41:38 -0800 Subject: KVM: selftests: Convert vgic_irq away from VCPU_ID Convert vgic_irq to use vm_create_with_one_vcpu() and pass around a 'struct kvm_vcpu' object instead of passing around a vCPU ID (which is always the global VCPU_ID...). Opportunistically align the indentation for multiple functions' parameters.
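Beyond the test itself, this patch pushes the object into the vgic library, which lets helpers validate the vCPU they are handed; distilled from the library hunks below:

    void kvm_irq_write_ispendr(int gic_fd, uint32_t intid,
                               struct kvm_vcpu *vcpu);

    /* inside vgic_poke_irq(): private interrupts are per-vCPU */
    if (intid_is_private) {
        /* TODO: only vcpu 0 implemented for now. */
        assert(vcpu->id == 0);
        attr += SZ_64K;
    }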
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 30 ++++++++++++---------- tools/testing/selftests/kvm/include/aarch64/vgic.h | 6 +++-- tools/testing/selftests/kvm/lib/aarch64/vgic.c | 10 ++++---- 3 files changed, 25 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 87e41895b385..111170201e9b 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -22,7 +22,6 @@ #define GICD_BASE_GPA 0x08000000ULL #define GICR_BASE_GPA 0x080A0000ULL -#define VCPU_ID 0 /* * Stores the user specified args; it's passed to the guest and to every test @@ -589,7 +588,8 @@ static void kvm_set_gsi_routing_irqchip_check(struct kvm_vm *vm, } static void kvm_irq_write_ispendr_check(int gic_fd, uint32_t intid, - uint32_t vcpu, bool expect_failure) + struct kvm_vcpu *vcpu, + bool expect_failure) { /* * Ignore this when expecting failure as invalid intids will lead to @@ -659,15 +659,16 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, (tmp) < (uint64_t)(first) + (uint64_t)(num); \ (tmp)++, (i)++) -static void run_guest_cmd(struct kvm_vm *vm, int gic_fd, - struct kvm_inject_args *inject_args, - struct test_args *test_args) +static void run_guest_cmd(struct kvm_vcpu *vcpu, int gic_fd, + struct kvm_inject_args *inject_args, + struct test_args *test_args) { kvm_inject_cmd cmd = inject_args->cmd; uint32_t intid = inject_args->first_intid; uint32_t num = inject_args->num; int level = inject_args->level; bool expect_failure = inject_args->expect_failure; + struct kvm_vm *vm = vcpu->vm; uint64_t tmp; uint32_t i; @@ -705,12 +706,12 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd, break; case KVM_WRITE_ISPENDR: for (i = intid; i < intid + num; i++) - kvm_irq_write_ispendr_check(gic_fd, i, - VCPU_ID, expect_failure); + kvm_irq_write_ispendr_check(gic_fd, i, vcpu, + expect_failure); break; case KVM_WRITE_ISACTIVER: for (i = intid; i < intid + num; i++) - kvm_irq_write_isactiver(gic_fd, i, VCPU_ID); + kvm_irq_write_isactiver(gic_fd, i, vcpu); break; default: break; @@ -739,6 +740,7 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) { struct ucall uc; int gic_fd; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_inject_args inject_args; vm_vaddr_t args_gva; @@ -753,16 +755,16 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) print_args(&args); - vm = vm_create_default(VCPU_ID, 0, guest_code); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); ucall_init(vm, NULL); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); /* Setup the guest args page (so it gets the args). 
*/ args_gva = vm_vaddr_alloc_page(vm); memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); - vcpu_args_set(vm, 0, 1, args_gva); + vcpu_args_set(vm, vcpu->id, 1, args_gva); gic_fd = vgic_v3_setup(vm, 1, nr_irqs, GICD_BASE_GPA, GICR_BASE_GPA); @@ -775,12 +777,12 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) guest_irq_handlers[args.eoi_split][args.level_sensitive]); while (1) { - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_SYNC: kvm_inject_get_call(vm, &uc, &inject_args); - run_guest_cmd(vm, gic_fd, &inject_args, &args); + run_guest_cmd(vcpu, gic_fd, &inject_args, &args); break; case UCALL_ABORT: TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx", diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h index 4442081221a0..0ac6f05c63f9 100644 --- a/tools/testing/selftests/kvm/include/aarch64/vgic.h +++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h @@ -8,6 +8,8 @@ #include <linux/kvm.h> +#include "kvm_util.h" + #define REDIST_REGION_ATTR_ADDR(count, base, flags, index) \ (((uint64_t)(count) << 52) | \ ((uint64_t)((base) >> 16) << 16) | \ @@ -26,8 +28,8 @@ void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); /* The vcpu arg only applies to private interrupts. */ -void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, uint32_t vcpu); -void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, uint32_t vcpu); +void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); +void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); #define KVM_IRQCHIP_NUM_PINS (1020 - 32) diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index cfe3067efbf0..b5f28d21a947 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -127,8 +127,8 @@ void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); } -static void vgic_poke_irq(int gic_fd, uint32_t intid, - uint32_t vcpu, uint64_t reg_off) +static void vgic_poke_irq(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu, + uint64_t reg_off) { uint64_t reg = intid / 32; uint64_t index = intid % 32; @@ -141,7 +141,7 @@ static void vgic_poke_irq(int gic_fd, uint32_t intid, if (intid_is_private) { /* TODO: only vcpu 0 implemented for now. */ - assert(vcpu == 0); + assert(vcpu->id == 0); attr += SZ_64K; } @@ -159,12 +159,12 @@ static void vgic_poke_irq(int gic_fd, uint32_t intid, kvm_device_attr_set(gic_fd, group, attr, &val); } -void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, uint32_t vcpu) +void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) { vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISPENDR); } -void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, uint32_t vcpu) +void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) { vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); } -- cgit v1.2.3 From 033899489062e69d06e1ef7c0795ad9ac9bd47c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 09:02:27 -0800 Subject: KVM: selftests: Make arm64's guest_get_vcpuid() declaration arm64-only Move the declaration of guest_get_vcpuid() to include/aarch64/processor.h, as it is implemented and used only by arm64.
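For context, guest_get_vcpuid() is called from guest code; on arm64 the library stashes the vCPU ID in TPIDR_EL1 during aarch64_vcpu_setup() (visible later in this series), which is why no other architecture implements it. A hypothetical guest-side use:

    static void guest_code(void)
    {
        /* arm64-only helper, hence the arm64-only declaration */
        uint32_t me = guest_get_vcpuid();

        GUEST_SYNC(me);
    }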
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/aarch64/processor.h | 2 ++ tools/testing/selftests/kvm/include/kvm_util_base.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 59ece9d4e0d1..4d2d474b6874 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -207,4 +207,6 @@ void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, uint64_t arg6, struct arm_smccc_res *res); +uint32_t guest_get_vcpuid(void); + #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index fbc54e920383..d94b6083d678 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -707,6 +707,4 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); -uint32_t guest_get_vcpuid(void); - #endif /* SELFTEST_KVM_UTIL_BASE_H */ -- cgit v1.2.3 From b8592448370b1235119a0a135e99277430c76d53 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 09:08:00 -0800 Subject: KVM: selftests: Move vm_is_unrestricted_guest() to x86-64 An "unrestricted guest" is a VMX-only concept, move the relevant helper to x86-64 code. Assume most readers can correctly convert underscores to spaces and opportunistically trim the function comment. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 2 -- .../selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 33 ---------------------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 21 ++++++++++++++ 4 files changed, 22 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index d94b6083d678..5426de96e169 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -667,8 +667,6 @@ struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); */ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); -bool vm_is_unrestricted_guest(struct kvm_vm *vm); - unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); unsigned long vm_compute_max_gfn(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0bb9ba955d18..a19a52c50608 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -526,6 +526,7 @@ static inline void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t kvm_get_cpuid_max_basic(void); uint32_t kvm_get_cpuid_max_extended(void); void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); +bool vm_is_unrestricted_guest(struct kvm_vm *vm); struct ex_regs { uint64_t rax, rcx, rdx, rbx; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f8274ca5fe5b..752731cf4292 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1948,39 +1948,6 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); } -/* - * Is Unrestricted Guest - * - * Input Args: - * vm - Virtual Machine - * - * Output Args: None - * - * Return: True if the unrestricted guest is set to 'Y', otherwise return false. - * - * Check if the unrestricted guest flag is enabled. - */ -bool vm_is_unrestricted_guest(struct kvm_vm *vm) -{ - char val = 'N'; - size_t count; - FILE *f; - - if (vm == NULL) { - /* Ensure that the KVM vendor-specific module is loaded. */ - close(open_kvm_dev_path_or_exit()); - } - - f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); - if (f) { - count = fread(&val, sizeof(char), 1, f); - TEST_ASSERT(count == 1, "Unable to read from param file."); - fclose(f); - } - - return val == 'Y'; -} - unsigned int vm_get_page_size(struct kvm_vm *vm) { return vm->page_size; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 47bc84d1aeb0..d25a6a9cdfee 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1357,3 +1357,24 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) done: return min(max_gfn, ht_gfn - 1); } + +/* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ +bool vm_is_unrestricted_guest(struct kvm_vm *vm) +{ + char val = 'N'; + size_t count; + FILE *f; + + /* Ensure that a KVM vendor-specific module is loaded. */ + if (vm == NULL) + close(open_kvm_dev_path_or_exit()); + + f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); + if (f) { + count = fread(&val, sizeof(char), 1, f); + TEST_ASSERT(count == 1, "Unable to read from param file."); + fclose(f); + } + + return val == 'Y'; +} -- cgit v1.2.3 From 9931be3fc62edad381de074ad0db576eefed7fee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 09:30:39 -0800 Subject: KVM: selftests: Add "arch" to common utils that have arch implementations Add "arch" into the name of utility functions that are declared in common code, but (surprise!) have arch-specific implementations. Shuffle code around so that all such helpers' declarations are bundled together. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 200 ++++++++++++--------- .../testing/selftests/kvm/lib/aarch64/processor.c | 12 +- tools/testing/selftests/kvm/lib/riscv/processor.c | 12 +- tools/testing/selftests/kvm/lib/s390x/processor.c | 12 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 12 +- 5 files changed, 141 insertions(+), 107 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 5426de96e169..c7abe48d07cb 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -95,23 +95,6 @@ struct kvm_vm { struct kvm_vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); -/* - * Virtual Translation Tables Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps to the FILE stream given by @stream, the contents of all the - * virtual translation tables for the VM given by @vm. 
- */ -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - struct userspace_mem_region * memslot2region(struct kvm_vm *vm, uint32_t memslot); @@ -291,25 +274,6 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm) return fd; } -/* - * VM VCPU Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * vcpuid - VCPU ID - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the current state of the VCPU specified by @vcpuid, within the VM - * given by @vm, to the FILE stream given by @stream. - */ -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, - uint8_t indent); - void vm_create_irqchip(struct kvm_vm *vm); void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, @@ -336,23 +300,6 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); -/* - * Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gva - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Returns the VM physical address of the translated VM virtual - * address given by @gva. - */ -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); - struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); @@ -569,26 +516,6 @@ void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); const char *exit_reason_str(unsigned int exit_reason); -void virt_pgd_alloc(struct kvm_vm *vm); - -/* - * VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * vaddr - VM Virtual Address - * paddr - VM Physical Address - * memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within @vm, creates a virtual translation for the page starting - * at @vaddr to the page starting at @paddr. - */ -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); - vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, uint32_t memslot); vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, @@ -657,16 +584,6 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); -/* - * Adds a vCPU with reasonable defaults (e.g. a stack) - * - * Input Args: - * vm - Virtual Machine - * vcpuid - The id of the VCPU to add to the VM. - * guest_code - The vCPU's entry point - */ -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); - unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); unsigned long vm_compute_max_gfn(struct kvm_vm *vm); @@ -705,4 +622,121 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); +/* + * VM VCPU Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * vcpuid - VCPU ID + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the current state of the VCPU specified by @vcpuid, within the VM + * given by @vm, to the FILE stream given by @stream. 
+ */ + +void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, + uint8_t indent); + +static inline void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, + uint8_t indent) +{ + vcpu_arch_dump(stream, vm, vcpuid, indent); +} + +/* + * Adds a vCPU with reasonable defaults (e.g. a stack) + * + * Input Args: + * vm - Virtual Machine + * vcpuid - The id of the VCPU to add to the VM. + * guest_code - The vCPU's entry point + */ +void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); + +static inline void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, + void *guest_code) +{ + vm_arch_vcpu_add(vm, vcpuid, guest_code); +} + +void virt_arch_pgd_alloc(struct kvm_vm *vm); + +static inline void virt_pgd_alloc(struct kvm_vm *vm) +{ + virt_arch_pgd_alloc(vm); +} + +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within @vm, creates a virtual translation for the page starting + * at @vaddr to the page starting at @paddr. + */ +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); + +static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + virt_arch_pg_map(vm, vaddr, paddr); +} + + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Returns the VM physical address of the translated VM virtual + * address given by @gva. + */ +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); + +static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return addr_arch_gva2gpa(vm, gva); +} + +/* + * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. 
+ */ +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + virt_arch_dump(stream, vm, indent); +} + #endif /* SELFTEST_KVM_UTIL_BASE_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 2e73853f485e..d14579176e52 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -74,7 +74,7 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) return 1 << (vm->page_shift - 3); } -void virt_pgd_alloc(struct kvm_vm *vm) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { if (!vm->pgd_created) { vm_paddr_t paddr = vm_phy_pages_alloc(vm, @@ -131,14 +131,14 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */; } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ _virt_pg_map(vm, vaddr, paddr, attr_idx); } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; @@ -195,7 +195,7 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p #endif } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int level = 4 - (vm->pgtable_levels - 1); uint64_t pgd, *ptep; @@ -303,7 +303,7 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpuid); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { uint64_t pstate, pc; @@ -330,7 +330,7 @@ void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) { aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code); } diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 5ee8250dd74c..d70d5a4c5ad6 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -53,7 +53,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) return (gva & pte_index_mask[level]) >> pte_index_shift[level]; } -void virt_pgd_alloc(struct kvm_vm *vm) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { if (!vm->pgd_created) { vm_paddr_t paddr = vm_phy_pages_alloc(vm, @@ -64,7 +64,7 @@ void virt_pgd_alloc(struct kvm_vm *vm) } } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t *ptep, next_ppn; int level = vm->pgtable_levels - 1; @@ -108,7 +108,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) PGTBL_PTE_PERM_MASK | PGTBL_PTE_VALID_MASK; } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; int level = vm->pgtable_levels - 1; @@ -159,7 +159,7 @@ 
static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, #endif } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int level = vm->pgtable_levels - 1; uint64_t pgd, *ptep; @@ -201,7 +201,7 @@ void riscv_vcpu_mmu_setup(struct kvm_vm *vm, int vcpuid) set_reg(vm, vcpuid, RISCV_CSR_REG(satp), satp); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { struct kvm_riscv_core core; @@ -274,7 +274,7 @@ static void __aligned(16) guest_unexp_trap(void) 0, 0, 0, 0, 0, 0); } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) { int r; size_t stack_size = vm->page_size == 4096 ? diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index aec15ca9d887..c2fe56a3fb74 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -10,7 +10,7 @@ #define PAGES_PER_REGION 4 -void virt_pgd_alloc(struct kvm_vm *vm) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { vm_paddr_t paddr; @@ -46,7 +46,7 @@ static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri) | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); } -void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) { int ri, idx; uint64_t *entry; @@ -85,7 +85,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) entry[idx] = gpa; } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { int ri, idx; uint64_t *entry; @@ -146,7 +146,7 @@ static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, } } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { if (!vm->pgd_created) return; @@ -154,7 +154,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump_region(stream, vm, indent, vm->pgd); } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) { size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); uint64_t stack_vaddr; @@ -205,7 +205,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
va_end(ap); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index d25a6a9cdfee..1bc406b6317d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -109,7 +109,7 @@ static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) } } -void virt_pgd_alloc(struct kvm_vm *vm) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); @@ -207,7 +207,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); } @@ -302,7 +302,7 @@ void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, *(uint64_t *)new_pte = pte; } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { uint64_t *pml4e, *pml4e_start; uint64_t *pdpe, *pdpe_start; @@ -483,7 +483,7 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, kvm_seg_fill_gdt_64bit(vm, segp); } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint16_t index[4]; uint64_t *pml4e, *pdpe, *pde; @@ -632,7 +632,7 @@ void vm_xsave_req_perm(int bit) bitmask); } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) { struct kvm_mp_state mp_state; struct kvm_regs regs; @@ -873,7 +873,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) va_end(ap); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { struct kvm_regs regs; struct kvm_sregs sregs; -- cgit v1.2.3 From 1422efd6bb75e4ae038432449bf9229d6be8e0b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 09:37:11 -0800 Subject: KVM: selftests: Return created vcpu from vm_vcpu_add_default() Return the created 'struct kvm_vcpu' object from vm_vcpu_add_default(), which cleans up a few tests and will eventually allow removing vcpu_get() entirely. Opportunistically rename @vcpuid to @vcpu_id to follow preferred kernel style. 
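The call-site cleanup in miniature, taken from the pmu_event_filter and tsc_scaling_sync hunks below:

    /* before: create the vCPU, then fish the object back out */
    vm_vcpu_add_default(vm, 0, guest_code);
    vcpu = vcpu_get(vm, 0);

    /* after: one call, one handle */
    vcpu = vm_vcpu_add_default(vm, 0, guest_code);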
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/aarch64/processor.h | 5 +++-- tools/testing/selftests/kvm/include/kvm_util_base.h | 10 ++++++---- tools/testing/selftests/kvm/lib/aarch64/processor.c | 20 ++++++++++++-------- tools/testing/selftests/kvm/lib/riscv/processor.c | 20 ++++++++++++-------- tools/testing/selftests/kvm/lib/s390x/processor.c | 18 +++++++++++------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 20 ++++++++++++-------- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 4 +--- .../testing/selftests/kvm/x86_64/tsc_scaling_sync.c | 3 +-- 8 files changed, 58 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 4d2d474b6874..9dad391b4fec 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -64,8 +64,9 @@ static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint } void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init *init); -void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_init *init, void *guest_code); +struct kvm_vcpu *aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, + void *guest_code); struct ex_regs { u64 regs[31]; diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c7abe48d07cb..622b09ec23dd 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -656,12 +656,14 @@ static inline void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, * vcpuid - The id of the VCPU to add to the VM. * guest_code - The vCPU's entry point */ -void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code); -static inline void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, - void *guest_code) +static inline struct kvm_vcpu *vm_vcpu_add_default(struct kvm_vm *vm, + uint32_t vcpu_id, + void *guest_code) { - vm_arch_vcpu_add(vm, vcpuid, guest_code); + return vm_arch_vcpu_add(vm, vcpu_id, guest_code); } void virt_arch_pgd_alloc(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index d14579176e52..2b169b4ec29e 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -314,25 +314,29 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t in indent, "", pstate, pc); } -void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_init *init, void *guest_code) +struct kvm_vcpu *aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, + void *guest_code) { size_t stack_size = vm->page_size == 4096 ? 
DEFAULT_STACK_PGS * vm->page_size : vm->page_size; uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); + struct kvm_vcpu *vcpu = vm_vcpu_add(vm, vcpu_id); - vm_vcpu_add(vm, vcpuid); - aarch64_vcpu_setup(vm, vcpuid, init); + aarch64_vcpu_setup(vm, vcpu_id, init); - set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); - set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + set_reg(vm, vcpu_id, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + set_reg(vm, vcpu_id, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + + return vcpu; } -void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) { - aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code); + return aarch64_vcpu_add_default(vm, vcpu_id, NULL, guest_code); } void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index d70d5a4c5ad6..5946101144eb 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -274,7 +274,8 @@ static void __aligned(16) guest_unexp_trap(void) 0, 0, 0, 0, 0, 0); } -void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) { int r; size_t stack_size = vm->page_size == 4096 ? @@ -284,9 +285,10 @@ void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) DEFAULT_RISCV_GUEST_STACK_VADDR_MIN); unsigned long current_gp = 0; struct kvm_mp_state mps; + struct kvm_vcpu *vcpu; - vm_vcpu_add(vm, vcpuid); - riscv_vcpu_mmu_setup(vm, vcpuid); + vcpu = vm_vcpu_add(vm, vcpu_id); + riscv_vcpu_mmu_setup(vm, vcpu_id); /* * With SBI HSM support in KVM RISC-V, all secondary VCPUs are @@ -294,23 +296,25 @@ void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) * are powered-on using KVM_SET_MP_STATE ioctl(). */ mps.mp_state = KVM_MP_STATE_RUNNABLE; - r = __vcpu_ioctl(vm, vcpuid, KVM_SET_MP_STATE, &mps); + r = __vcpu_ioctl(vm, vcpu_id, KVM_SET_MP_STATE, &mps); TEST_ASSERT(!r, "IOCTL KVM_SET_MP_STATE failed (error %d)", r); /* Setup global pointer of guest to be same as the host */ asm volatile ( "add %0, gp, zero" : "=r" (current_gp) : : "memory"); - set_reg(vm, vcpuid, RISCV_CORE_REG(regs.gp), current_gp); + set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.gp), current_gp); /* Setup stack pointer and program counter of guest */ - set_reg(vm, vcpuid, RISCV_CORE_REG(regs.sp), + set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.sp), stack_vaddr + stack_size); - set_reg(vm, vcpuid, RISCV_CORE_REG(regs.pc), + set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code); /* Setup default exception vector of guest */ - set_reg(vm, vcpuid, RISCV_CSR_REG(stvec), + set_reg(vm, vcpu_id, RISCV_CSR_REG(stvec), (unsigned long)guest_unexp_trap); + + return vcpu; } void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index c2fe56a3fb74..cf759844b226 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -154,12 +154,14 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump_region(stream, vm, indent, vm->pgd); } -void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) { size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); uint64_t stack_vaddr; struct kvm_regs regs; struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; struct kvm_run *run; TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", @@ -168,21 +170,23 @@ void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) stack_vaddr = vm_vaddr_alloc(vm, stack_size, DEFAULT_GUEST_STACK_VADDR_MIN); - vm_vcpu_add(vm, vcpuid); + vcpu = vm_vcpu_add(vm, vcpu_id); /* Setup guest registers */ - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vm, vcpu_id, ®s); regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160; - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vm, vcpu_id, ®s); - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vm, vcpu_id, &sregs); sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ - vcpu_sregs_set(vm, vcpuid, &sregs); + vcpu_sregs_set(vm, vcpu_id, &sregs); - run = vcpu_state(vm, vcpuid); + run = vcpu_state(vm, vcpu_id); run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ run->psw_addr = (uintptr_t)guest_code; + + return vcpu; } void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 1bc406b6317d..bafa8ec54569 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -632,29 +632,33 @@ void vm_xsave_req_perm(int bit) bitmask); } -void vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) { struct kvm_mp_state mp_state; struct kvm_regs regs; vm_vaddr_t stack_vaddr; + struct kvm_vcpu *vcpu; + stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), DEFAULT_GUEST_STACK_VADDR_MIN); - /* Create VCPU */ - vm_vcpu_add(vm, vcpuid); - vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); - vcpu_setup(vm, vcpuid); + vcpu = vm_vcpu_add(vm, vcpu_id); + vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); + vcpu_setup(vm, vcpu_id); /* Setup guest general purpose registers */ - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vm, vcpu_id, ®s); regs.rflags = regs.rflags | 0x2; regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()); regs.rip = (unsigned long) guest_code; - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vm, vcpu_id, ®s); /* Setup the MP state */ mp_state.mp_state = 0; - vcpu_mp_state_set(vm, vcpuid, &mp_state); + vcpu_mp_state_set(vm, vcpu_id, &mp_state); + + return vcpu; } /* diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index d2c571f20521..2faa43336131 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -369,10 +369,8 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); - vm_vcpu_add_default(vm, 0, guest_code); + vcpu = vm_vcpu_add_default(vm, 0, guest_code); vm_init_descriptor_tables(vm); - - vcpu = vcpu_get(vm, 0); vcpu_init_descriptor_tables(vm, vcpu->id); TEST_ASSERT(!sanity_check_pmu(vcpu), diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index b7cd5c47fc53..ea70ca2e63c3 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -54,8 +54,7 @@ static void *run_vcpu(void *_cpu_nr) /* The kernel is fine, but vm_vcpu_add_default() needs locking */ pthread_spin_lock(&create_lock); - vm_vcpu_add_default(vm, vcpu_id, guest_code); - vcpu = vcpu_get(vm, vcpu_id); + vcpu = vm_vcpu_add_default(vm, vcpu_id, guest_code); if (!first_cpu_done) { first_cpu_done = true; -- cgit v1.2.3 From f742d94ff4e5147e08b3bb7826f009eda7545124 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 09:56:24 -0800 Subject: KVM: selftests: Rename vm_vcpu_add* helpers to better show relationships Rename vm_vcpu_add() to __vm_vcpu_add(), and vm_vcpu_add_default() to vm_vcpu_add() to show the relationship between the newly minted vm_vcpu_add() and __vm_vcpu_add(). 
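The resulting names follow the selftests' double-underscore idiom: the bare helper performs only the KVM_CREATE_VCPU step, while the undecorated wrapper also sets up a stack, registers, and an entry point:

    /* minimal: just creates the vCPU (was vm_vcpu_add) */
    vcpu = __vm_vcpu_add(vm, 0);

    /* common case: create and fully initialize (was vm_vcpu_add_default) */
    vcpu = vm_vcpu_add(vm, 0, guest_code);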
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 2 +- tools/testing/selftests/kvm/aarch64/psci_test.c | 4 ++-- tools/testing/selftests/kvm/aarch64/vcpu_width_config.c | 8 ++++---- tools/testing/selftests/kvm/aarch64/vgic_init.c | 10 +++++----- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- tools/testing/selftests/kvm/hardware_disable_test.c | 2 +- tools/testing/selftests/kvm/include/aarch64/processor.h | 5 ++--- tools/testing/selftests/kvm/include/kvm_util_base.h | 7 +++---- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 2 +- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 2 +- tools/testing/selftests/kvm/lib/aarch64/processor.c | 9 ++++----- tools/testing/selftests/kvm/lib/kvm_util.c | 6 +++--- tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- tools/testing/selftests/kvm/lib/s390x/processor.c | 2 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 2 +- tools/testing/selftests/kvm/set_memory_region_test.c | 2 +- tools/testing/selftests/kvm/steal_time.c | 2 +- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 2 +- tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 4 ++-- tools/testing/selftests/kvm/x86_64/set_sregs_test.c | 2 +- tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 8 ++++---- tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c | 4 ++-- tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c | 2 +- 23 files changed, 44 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 5476bb465b78..fbb0c714211d 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -418,7 +418,7 @@ static void run_test(struct vcpu_config *c) vm = vm_create_barebones(); prepare_vcpu_init(c, &init); - vm_vcpu_add(vm, 0); + __vm_vcpu_add(vm, 0); aarch64_vcpu_setup(vm, 0, &init); finalize_vcpu(vm, 0, c); diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index fa4e6c3343d7..347cb5c130e2 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -84,8 +84,8 @@ static struct kvm_vm *setup_vm(void *guest_code) vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); - aarch64_vcpu_add_default(vm, VCPU_ID_SOURCE, &init, guest_code); - aarch64_vcpu_add_default(vm, VCPU_ID_TARGET, &init, guest_code); + aarch64_vcpu_add(vm, VCPU_ID_SOURCE, &init, guest_code); + aarch64_vcpu_add(vm, VCPU_ID_TARGET, &init, guest_code); return vm; } diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index 1757f44dd3e2..1dd856a58f5d 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -26,12 +26,12 @@ static int add_init_2vcpus(struct kvm_vcpu_init *init1, vm = vm_create_barebones(); - vm_vcpu_add(vm, 0); + __vm_vcpu_add(vm, 0); ret = __vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); if (ret) goto free_exit; - vm_vcpu_add(vm, 1); + __vm_vcpu_add(vm, 1); ret = __vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); free_exit: @@ -51,8 +51,8 @@ static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init1, vm = vm_create_barebones(); - vm_vcpu_add(vm, 0); - vm_vcpu_add(vm, 1); + __vm_vcpu_add(vm, 0); + __vm_vcpu_add(vm, 1); ret = 
__vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); if (ret) diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index c5866c3f4516..451f65b199ad 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -331,7 +331,7 @@ static void test_vgic_then_vcpus(uint32_t gic_dev_type) /* Add the rest of the VCPUs */ for (i = 1; i < NR_VCPUS; ++i) - vm_vcpu_add_default(v.vm, i, guest_code); + vm_vcpu_add(v.vm, i, guest_code); ret = run_vcpu(v.vm, 3); TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); @@ -418,17 +418,17 @@ static void test_v3_typer_accesses(void) v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); - vm_vcpu_add_default(v.vm, 3, guest_code); + vm_vcpu_add(v.vm, 3, guest_code); v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EINVAL, "attempting to read GICR_TYPER of non created vcpu"); - vm_vcpu_add_default(v.vm, 1, guest_code); + vm_vcpu_add(v.vm, 1, guest_code); v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EBUSY, "read GICR_TYPER before GIC initialized"); - vm_vcpu_add_default(v.vm, 2, guest_code); + vm_vcpu_add(v.vm, 2, guest_code); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); @@ -559,7 +559,7 @@ static void test_v3_redist_ipa_range_check_at_vcpu_run(void) /* Add the rest of the VCPUs */ for (i = 1; i < NR_VCPUS; ++i) - vm_vcpu_add_default(v.vm, i, guest_code); + vm_vcpu_add(v.vm, i, guest_code); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 23e0c727e375..1a5c01c65044 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -676,7 +676,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, vm = __vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); log_mode_create_vm_done(vm); - vm_vcpu_add_default(vm, vcpuid, guest_code); + vm_vcpu_add(vm, vcpuid, guest_code); return vm; } diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 29f6ca51408f..be2763ecb6e7 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -108,7 +108,7 @@ static void run_test(uint32_t run) pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { - vm_vcpu_add_default(vm, i, guest_code); + vm_vcpu_add(vm, i, guest_code); payloads[i].vm = vm; payloads[i].index = i; diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 9dad391b4fec..f774609f7848 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -64,9 +64,8 @@ static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint } void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init *init); -struct kvm_vcpu *aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_vcpu_init *init, - void *guest_code); +struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, void *guest_code); struct ex_regs { u64 regs[31]; diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h 
b/tools/testing/selftests/kvm/include/kvm_util_base.h index 622b09ec23dd..2c7a8a91ebe2 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -288,7 +288,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); -struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); @@ -659,9 +659,8 @@ static inline void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code); -static inline struct kvm_vcpu *vm_vcpu_add_default(struct kvm_vm *vm, - uint32_t vcpu_id, - void *guest_code) +static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) { return vm_arch_vcpu_add(vm, vcpu_id, guest_code); } diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index edeb08239036..407e9ea8e6f3 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -223,7 +223,7 @@ int main(int argc, char *argv[]) for (i = 0; i < max_vm; ++i) { vms[i] = vm_create_barebones(); for (j = 0; j < max_vcpu; ++j) - vm_vcpu_add(vms[i], j); + __vm_vcpu_add(vms[i], j); } /* Check stats read for every VM and VCPU */ diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index acc92703f563..3ae0237e96b2 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -32,7 +32,7 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus) for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++) /* This asserts that the vCPU was created. */ - vm_vcpu_add(vm, i); + __vm_vcpu_add(vm, i); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 2b169b4ec29e..5b95fa2cce18 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -314,16 +314,15 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t in indent, "", pstate, pc); } -struct kvm_vcpu *aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpu_id, - struct kvm_vcpu_init *init, - void *guest_code) +struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, void *guest_code) { size_t stack_size = vm->page_size == 4096 ? 
DEFAULT_STACK_PGS * vm->page_size : vm->page_size; uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); - struct kvm_vcpu *vcpu = vm_vcpu_add(vm, vcpu_id); + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); aarch64_vcpu_setup(vm, vcpu_id, init); @@ -336,7 +335,7 @@ struct kvm_vcpu *aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, void *guest_code) { - return aarch64_vcpu_add_default(vm, vcpu_id, NULL, guest_code); + return aarch64_vcpu_add(vm, vcpu_id, NULL, guest_code); } void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 752731cf4292..c2a99f26e9ba 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -328,7 +328,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, for (i = 0; i < nr_vcpus; ++i) { uint32_t vcpuid = vcpuids ? vcpuids[i] : i; - vm_vcpu_add_default(vm, vcpuid, guest_code); + vm_vcpu_add(vm, vcpuid, guest_code); } return vm; @@ -397,7 +397,7 @@ struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) { kvm_vm_restart(vm); - return vm_vcpu_add(vm, 0); + return __vm_vcpu_add(vm, 0); } /* @@ -1065,7 +1065,7 @@ static int vcpu_mmap_sz(void) * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id. * No additional vCPU setup is done. Returns the vCPU. */ -struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) { struct kvm_vcpu *vcpu; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 5946101144eb..ba5761843c76 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -287,7 +287,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_mp_state mps; struct kvm_vcpu *vcpu; - vcpu = vm_vcpu_add(vm, vcpu_id); + vcpu = __vm_vcpu_add(vm, vcpu_id); riscv_vcpu_mmu_setup(vm, vcpu_id); /* diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index cf759844b226..f8170e97eeb7 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -170,7 +170,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, stack_vaddr = vm_vaddr_alloc(vm, stack_size, DEFAULT_GUEST_STACK_VADDR_MIN); - vcpu = vm_vcpu_add(vm, vcpu_id); + vcpu = __vm_vcpu_add(vm, vcpu_id); /* Setup guest registers */ vcpu_regs_get(vm, vcpu_id, ®s); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index bafa8ec54569..f89d67101bf1 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -643,7 +643,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), DEFAULT_GUEST_STACK_VADDR_MIN); - vcpu = vm_vcpu_add(vm, vcpu_id); + vcpu = __vm_vcpu_add(vm, vcpu_id); vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); vcpu_setup(vm, vcpu_id); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 
1274bbb0e30b..d832fc12984e 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -315,7 +315,7 @@ static void test_zero_memory_regions(void) pr_info("Testing KVM_RUN with zero added memory regions\n"); vm = vm_create_barebones(); - vcpu = vm_vcpu_add(vm, 0); + vcpu = __vm_vcpu_add(vm, 0); vm_ioctl(vm, KVM_SET_NR_MMU_PAGES, (void *)64ul); vcpu_run(vm, vcpu->id); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 75303fe8359d..fd3533582509 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -275,7 +275,7 @@ int main(int ac, char **av) /* Add the rest of the VCPUs */ for (i = 1; i < NR_VCPUS; ++i) - vm_vcpu_add_default(vm, i, guest_code); + vm_vcpu_add(vm, i, guest_code); steal_time_init(vm); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 2faa43336131..ffa9e267188c 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -369,7 +369,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); - vcpu = vm_vcpu_add_default(vm, 0, guest_code); + vcpu = vm_vcpu_add(vm, 0, guest_code); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, vcpu->id); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 9ba3cd4e7f20..e63709894030 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -92,9 +92,9 @@ static struct kvm_vm *create_vm(void) static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) { if (bsp_code) - vm_vcpu_add_default(vm, vcpuid, guest_bsp_vcpu); + vm_vcpu_add(vm, vcpuid, guest_bsp_vcpu); else - vm_vcpu_add_default(vm, vcpuid, guest_not_bsp_vcpu); + vm_vcpu_add(vm, vcpuid, guest_not_bsp_vcpu); } static void run_vm_bsp(uint32_t bsp_vcpu) diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 8a5c1f76287c..2e67df3a95ba 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -95,7 +95,7 @@ int main(int argc, char *argv[]) * the vCPU model, i.e. without doing KVM_SET_CPUID2. */ vm = vm_create_barebones(); - vcpu = vm_vcpu_add(vm, 0); + vcpu = __vm_vcpu_add(vm, 0); vcpu_sregs_get(vm, vcpu->id, &sregs); diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 245fd0755390..ec418b823273 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -56,7 +56,7 @@ static struct kvm_vm *sev_vm_create(bool es) vm = vm_create_barebones(); sev_ioctl(vm->fd, es ? 
KVM_SEV_ES_INIT : KVM_SEV_INIT, NULL); for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) - vm_vcpu_add(vm, i); + __vm_vcpu_add(vm, i); if (es) start.policy |= SEV_POLICY_ES; sev_ioctl(vm->fd, KVM_SEV_LAUNCH_START, &start); @@ -75,7 +75,7 @@ static struct kvm_vm *aux_vm_create(bool with_vcpus) return vm; for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) - vm_vcpu_add(vm, i); + __vm_vcpu_add(vm, i); return vm; } @@ -182,7 +182,7 @@ static void test_sev_migrate_parameters(void) sev_es_vm = sev_vm_create(/* es= */ true); sev_es_vm_no_vmsa = vm_create_barebones(); sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); - vm_vcpu_add(sev_es_vm_no_vmsa, 1); + __vm_vcpu_add(sev_es_vm_no_vmsa, 1); ret = __sev_migrate_from(sev_vm, sev_es_vm); TEST_ASSERT( @@ -278,7 +278,7 @@ static void test_sev_mirror(bool es) /* Check that we can complete creation of the mirror VM. */ for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) - vm_vcpu_add(dst_vm, i); + __vm_vcpu_add(dst_vm, i); if (es) sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index ea70ca2e63c3..2411215e7ae8 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -51,10 +51,10 @@ static void *run_vcpu(void *_cpu_nr) static bool first_cpu_done; struct kvm_vcpu *vcpu; - /* The kernel is fine, but vm_vcpu_add_default() needs locking */ + /* The kernel is fine, but vm_vcpu_add() needs locking */ pthread_spin_lock(&create_lock); - vcpu = vm_vcpu_add_default(vm, vcpu_id, guest_code); + vcpu = vm_vcpu_add(vm, vcpu_id, guest_code); if (!first_cpu_done) { first_cpu_done = true; diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index afbbc40df884..8b366652be31 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -425,7 +425,7 @@ int main(int argc, char *argv[]) virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - vm_vcpu_add_default(vm, SENDER_VCPU_ID, sender_guest_code); + vm_vcpu_add(vm, SENDER_VCPU_ID, sender_guest_code); test_data_page_vaddr = vm_vaddr_alloc_page(vm); data = -- cgit v1.2.3 From 682b11a012b85c18310a6798fe2509c55bf6563e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:07:09 -0800 Subject: KVM: selftests: Convert set_boot_cpu_id away from global VCPU_IDs Rework set_boot_cpu_id to pass around 'struct kvm_vcpu' objects instead of relying on global VCPU_IDs. The test is still ugly, but that's unavoidable since the point of the test is to verify that KVM correctly assigns VCPU_ID==0 to be the BSP by default. This is literally one of two KVM selftests that legitimately needs to care about the exact vCPU IDs of the vCPUs it creates. 
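For review purposes, the reworked flow condenses to the sketch below; it restates run_vm_bsp() from the diff that follows (create_vm() and run_vcpu() are the test's own helpers, and example_run_vm_bsp is a hypothetical name used only for illustration):

    static void example_run_vm_bsp(uint32_t bsp_vcpu_id)
    {
        struct kvm_vcpu *vcpus[2];
        struct kvm_vm *vm;

        /* create_vm() designates @bsp_vcpu_id as the BSP and fills @vcpus. */
        vm = create_vm(ARRAY_SIZE(vcpus), bsp_vcpu_id, vcpus);

        /* vCPUs are passed around as objects; no global VCPU_ID macros. */
        run_vcpu(vcpus[0]);
        run_vcpu(vcpus[1]);

        kvm_vm_free(vm);
    }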
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 86 +++++++++------------- 1 file changed, 36 insertions(+), 50 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index e63709894030..b11f12888fad 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -16,10 +16,6 @@ #include "processor.h" #include "apic.h" -#define N_VCPU 2 -#define VCPU_ID0 0 -#define VCPU_ID1 1 - static void guest_bsp_vcpu(void *arg) { GUEST_SYNC(1); @@ -38,31 +34,30 @@ static void guest_not_bsp_vcpu(void *arg) GUEST_DONE(); } -static void test_set_boot_busy(struct kvm_vm *vm) +static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg) { - int res; + int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID, + (void *)(unsigned long)vcpu->id); - res = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID0); - TEST_ASSERT(res == -1 && errno == EBUSY, - "KVM_SET_BOOT_CPU_ID set while running vm"); + TEST_ASSERT(r == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set %s", msg); } -static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +static void run_vcpu(struct kvm_vcpu *vcpu) { struct ucall uc; int stage; for (stage = 0; stage < 2; stage++) { - vcpu_run(vm, vcpuid); + vcpu_run(vcpu->vm, vcpu->id); - switch (get_ucall(vm, vcpuid, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", stage + 1, (ulong)uc.args[1]); - test_set_boot_busy(vm); + test_set_bsp_busy(vcpu, "while running vm"); break; case UCALL_DONE: TEST_ASSERT(stage == 1, @@ -75,65 +70,56 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) uc.args[1], uc.args[2], uc.args[3]); default: TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + exit_reason_str(vcpu->run->exit_reason)); } } } -static struct kvm_vm *create_vm(void) +static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id, + struct kvm_vcpu *vcpus[]) { - uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * 2; - uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU; + uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * nr_vcpus; + uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * nr_vcpus; uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; + struct kvm_vm *vm; + uint32_t i; - return vm_create(pages); -} + vm = vm_create(pages); -static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) -{ - if (bsp_code) - vm_vcpu_add(vm, vcpuid, guest_bsp_vcpu); - else - vm_vcpu_add(vm, vcpuid, guest_not_bsp_vcpu); + vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id); + + for (i = 0; i < nr_vcpus; i++) + vcpus[i] = vm_vcpu_add(vm, i, i == bsp_vcpu_id ? 
guest_bsp_vcpu : + guest_not_bsp_vcpu); + return vm; } -static void run_vm_bsp(uint32_t bsp_vcpu) +static void run_vm_bsp(uint32_t bsp_vcpu_id) { + struct kvm_vcpu *vcpus[2]; struct kvm_vm *vm; - bool is_bsp_vcpu1 = bsp_vcpu == VCPU_ID1; - - vm = create_vm(); - if (is_bsp_vcpu1) - vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + vm = create_vm(ARRAY_SIZE(vcpus), bsp_vcpu_id, vcpus); - add_x86_vcpu(vm, VCPU_ID0, !is_bsp_vcpu1); - add_x86_vcpu(vm, VCPU_ID1, is_bsp_vcpu1); - - run_vcpu(vm, VCPU_ID0); - run_vcpu(vm, VCPU_ID1); + run_vcpu(vcpus[0]); + run_vcpu(vcpus[1]); kvm_vm_free(vm); } static void check_set_bsp_busy(void) { + struct kvm_vcpu *vcpus[2]; struct kvm_vm *vm; - int res; - - vm = create_vm(); - add_x86_vcpu(vm, VCPU_ID0, true); - add_x86_vcpu(vm, VCPU_ID1, false); + vm = create_vm(ARRAY_SIZE(vcpus), 0, vcpus); - res = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); - TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set after adding vcpu"); + test_set_bsp_busy(vcpus[1], "after adding vcpu"); - run_vcpu(vm, VCPU_ID0); - run_vcpu(vm, VCPU_ID1); + run_vcpu(vcpus[0]); + run_vcpu(vcpus[1]); - res = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); - TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set to a terminated vcpu"); + test_set_bsp_busy(vcpus[1], "to a terminated vcpu"); kvm_vm_free(vm); } @@ -145,9 +131,9 @@ int main(int argc, char *argv[]) return 0; } - run_vm_bsp(VCPU_ID0); - run_vm_bsp(VCPU_ID1); - run_vm_bsp(VCPU_ID0); + run_vm_bsp(0); + run_vm_bsp(1); + run_vm_bsp(0); check_set_bsp_busy(); } -- cgit v1.2.3 From b093da659f3d0ff291fcd6373090ba17cf183253 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:11:50 -0800 Subject: KVM: selftests: Convert psci_test away from VCPU_ID Pass around 'struct kvm_vcpu' objects in psci_test instead of relying on global VCPU_IDs. Ideally, the test wouldn't have to manually create vCPUs and thus care about vCPU IDs, but it's not the end of the world and avoiding that behavior isn't guaranteed to be a net positive (an attempt at macro shenanigans did not go very well). 
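The heart of the conversion is that setup_vm() now returns the source and target vCPU objects through out-params, so callers never touch a raw vCPU ID. A condensed sketch of host_test_cpu_on() from the diff below (assuming the selftests framework, with the register and ucall checks elided):

    struct kvm_vcpu *source, *target;
    struct kvm_vm *vm;

    /* setup_vm() creates both vCPUs and hands back the objects. */
    vm = setup_vm(guest_test_cpu_on, &source, &target);

    /* The target must be off before the source powers it on. */
    vcpu_power_off(target);
    enter_guest(source);

    assert_vcpu_reset(target);
    kvm_vm_free(vm);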
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 50 ++++++++++++------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 347cb5c130e2..d9695a939cc9 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -17,9 +17,6 @@ #include "processor.h" #include "test_util.h" -#define VCPU_ID_SOURCE 0 -#define VCPU_ID_TARGET 1 - #define CPU_ON_ENTRY_ADDR 0xfeedf00dul #define CPU_ON_CONTEXT_ID 0xdeadc0deul @@ -64,16 +61,17 @@ static uint64_t psci_features(uint32_t func_id) return res.a0; } -static void vcpu_power_off(struct kvm_vm *vm, uint32_t vcpuid) +static void vcpu_power_off(struct kvm_vcpu *vcpu) { struct kvm_mp_state mp_state = { .mp_state = KVM_MP_STATE_STOPPED, }; - vcpu_mp_state_set(vm, vcpuid, &mp_state); + vcpu_mp_state_set(vcpu->vm, vcpu->id, &mp_state); } -static struct kvm_vm *setup_vm(void *guest_code) +static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source, + struct kvm_vcpu **target) { struct kvm_vcpu_init init; struct kvm_vm *vm; @@ -84,28 +82,28 @@ static struct kvm_vm *setup_vm(void *guest_code) vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); - aarch64_vcpu_add(vm, VCPU_ID_SOURCE, &init, guest_code); - aarch64_vcpu_add(vm, VCPU_ID_TARGET, &init, guest_code); + *source = aarch64_vcpu_add(vm, 0, &init, guest_code); + *target = aarch64_vcpu_add(vm, 1, &init, guest_code); return vm; } -static void enter_guest(struct kvm_vm *vm, uint32_t vcpuid) +static void enter_guest(struct kvm_vcpu *vcpu) { struct ucall uc; - vcpu_run(vm, vcpuid); - if (get_ucall(vm, vcpuid, &uc) == UCALL_ABORT) + vcpu_run(vcpu->vm, vcpu->id); + if (get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); } -static void assert_vcpu_reset(struct kvm_vm *vm, uint32_t vcpuid) +static void assert_vcpu_reset(struct kvm_vcpu *vcpu) { uint64_t obs_pc, obs_x0; - get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &obs_pc); - get_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.pc), &obs_pc); + get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.regs[0]), &obs_x0); TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, "unexpected target cpu pc: %lx (expected: %lx)", @@ -133,25 +131,26 @@ static void guest_test_cpu_on(uint64_t target_cpu) static void host_test_cpu_on(void) { + struct kvm_vcpu *source, *target; uint64_t target_mpidr; struct kvm_vm *vm; struct ucall uc; - vm = setup_vm(guest_test_cpu_on); + vm = setup_vm(guest_test_cpu_on, &source, &target); /* * make sure the target is already off when executing the test. 
*/ - vcpu_power_off(vm, VCPU_ID_TARGET); + vcpu_power_off(target); - get_reg(vm, VCPU_ID_TARGET, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); - vcpu_args_set(vm, VCPU_ID_SOURCE, 1, target_mpidr & MPIDR_HWID_BITMASK); - enter_guest(vm, VCPU_ID_SOURCE); + get_reg(vm, target->id, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); + vcpu_args_set(vm, source->id, 1, target_mpidr & MPIDR_HWID_BITMASK); + enter_guest(source); - if (get_ucall(vm, VCPU_ID_SOURCE, &uc) != UCALL_DONE) + if (get_ucall(vm, source->id, &uc) != UCALL_DONE) TEST_FAIL("Unhandled ucall: %lu", uc.cmd); - assert_vcpu_reset(vm, VCPU_ID_TARGET); + assert_vcpu_reset(target); kvm_vm_free(vm); } @@ -169,16 +168,17 @@ static void guest_test_system_suspend(void) static void host_test_system_suspend(void) { + struct kvm_vcpu *source, *target; struct kvm_run *run; struct kvm_vm *vm; - vm = setup_vm(guest_test_system_suspend); + vm = setup_vm(guest_test_system_suspend, &source, &target); vm_enable_cap(vm, KVM_CAP_ARM_SYSTEM_SUSPEND, 0); - vcpu_power_off(vm, VCPU_ID_TARGET); - run = vcpu_state(vm, VCPU_ID_SOURCE); + vcpu_power_off(target); + run = source->run; - enter_guest(vm, VCPU_ID_SOURCE); + enter_guest(source); TEST_ASSERT(run->exit_reason == KVM_EXIT_SYSTEM_EVENT, "Unhandled exit reason: %u (%s)", -- cgit v1.2.3 From 0750388ca7110d332547f3669715442eac6a1254 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:15:08 -0800 Subject: KVM: selftests: Convert hardware_disable_test to pass around vCPU objects Pass around 'struct kvm_vcpu' objects in hardware_disable_test instead of the VM+vcpu_id (called "index" by the test). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/hardware_disable_test.c | 25 +++++++--------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index be2763ecb6e7..59bb43345a3e 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -27,12 +27,6 @@ sem_t *sem; -/* Arguments for the pthreads */ -struct payload { - struct kvm_vm *vm; - uint32_t index; -}; - static void guest_code(void) { for (;;) @@ -42,14 +36,14 @@ static void guest_code(void) static void *run_vcpu(void *arg) { - struct payload *payload = (struct payload *)arg; - struct kvm_run *state = vcpu_state(payload->vm, payload->index); + struct kvm_vcpu *vcpu = arg; + struct kvm_run *run = vcpu->run; - vcpu_run(payload->vm, payload->index); + vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(false, "%s: exited with reason %d: %s\n", - __func__, state->exit_reason, - exit_reason_str(state->exit_reason)); + __func__, run->exit_reason, + exit_reason_str(run->exit_reason)); pthread_exit(NULL); } @@ -92,11 +86,11 @@ static inline void check_join(pthread_t thread, void **retval) static void run_test(uint32_t run) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; cpu_set_t cpu_set; pthread_t threads[VCPU_NUM]; pthread_t throw_away; - struct payload payloads[VCPU_NUM]; void *b; uint32_t i, j; @@ -108,12 +102,9 @@ static void run_test(uint32_t run) pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { - vm_vcpu_add(vm, i, guest_code); - payloads[i].vm = vm; - payloads[i].index = i; + vcpu = vm_vcpu_add(vm, i, guest_code); - check_create_thread(&threads[i], NULL, run_vcpu, - (void *)&payloads[i]); + check_create_thread(&threads[i], NULL, run_vcpu, vcpu); 
check_set_affinity(threads[i], &cpu_set); for (j = 0; j < SLEEPING_THREAD_NUM; ++j) { -- cgit v1.2.3 From 0ffc70eab775b52432a76f26aada24c5b78ef0eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:19:18 -0800 Subject: KVM: selftests: Add VM creation helper that "returns" vCPUs Add a VM creator that "returns" the created vCPUs by filling the provided array. This will allow converting multi-vCPU tests away from hardcoded vCPU IDs. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 17 +++++++++++++---- tools/testing/selftests/kvm/kvm_page_table_test.c | 4 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 17 ++++++++++------- tools/testing/selftests/kvm/lib/perf_test_util.c | 5 +++-- 4 files changed, 28 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 2c7a8a91ebe2..c0b2158a53d5 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -563,10 +563,19 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t vcpuids[]); /* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ -struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t slot0_mem_pages, uint64_t extra_mem_pages, - uint32_t num_percpu_pages, void *guest_code, - uint32_t vcpuids[]); +struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[], struct kvm_vcpu *vcpus[]); + +static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, + void *guest_code, + struct kvm_vcpu *vcpus[]) +{ + return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, + DEFAULT_GUEST_PHY_PAGES, 0, 0, + guest_code, NULL, vcpus); +} /* * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 2c4a7563a4f8..e91bc7f1400d 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -268,8 +268,8 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, - guest_num_pages, 0, guest_code, NULL); + vm = __vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, + guest_num_pages, 0, guest_code, NULL, NULL); /* Align down GPA of the testing memslot */ if (!p->phys_offset) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index c2a99f26e9ba..3015d490a8f6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -296,12 +296,13 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) * extra_mem_pages is only used to calculate the maximum page table size, * no real memory allocation for non-slot0 memory in this function. 
*/ -struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t slot0_mem_pages, uint64_t extra_mem_pages, - uint32_t num_percpu_pages, void *guest_code, - uint32_t vcpuids[]) +struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[], struct kvm_vcpu *vcpus[]) { uint64_t vcpu_pages, extra_pg_pages, pages; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; int i; @@ -328,7 +329,9 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, for (i = 0; i < nr_vcpus; ++i) { uint32_t vcpuid = vcpuids ? vcpuids[i] : i; - vm_vcpu_add(vm, vcpuid, guest_code); + vcpu = vm_vcpu_add(vm, vcpuid, guest_code); + if (vcpus) + vcpus[i] = vcpu; } return vm; @@ -338,8 +341,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]) { - return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, - extra_mem_pages, num_percpu_pages, guest_code, vcpuids); + return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, + extra_mem_pages, num_percpu_pages, guest_code, vcpuids, NULL); } struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index f989ff91f022..d3ff05f00653 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -147,8 +147,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, * The memory is also added to memslot 0, but that's a benign side * effect as KVM allows aliasing HVAs in meslots. */ - vm = vm_create_with_vcpus(mode, vcpus, slot0_pages, guest_num_pages, 0, - perf_test_guest_code, NULL); + vm = __vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, + slot0_pages + guest_num_pages, 0, + perf_test_guest_code, NULL, NULL); pta->vm = vm; -- cgit v1.2.3 From 9980160482213a9334e864774f789336960568a3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:24:01 -0800 Subject: KVM: selftests: Convert steal_time away from VCPU_ID Convert steal_time to use vm_create_with_vcpus() and pass around a 'struct kvm_vcpu' object instead of requiring that the index into the array of vCPUs for a given vCPU is also the ID of the vCPU. 
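The conversion leans on the vm_create_with_vcpus() wrapper added earlier in this series, which fills a caller-provided array of vCPU objects. The resulting pattern is roughly as follows (a sketch using the test's NR_VCPUS, guest_code and run_vcpu(), with the steal-time accounting checks elided):

    struct kvm_vcpu *vcpus[NR_VCPUS];
    struct kvm_vm *vm;
    int i;

    /* One call creates the VM and NR_VCPUS vCPUs, filling @vcpus. */
    vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus);

    for (i = 0; i < NR_VCPUS; i++) {
        /* The array index is a plain index; the ID lives in the object. */
        vcpu_args_set(vm, vcpus[i]->id, 1, i);
        run_vcpu(vcpus[i]);
    }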
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/steal_time.c | 123 ++++++++++++++++--------------- 1 file changed, 62 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index fd3533582509..7a6645464925 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -58,36 +58,34 @@ static void guest_code(int cpu) GUEST_DONE(); } -static void steal_time_init(struct kvm_vm *vm) +static bool is_steal_time_supported(struct kvm_vcpu *vcpu) { - int i; + struct kvm_cpuid_entry2 *cpuid = kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES); - if (!(kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES)->eax & - KVM_FEATURE_STEAL_TIME)) { - print_skip("steal-time not supported"); - exit(KSFT_SKIP); - } + return cpuid && (cpuid->eax & KVM_FEATURE_STEAL_TIME); +} - for (i = 0; i < NR_VCPUS; ++i) { - int ret; +static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) +{ + int ret; - /* ST_GPA_BASE is identity mapped */ - st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); - sync_global_to_guest(vm, st_gva[i]); + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vcpu->vm, st_gva[i]); - ret = _vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); - TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); + ret = _vcpu_set_msr(vcpu->vm, vcpu->id, MSR_KVM_STEAL_TIME, + (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); + TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); - vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); - } + vcpu_set_msr(vcpu->vm, vcpu->id, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); } -static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { - struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); + struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); int i; - pr_info("VCPU%d:\n", vcpuid); + pr_info("VCPU%d:\n", vcpu_idx); pr_info(" steal: %lld\n", st->steal); pr_info(" version: %d\n", st->version); pr_info(" flags: %d\n", st->flags); @@ -158,49 +156,50 @@ static void guest_code(int cpu) GUEST_DONE(); } -static void steal_time_init(struct kvm_vm *vm) +static bool is_steal_time_supported(struct kvm_vcpu *vcpu) { struct kvm_device_attr dev = { .group = KVM_ARM_VCPU_PVTIME_CTRL, .attr = KVM_ARM_VCPU_PVTIME_IPA, }; - int i, ret; - ret = __vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev); - if (ret != 0 && errno == ENXIO) { - print_skip("steal-time not supported"); - exit(KSFT_SKIP); - } - - for (i = 0; i < NR_VCPUS; ++i) { - uint64_t st_ipa; + return !__vcpu_ioctl(vcpu->vm, vcpu->id, KVM_HAS_DEVICE_ATTR, &dev); +} - vcpu_ioctl(vm, i, KVM_HAS_DEVICE_ATTR, &dev); +static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) +{ + struct kvm_vm *vm = vcpu->vm; + uint64_t st_ipa; + int ret; - dev.addr = (uint64_t)&st_ipa; + struct kvm_device_attr dev = { + .group = KVM_ARM_VCPU_PVTIME_CTRL, + .attr = KVM_ARM_VCPU_PVTIME_IPA, + .addr = (uint64_t)&st_ipa, + }; - /* ST_GPA_BASE is identity mapped */ - st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); - sync_global_to_guest(vm, st_gva[i]); + vcpu_ioctl(vm, vcpu->id, KVM_HAS_DEVICE_ATTR, &dev); - st_ipa = (ulong)st_gva[i] | 1; - ret = __vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); - TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA 
didn't report EINVAL"); + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vm, st_gva[i]); - st_ipa = (ulong)st_gva[i]; - vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + st_ipa = (ulong)st_gva[i] | 1; + ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); - ret = __vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); - TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); + st_ipa = (ulong)st_gva[i]; + vcpu_ioctl(vm, vcpu->id, KVM_SET_DEVICE_ATTR, &dev); - } + ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); } -static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { - struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); + struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - pr_info("VCPU%d:\n", vcpuid); + pr_info("VCPU%d:\n", vcpu_idx); pr_info(" rev: %d\n", st->rev); pr_info(" attr: %d\n", st->attr); pr_info(" st_time: %ld\n", st->st_time); @@ -224,15 +223,13 @@ static void *do_steal_time(void *arg) return NULL; } -static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +static void run_vcpu(struct kvm_vcpu *vcpu) { struct ucall uc; - vcpu_args_set(vm, vcpuid, 1, vcpuid); - - vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + vcpu_run(vcpu->vm, vcpu->id); - switch (get_ucall(vm, vcpuid, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: case UCALL_DONE: break; @@ -241,12 +238,13 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) __FILE__, uc.args[1]); default: TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + exit_reason_str(vcpu->run->exit_reason)); } } int main(int ac, char **av) { + struct kvm_vcpu *vcpus[NR_VCPUS]; struct kvm_vm *vm; pthread_attr_t attr; pthread_t thread; @@ -266,26 +264,29 @@ int main(int ac, char **av) pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - /* Create a one VCPU guest and an identity mapped memslot for the steal time structure */ - vm = vm_create_default(0, 0, guest_code); + /* Create a VM and an identity mapped memslot for the steal time structure */ + vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); ucall_init(vm, NULL); - /* Add the rest of the VCPUs */ - for (i = 1; i < NR_VCPUS; ++i) - vm_vcpu_add(vm, i, guest_code); - - steal_time_init(vm); + if (!is_steal_time_supported(vcpus[0])) { + print_skip("steal-time not supported"); + exit(KSFT_SKIP); + } /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { + steal_time_init(vcpus[i], i); + + vcpu_args_set(vm, vcpus[i]->id, 1, i); + /* First VCPU run initializes steal-time */ - run_vcpu(vm, i); + run_vcpu(vcpus[i]); /* Second VCPU run, expect guest stolen time to be <= run_delay */ - run_vcpu(vm, i); + run_vcpu(vcpus[i]); sync_global_from_guest(vm, guest_stolen_time[i]); stolen_time = guest_stolen_time[i]; run_delay = get_run_delay(); @@ -306,7 +307,7 @@ int main(int ac, char **av) MIN_RUN_DELAY_NS, run_delay); /* Run VCPU again to confirm stolen time is consistent with run_delay */ - 
run_vcpu(vm, i); + run_vcpu(vcpus[i]); sync_global_from_guest(vm, guest_stolen_time[i]); stolen_time = guest_stolen_time[i] - stolen_time; TEST_ASSERT(stolen_time >= run_delay, -- cgit v1.2.3 From 7a5e4ae3db643e4b14453c5f67e4670a83284644 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:40:33 -0800 Subject: KVM: selftests: Convert arch_timer away from VCPU_ID Convert arch_timer to use vm_create_with_vcpus() and pass around a 'struct kvm_vcpu' object instead of requiring that the index into the array of vCPUs for a given vCPU is also the ID of the vCPU Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 62 +++++++++++------------- 1 file changed, 27 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index f04ca07c7f14..a873d9adc558 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -76,13 +76,8 @@ struct test_vcpu_shared_data { uint64_t xcnt; }; -struct test_vcpu { - uint32_t vcpuid; - pthread_t pt_vcpu_run; - struct kvm_vm *vm; -}; - -static struct test_vcpu test_vcpu[KVM_MAX_VCPUS]; +static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; +static pthread_t pt_vcpu_run[KVM_MAX_VCPUS]; static struct test_vcpu_shared_data vcpu_shared_data[KVM_MAX_VCPUS]; static int vtimer_irq, ptimer_irq; @@ -217,20 +212,20 @@ static void guest_code(void) static void *test_vcpu_run(void *arg) { + unsigned int vcpu_idx = (unsigned long)arg; struct ucall uc; - struct test_vcpu *vcpu = arg; + struct kvm_vcpu *vcpu = vcpus[vcpu_idx]; struct kvm_vm *vm = vcpu->vm; - uint32_t vcpuid = vcpu->vcpuid; - struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpuid]; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpu_idx]; - vcpu_run(vm, vcpuid); + vcpu_run(vm, vcpu->id); /* Currently, any exit from guest is an indication of completion */ pthread_mutex_lock(&vcpu_done_map_lock); - set_bit(vcpuid, vcpu_done_map); + set_bit(vcpu_idx, vcpu_done_map); pthread_mutex_unlock(&vcpu_done_map_lock); - switch (get_ucall(vm, vcpuid, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_SYNC: case UCALL_DONE: break; @@ -238,7 +233,7 @@ static void *test_vcpu_run(void *arg) sync_global_from_guest(vm, *shared_data); TEST_FAIL("%s at %s:%ld\n\tvalues: %lu, %lu; %lu, vcpu: %u; stage: %u; iter: %u", (const char *)uc.args[0], __FILE__, uc.args[1], - uc.args[2], uc.args[3], uc.args[4], vcpuid, + uc.args[2], uc.args[3], uc.args[4], vcpu_idx, shared_data->guest_stage, shared_data->nr_iter); break; default: @@ -265,7 +260,7 @@ static uint32_t test_get_pcpu(void) return pcpu; } -static int test_migrate_vcpu(struct test_vcpu *vcpu) +static int test_migrate_vcpu(unsigned int vcpu_idx) { int ret; cpu_set_t cpuset; @@ -274,15 +269,15 @@ static int test_migrate_vcpu(struct test_vcpu *vcpu) CPU_ZERO(&cpuset); CPU_SET(new_pcpu, &cpuset); - pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu->vcpuid, new_pcpu); + pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu_idx, new_pcpu); - ret = pthread_setaffinity_np(vcpu->pt_vcpu_run, - sizeof(cpuset), &cpuset); + ret = pthread_setaffinity_np(pt_vcpu_run[vcpu_idx], + sizeof(cpuset), &cpuset); /* Allow the error where the vCPU thread is already finished */ TEST_ASSERT(ret == 0 || ret == ESRCH, - "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d\n", - vcpu->vcpuid, new_pcpu, ret); + "Failed to migrate the vCPU:%u to 
pCPU: %u; ret: %d\n", + vcpu_idx, new_pcpu, ret); return ret; } @@ -305,7 +300,7 @@ static void *test_vcpu_migration(void *arg) continue; } - test_migrate_vcpu(&test_vcpu[i]); + test_migrate_vcpu(i); } } while (test_args.nr_vcpus != n_done); @@ -314,16 +309,17 @@ static void *test_vcpu_migration(void *arg) static void test_run(struct kvm_vm *vm) { - int i, ret; pthread_t pt_vcpu_migration; + unsigned int i; + int ret; pthread_mutex_init(&vcpu_done_map_lock, NULL); vcpu_done_map = bitmap_zalloc(test_args.nr_vcpus); TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap\n"); - for (i = 0; i < test_args.nr_vcpus; i++) { - ret = pthread_create(&test_vcpu[i].pt_vcpu_run, NULL, - test_vcpu_run, &test_vcpu[i]); + for (i = 0; i < (unsigned long)test_args.nr_vcpus; i++) { + ret = pthread_create(&pt_vcpu_run[i], NULL, test_vcpu_run, + (void *)(unsigned long)i); TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread\n", i); } @@ -338,7 +334,7 @@ static void test_run(struct kvm_vm *vm) for (i = 0; i < test_args.nr_vcpus; i++) - pthread_join(test_vcpu[i].pt_vcpu_run, NULL); + pthread_join(pt_vcpu_run[i], NULL); if (test_args.migration_freq_ms) pthread_join(pt_vcpu_migration, NULL); @@ -349,9 +345,9 @@ static void test_run(struct kvm_vm *vm) static void test_init_timer_irq(struct kvm_vm *vm) { /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ - vcpu_device_attr_get(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, + vcpu_device_attr_get(vm, vcpus[0]->id, KVM_ARM_VCPU_TIMER_CTRL, KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); - vcpu_device_attr_get(vm, 0, KVM_ARM_VCPU_TIMER_CTRL, + vcpu_device_attr_get(vm, vcpus[0]->id, KVM_ARM_VCPU_TIMER_CTRL, KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); sync_global_to_guest(vm, ptimer_irq); @@ -368,17 +364,13 @@ static struct kvm_vm *test_vm_create(void) unsigned int i; int nr_vcpus = test_args.nr_vcpus; - vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); vm_init_descriptor_tables(vm); vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); - for (i = 0; i < nr_vcpus; i++) { - vcpu_init_descriptor_tables(vm, i); - - test_vcpu[i].vcpuid = i; - test_vcpu[i].vm = vm; - } + for (i = 0; i < nr_vcpus; i++) + vcpu_init_descriptor_tables(vm, vcpus[i]->id); ucall_init(vm, NULL); test_init_timer_irq(vm); -- cgit v1.2.3 From 08ce0888c1f440f6a19133de124feb7280171754 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 13:41:09 -0700 Subject: KVM: selftests: Convert svm_nested_soft_inject_test away from VCPU_ID Convert svm_nested_soft_inject_test to use vm_create_with_one_vcpu() and pull the vCPU's ID from 'struct kvm_vcpu'. 
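With vm_create_with_one_vcpu() the test receives the vCPU object at creation time and derives the ID from it everywhere else. The converted flow is roughly (a sketch assuming the test's l1_guest_code and already-populated svm_gva, is_nmi and idt_alt_vm):

    struct kvm_vcpu *vcpu;
    struct kvm_vm *vm;

    /* The single vCPU comes back via the out-param; no fixed VCPU_ID. */
    vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);

    vcpu_args_set(vm, vcpu->id, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm);
    vcpu_run(vm, vcpu->id);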
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/svm_nested_soft_inject_test.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index f834b9a1a7fa..a337ab2ec101 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -18,7 +18,6 @@ #include "svm_util.h" #include "test_util.h" -#define VCPU_ID 0 #define INT_NR 0x20 #define X86_FEATURE_NRIPS BIT(3) @@ -135,6 +134,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i static void run_test(bool is_nmi) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; vm_vaddr_t svm_gva; vm_vaddr_t idt_alt_vm; @@ -142,10 +142,10 @@ static void run_test(bool is_nmi) pr_info("Running %s test\n", is_nmi ? "NMI" : "soft int"); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); + vcpu_init_descriptor_tables(vm, vcpu->id); vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); @@ -163,23 +163,23 @@ static void run_test(bool is_nmi) } else { idt_alt_vm = 0; } - vcpu_args_set(vm, VCPU_ID, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); + vcpu_args_set(vm, vcpu->id, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); memset(&debug, 0, sizeof(debug)); - vcpu_guest_debug_set(vm, VCPU_ID, &debug); + vcpu_guest_debug_set(vm, vcpu->id, &debug); - struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; struct ucall uc; alarm(2); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); alarm(0); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld, vals = 0x%lx 0x%lx 0x%lx", (const char *)uc.args[0], __FILE__, uc.args[1], uc.args[2], uc.args[3], uc.args[4]); -- cgit v1.2.3 From f3443bed2989a27a4d5c69c01e80d873139b4178 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 13:43:40 -0700 Subject: KVM: selftests: Convert triple_fault_event_test away from VCPU_ID Convert triple_fault_event_test to use vm_create_with_one_vcpu() and pull the vCPU's ID from 'struct kvm_vcpu'. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/triple_fault_event_test.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 68e0f1c5ec5a..2b0f19ddbc8b 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -9,7 +9,6 @@ #include "kselftest.h" -#define VCPU_ID 0 #define ARBITRARY_IO_PORT 0x2000 /* The virtual machine object. 
*/ @@ -41,6 +40,7 @@ void l1_guest_code(struct vmx_pages *vmx) int main(void) { + struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vcpu_events events; vm_vaddr_t vmx_pages_gva; @@ -56,13 +56,13 @@ int main(void) exit(KSFT_SKIP); } - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); - vcpu_run(vm, VCPU_ID); + vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_run(vm, vcpu->id); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Expected KVM_EXIT_IO, got: %u (%s)\n", @@ -70,21 +70,21 @@ int main(void) TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, "Expected IN from port %d from L2, got port %d", ARBITRARY_IO_PORT, run->io.port); - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vm, vcpu->id, &events); events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; events.triple_fault.pending = true; - vcpu_events_set(vm, VCPU_ID, &events); + vcpu_events_set(vm, vcpu->id, &events); run->immediate_exit = true; - vcpu_run_complete_io(vm, VCPU_ID); + vcpu_run_complete_io(vm, vcpu->id); - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vm, vcpu->id, &events); TEST_ASSERT(events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT, "Triple fault event invalid"); TEST_ASSERT(events.triple_fault.pending, "No triple fault pending"); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vm, vcpu->id, &uc)) { case UCALL_DONE: break; case UCALL_ABORT: -- cgit v1.2.3 From 45f568084a7a86960d55cf70fd9bf06fbb29e792 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 11:57:41 -0800 Subject: KVM: selftests: Convert vgic_init away from vm_create_default_with_vcpus() Use a combination of vm_create(), vm_create_with_vcpus(), and vm_vcpu_add() to convert vgic_init from vm_create_default_with_vcpus(), and away from referencing vCPUs by ID. Thus continues the march toward total annihilation of "default" helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 82 +++++++++++++++---------- 1 file changed, 51 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 451f65b199ad..40504f05641d 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -66,19 +66,21 @@ static void guest_code(void) } /* we don't want to assert on run execution, hence that helper */ -static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +static int run_vcpu(struct kvm_vcpu *vcpu) { - ucall_init(vm, NULL); + ucall_init(vcpu->vm, NULL); - return __vcpu_run(vm, vcpuid) ? -errno : 0; + return __vcpu_run(vcpu->vm, vcpu->id) ?
-errno : 0; } -static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, uint32_t nr_vcpus) +static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, + uint32_t nr_vcpus, + struct kvm_vcpu *vcpus[]) { struct vm_gic v; v.gic_dev_type = gic_dev_type; - v.vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); + v.vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); v.gic_fd = kvm_create_device(v.vm, gic_dev_type); return v; @@ -322,18 +324,19 @@ static void subtest_v3_redist_regions(struct vm_gic *v) */ static void test_vgic_then_vcpus(uint32_t gic_dev_type) { + struct kvm_vcpu *vcpus[NR_VCPUS]; struct vm_gic v; int ret, i; - v = vm_gic_create_with_vcpus(gic_dev_type, 1); + v = vm_gic_create_with_vcpus(gic_dev_type, 1, vcpus); subtest_dist_rdist(&v); /* Add the rest of the VCPUs */ for (i = 1; i < NR_VCPUS; ++i) - vm_vcpu_add(v.vm, i, guest_code); + vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); - ret = run_vcpu(v.vm, 3); + ret = run_vcpu(vcpus[3]); TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); vm_gic_destroy(&v); @@ -342,14 +345,15 @@ static void test_vgic_then_vcpus(uint32_t gic_dev_type) /* All the VCPUs are created before the VGIC KVM device gets initialized */ static void test_vcpus_then_vgic(uint32_t gic_dev_type) { + struct kvm_vcpu *vcpus[NR_VCPUS]; struct vm_gic v; int ret; - v = vm_gic_create_with_vcpus(gic_dev_type, NR_VCPUS); + v = vm_gic_create_with_vcpus(gic_dev_type, NR_VCPUS, vcpus); subtest_dist_rdist(&v); - ret = run_vcpu(v.vm, 3); + ret = run_vcpu(vcpus[3]); TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); vm_gic_destroy(&v); @@ -357,37 +361,38 @@ static void test_vcpus_then_vgic(uint32_t gic_dev_type) static void test_v3_new_redist_regions(void) { + struct kvm_vcpu *vcpus[NR_VCPUS]; void *dummy = NULL; struct vm_gic v; uint64_t addr; int ret; - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); subtest_v3_redist_regions(&v); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - ret = run_vcpu(v.vm, 3); + ret = run_vcpu(vcpus[3]); TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); vm_gic_destroy(&v); /* step2 */ - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); subtest_v3_redist_regions(&v); addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); - ret = run_vcpu(v.vm, 3); + ret = run_vcpu(vcpus[3]); TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); vm_gic_destroy(&v); /* step 3 */ - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); subtest_v3_redist_regions(&v); ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -402,7 +407,7 @@ static void test_v3_new_redist_regions(void) kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); - ret = run_vcpu(v.vm, 3); + ret = run_vcpu(vcpus[3]); TEST_ASSERT(!ret, "vcpu run"); vm_gic_destroy(&v); @@ -414,21 +419,22 @@ static void test_v3_typer_accesses(void) uint64_t addr; int ret, i; - v.vm = vm_create_default(0, 0, guest_code); + v.vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + (void)vm_vcpu_add(v.vm, 0, guest_code); v.gic_fd = kvm_create_device(v.vm, 
KVM_DEV_TYPE_ARM_VGIC_V3); - vm_vcpu_add(v.vm, 3, guest_code); + (void)vm_vcpu_add(v.vm, 3, guest_code); v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EINVAL, "attempting to read GICR_TYPER of non created vcpu"); - vm_vcpu_add(v.vm, 1, guest_code); + (void)vm_vcpu_add(v.vm, 1, guest_code); v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EBUSY, "read GICR_TYPER before GIC initialized"); - vm_vcpu_add(v.vm, 2, guest_code); + (void)vm_vcpu_add(v.vm, 2, guest_code); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); @@ -467,6 +473,21 @@ static void test_v3_typer_accesses(void) vm_gic_destroy(&v); } +static struct vm_gic vm_gic_v3_create_with_vcpuids(int nr_vcpus, + uint32_t vcpuids[]) +{ + struct vm_gic v; + int i; + + v.vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + for (i = 0; i < nr_vcpus; i++) + vm_vcpu_add(v.vm, vcpuids[i], guest_code); + + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); + + return v; +} + /** * Test GICR_TYPER last bit with new redist regions * rdist regions #1 and #2 are contiguous @@ -483,9 +504,7 @@ static void test_v3_last_bit_redist_regions(void) struct vm_gic v; uint64_t addr; - v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); - - v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); + v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); @@ -519,9 +538,7 @@ static void test_v3_last_bit_single_rdist(void) struct vm_gic v; uint64_t addr; - v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); - - v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); + v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); @@ -542,11 +559,12 @@ static void test_v3_last_bit_single_rdist(void) /* Uses the legacy REDIST region API. */ static void test_v3_redist_ipa_range_check_at_vcpu_run(void) { + struct kvm_vcpu *vcpus[NR_VCPUS]; struct vm_gic v; int ret, i; uint64_t addr; - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, 1); + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, 1, vcpus); /* Set space for 3 redists, we have 1 vcpu, so this succeeds. */ addr = max_phys_size - (3 * 2 * 0x10000); @@ -559,13 +577,13 @@ static void test_v3_redist_ipa_range_check_at_vcpu_run(void) /* Add the rest of the VCPUs */ for (i = 1; i < NR_VCPUS; ++i) - vm_vcpu_add(v.vm, i, guest_code); + vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); /* Attempt to run a vcpu without enough redist space. 
*/ - ret = run_vcpu(v.vm, 2); + ret = run_vcpu(vcpus[2]); TEST_ASSERT(ret && errno == EINVAL, "redist base+size above PA range detected on 1st vcpu run"); @@ -574,11 +592,12 @@ static void test_v3_redist_ipa_range_check_at_vcpu_run(void) static void test_v3_its_region(void) { + struct kvm_vcpu *vcpus[NR_VCPUS]; struct vm_gic v; uint64_t addr; int its_fd, ret; - v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS); + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); its_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_ITS); addr = 0x401000; @@ -618,11 +637,12 @@ static void test_v3_its_region(void) */ int test_kvm_device(uint32_t gic_dev_type) { + struct kvm_vcpu *vcpus[NR_VCPUS]; struct vm_gic v; uint32_t other; int ret; - v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); + v.vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); /* try to create a non existing KVM device */ ret = __kvm_test_create_device(v.vm, 0); -- cgit v1.2.3 From bfff0f60db89f425920580900ba242df3bd3c652 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 17:16:11 -0700 Subject: KVM: selftests: Consolidate KVM_{G,S}ET_ONE_REG helpers Rework vcpu_{g,s}et_reg() to provide the APIs that tests actually want to use, and drop the three "one-off" implementations that cropped up due to the poor API. Ignore the handful of direct KVM_{G,S}ET_ONE_REG calls that don't fit the APIs for one reason or another. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/aarch64/debug-exceptions.c | 2 +- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 2 +- tools/testing/selftests/kvm/aarch64/hypercalls.c | 32 ++------- tools/testing/selftests/kvm/aarch64/psci_test.c | 6 +- .../selftests/kvm/include/aarch64/processor.h | 18 +---- .../testing/selftests/kvm/include/kvm_util_base.h | 28 ++++++-- .../selftests/kvm/include/riscv/processor.h | 20 ------ .../testing/selftests/kvm/lib/aarch64/processor.c | 28 ++++---- tools/testing/selftests/kvm/lib/riscv/processor.c | 84 +++++++++++----------- tools/testing/selftests/kvm/s390x/resets.c | 5 +- 10 files changed, 93 insertions(+), 132 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index b69db0942169..2fe13e117dba 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -242,7 +242,7 @@ static int debug_version(struct kvm_vcpu *vcpu) { uint64_t id_aa64dfr0; - get_reg(vcpu->vm, vcpu->id, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); + vcpu_get_reg(vcpu->vm, vcpu->id, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); return id_aa64dfr0 & 0xf; } diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index fbb0c714211d..22f5e4124304 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -458,7 +458,7 @@ static void run_test(struct vcpu_config *c) bool reject_reg = false; int ret; - ret = __vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); + ret = __vcpu_get_reg(vm, 0, reg_list->reg[i], &addr); if (ret) { printf("%s: Failed to get ", config_name(c)); print_reg(c, reg.id); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 1eb9738453b4..b1f99e786d05 100644 --- 
a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -141,26 +141,6 @@ static void guest_code(void) GUEST_DONE(); } -static int set_fw_reg(struct kvm_vm *vm, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg = { - .id = id, - .addr = (uint64_t)&val, - }; - - return __vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); -} - -static void get_fw_reg(struct kvm_vm *vm, uint64_t id, uint64_t *addr) -{ - struct kvm_one_reg reg = { - .id = id, - .addr = (uint64_t)addr, - }; - - vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); -} - struct st_time { uint32_t rev; uint32_t attr; @@ -196,18 +176,18 @@ static void test_fw_regs_before_vm_start(struct kvm_vm *vm) const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; /* First 'read' should be an upper limit of the features supported */ - get_fw_reg(vm, reg_info->reg, &val); + vcpu_get_reg(vm, 0, reg_info->reg, &val); TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx\n", reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); /* Test a 'write' by disabling all the features of the register map */ - ret = set_fw_reg(vm, reg_info->reg, 0); + ret = __vcpu_set_reg(vm, 0, reg_info->reg, 0); TEST_ASSERT(ret == 0, "Failed to clear all the features of reg: 0x%lx; ret: %d\n", reg_info->reg, errno); - get_fw_reg(vm, reg_info->reg, &val); + vcpu_get_reg(vm, 0, reg_info->reg, &val); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg); @@ -216,7 +196,7 @@ static void test_fw_regs_before_vm_start(struct kvm_vm *vm) * Avoid this check if all the bits are occupied. */ if (reg_info->max_feat_bit < 63) { - ret = set_fw_reg(vm, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); + ret = __vcpu_set_reg(vm, 0, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); TEST_ASSERT(ret != 0 && errno == EINVAL, "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx\n", errno, reg_info->reg); @@ -237,7 +217,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vm *vm) * Before starting the VM, the test clears all the bits. * Check if that's still the case. */ - get_fw_reg(vm, reg_info->reg, &val); + vcpu_get_reg(vm, 0, reg_info->reg, &val); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg); @@ -247,7 +227,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vm *vm) * the registers and should return EBUSY. Set the registers and check for * the expected errno. 
*/ - ret = set_fw_reg(vm, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); + ret = __vcpu_set_reg(vm, 0, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); TEST_ASSERT(ret != 0 && errno == EBUSY, "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx\n", errno, reg_info->reg); diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index d9695a939cc9..f4f73934351f 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -102,8 +102,8 @@ static void assert_vcpu_reset(struct kvm_vcpu *vcpu) { uint64_t obs_pc, obs_x0; - get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.pc), &obs_pc); - get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + vcpu_get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.pc), &obs_pc); + vcpu_get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.regs[0]), &obs_x0); TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, "unexpected target cpu pc: %lx (expected: %lx)", @@ -143,7 +143,7 @@ static void host_test_cpu_on(void) */ vcpu_power_off(target); - get_reg(vm, target->id, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); + vcpu_get_reg(vm, target->id, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); vcpu_args_set(vm, source->id, 1, target_mpidr & MPIDR_HWID_BITMASK); enter_guest(source); diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index f774609f7848..ba3e9066d990 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -19,7 +19,7 @@ /* * KVM_ARM64_SYS_REG(sys_reg_id): Helper macro to convert * SYS_* register definitions in asm/sysreg.h to use in KVM - * calls such as get_reg() and set_reg(). + * calls such as vcpu_get_reg() and vcpu_set_reg(). 
*/ #define KVM_ARM64_SYS_REG(sys_reg_id) \ ARM64_SYS_REG(sys_reg_Op0(sys_reg_id), \ @@ -47,22 +47,6 @@ #define MPIDR_HWID_BITMASK (0xff00fffffful) -static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr) -{ - struct kvm_one_reg reg; - reg.id = id; - reg.addr = (uint64_t)addr; - vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg); -} - -static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t val) -{ - struct kvm_one_reg reg; - reg.id = id; - reg.addr = (uint64_t)&val; - vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg); -} - void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init *init); struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu_init *init, void *guest_code); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c0b2158a53d5..9c29b6797ce8 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -374,16 +374,36 @@ static inline void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, { vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); } + +static inline int __vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, + uint64_t reg_id, void *addr) +{ + struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)addr }; + + return __vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg); +} +static inline int __vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, + uint64_t reg_id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)&val }; + + return __vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg); +} static inline void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_one_reg *reg) + uint64_t reg_id, void *addr) { - vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); + struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)addr }; + + vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg); } static inline void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_one_reg *reg) + uint64_t reg_id, uint64_t val) { - vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); + struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)&val }; + + vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg); } + #ifdef __KVM_HAVE_VCPU_EVENTS static inline void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events) diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index 4fcfd1c0389d..d00d213c3805 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -38,26 +38,6 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t idx, KVM_REG_RISCV_TIMER_REG(name), \ KVM_REG_SIZE_U64) -static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, - unsigned long *addr) -{ - struct kvm_one_reg reg; - - reg.id = id; - reg.addr = (unsigned long)addr; - vcpu_get_reg(vm, vcpuid, &reg); -} - -static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, - unsigned long val) -{ - struct kvm_one_reg reg; - - reg.id = id; - reg.addr = (unsigned long)&val; - vcpu_set_reg(vm, vcpuid, &reg); -} - /* L3 index Bit[47:39] */ #define PGTBL_L3_INDEX_MASK 0x0000FF8000000000ULL #define PGTBL_L3_INDEX_SHIFT 39 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 5b95fa2cce18..d158d5aa26e6 100644 ---
a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -232,10 +232,10 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 * registers, which the variable argument list macros do. */ - set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); + vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); - get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1); - get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1); + vcpu_get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1); + vcpu_get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1); /* Configure base granule size */ switch (vm->mode) { @@ -296,19 +296,19 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; - set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); - set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); - set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); - set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), vm->pgd); - set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpuid); + vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); + vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); + vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); + vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), vm->pgd); + vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpuid); } void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { uint64_t pstate, pc; - get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); - get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); + vcpu_get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); + vcpu_get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", indent, "", pstate, pc); @@ -326,8 +326,8 @@ struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, aarch64_vcpu_setup(vm, vcpu_id, init); - set_reg(vm, vcpu_id, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); - set_reg(vm, vcpu_id, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + vcpu_set_reg(vm, vcpu_id, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + vcpu_set_reg(vm, vcpu_id, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); return vcpu; } @@ -349,7 +349,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
va_start(ap, num); for (i = 0; i < num; i++) { - set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]), + vcpu_set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]), va_arg(ap, uint64_t)); } @@ -389,7 +389,7 @@ void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) { extern char vectors; - set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); + vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); } void route_exception(struct ex_regs *regs, int vector) diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index ba5761843c76..edbdc7bef05b 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -198,46 +198,46 @@ void riscv_vcpu_mmu_setup(struct kvm_vm *vm, int vcpuid) satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; satp |= SATP_MODE_48; - set_reg(vm, vcpuid, RISCV_CSR_REG(satp), satp); + vcpu_set_reg(vm, vcpuid, RISCV_CSR_REG(satp), satp); } void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { struct kvm_riscv_core core; - get_reg(vm, vcpuid, RISCV_CORE_REG(mode), &core.mode); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.pc), &core.regs.pc); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.ra), &core.regs.ra); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.sp), &core.regs.sp); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.gp), &core.regs.gp); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.tp), &core.regs.tp); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t0), &core.regs.t0); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t1), &core.regs.t1); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t2), &core.regs.t2); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s0), &core.regs.s0); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s1), &core.regs.s1); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a0), &core.regs.a0); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a1), &core.regs.a1); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a2), &core.regs.a2); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a3), &core.regs.a3); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a4), &core.regs.a4); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a5), &core.regs.a5); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a6), &core.regs.a6); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a7), &core.regs.a7); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s2), &core.regs.s2); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s3), &core.regs.s3); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s4), &core.regs.s4); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s5), &core.regs.s5); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s6), &core.regs.s6); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s7), &core.regs.s7); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s8), &core.regs.s8); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s9), &core.regs.s9); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s10), &core.regs.s10); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s11), &core.regs.s11); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t3), &core.regs.t3); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t4), &core.regs.t4); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t5), &core.regs.t5); - get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t6), &core.regs.t6); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(mode), &core.mode); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.pc), &core.regs.pc); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.ra), &core.regs.ra); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.sp), &core.regs.sp); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.gp), 
&core.regs.gp); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.tp), &core.regs.tp); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t0), &core.regs.t0); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t1), &core.regs.t1); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t2), &core.regs.t2); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s0), &core.regs.s0); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s1), &core.regs.s1); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a0), &core.regs.a0); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a1), &core.regs.a1); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a2), &core.regs.a2); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a3), &core.regs.a3); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a4), &core.regs.a4); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a5), &core.regs.a5); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a6), &core.regs.a6); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a7), &core.regs.a7); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s2), &core.regs.s2); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s3), &core.regs.s3); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s4), &core.regs.s4); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s5), &core.regs.s5); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s6), &core.regs.s6); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s7), &core.regs.s7); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s8), &core.regs.s8); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s9), &core.regs.s9); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s10), &core.regs.s10); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s11), &core.regs.s11); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t3), &core.regs.t3); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t4), &core.regs.t4); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t5), &core.regs.t5); + vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t6), &core.regs.t6); fprintf(stream, " MODE: 0x%lx\n", core.mode); @@ -302,17 +302,17 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, /* Setup global pointer of guest to be same as the host */ asm volatile ( "add %0, gp, zero" : "=r" (current_gp) : : "memory"); - set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.gp), current_gp); + vcpu_set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.gp), current_gp); /* Setup stack pointer and program counter of guest */ - set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.sp), - stack_vaddr + stack_size); - set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.pc), - (unsigned long)guest_code); + vcpu_set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.sp), + stack_vaddr + stack_size); + vcpu_set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.pc), + (unsigned long)guest_code); /* Setup default exception vector of guest */ - set_reg(vm, vcpu_id, RISCV_CSR_REG(stvec), - (unsigned long)guest_unexp_trap); + vcpu_set_reg(vm, vcpu_id, RISCV_CSR_REG(stvec), + (unsigned long)guest_unexp_trap); return vcpu; } @@ -355,7 +355,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
id = RISCV_CORE_REG(regs.a7); break; } - set_reg(vm, vcpuid, id, va_arg(ap, uint64_t)); + vcpu_set_reg(vm, vcpuid, id, va_arg(ap, uint64_t)); } va_end(ap); diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index 298bff3fd52e..85d290454776 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -61,12 +61,9 @@ static void guest_code_initial(void) static void test_one_reg(uint64_t id, uint64_t value) { - struct kvm_one_reg reg; uint64_t eval_reg; - reg.addr = (uintptr_t)&eval_reg; - reg.id = id; - vcpu_get_reg(vm, VCPU_ID, &reg); + vcpu_get_reg(vm, VCPU_ID, id, &eval_reg); TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); } -- cgit v1.2.3 From f05427faedff746e08b2098195c15d8691bc5fbe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 17:27:51 -0700 Subject: KVM: selftests: Sync stage before VM is freed in hypercalls test Sync the next stage using the VM before said VM is potentially freed by the TEST_STAGE_HVC_IFACE_FEAT_DISABLED stage. Opportunistically take a double pointer in anticipation of also having to set the new vCPU pointer once the test stops hardcoding '0' everywhere. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/hypercalls.c | 27 ++++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 44ca840e8219..fefa39dc9bc8 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -246,32 +246,31 @@ static struct kvm_vm *test_vm_create(void) return vm; } -static struct kvm_vm *test_guest_stage(struct kvm_vm *vm) +static void test_guest_stage(struct kvm_vm **vm) { - struct kvm_vm *ret_vm = vm; + int prev_stage = stage; - pr_debug("Stage: %d\n", stage); + pr_debug("Stage: %d\n", prev_stage); - switch (stage) { + /* Sync the stage early, the VM might be freed below. */ + stage++; + sync_global_to_guest(*vm, stage); + + switch (prev_stage) { case TEST_STAGE_REG_IFACE: - test_fw_regs_after_vm_start(vm); + test_fw_regs_after_vm_start(*vm); break; case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: /* Start a new VM so that all the features are now enabled by default */ - kvm_vm_free(vm); - ret_vm = test_vm_create(); + kvm_vm_free(*vm); + *vm = test_vm_create(); break; case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: case TEST_STAGE_HVC_IFACE_FALSE_INFO: break; default: - TEST_FAIL("Unknown test stage: %d\n", stage); + TEST_FAIL("Unknown test stage: %d\n", prev_stage); } - - stage++; - sync_global_to_guest(vm, stage); - - return ret_vm; } static void test_run(void) @@ -289,7 +288,7 @@ static void test_run(void) switch (get_ucall(vm, 0, &uc)) { case UCALL_SYNC: - vm = test_guest_stage(vm); + test_guest_stage(&vm); break; case UCALL_DONE: guest_done = true; -- cgit v1.2.3 From 8a093ea0d104998c4755743cdc5df7349356ae81 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 17:32:52 -0700 Subject: KVM: selftests: Convert hypercalls test away from vm_create_default() Use vm_create_with_one_vcpu() to convert the hypercalls test away from vm_create_default(), and away from referencing vCPUs by ID. Thus continues the march toward total annihilation of "default" helpers.
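For orientation, an illustrative sketch (not part of the patch): the hypercalls diff below leans on both flavors of the one-reg helpers consolidated earlier in this series. The plain helpers assert success inside vcpu_ioctl(), while the double-underscore variants return the raw ioctl result so that failure itself can be tested:

        uint64_t val;
        int ret;

        /* Asserting flavor: any failure kills the test via TEST_ASSERT. */
        vcpu_get_reg(vm, 0, reg_id, &val);

        /* "Soft" flavor: returns -1 with errno set on failure. */
        ret = __vcpu_set_reg(vm, 0, reg_id, 0);
        TEST_ASSERT(!ret, "KVM_SET_ONE_REG failed, errno: %d", errno);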
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/hypercalls.c | 51 +++++++++++------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 44ca840e8219..fefa39dc9bc8 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -150,23 +150,19 @@ struct st_time { #define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63) #define ST_GPA_BASE (1 << 30) -static void steal_time_init(struct kvm_vm *vm) +static void steal_time_init(struct kvm_vcpu *vcpu) { uint64_t st_ipa = (ulong)ST_GPA_BASE; unsigned int gpages; - struct kvm_device_attr dev = { - .group = KVM_ARM_VCPU_PVTIME_CTRL, - .attr = KVM_ARM_VCPU_PVTIME_IPA, - .addr = (uint64_t)&st_ipa, - }; gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); + vm_userspace_mem_region_add(vcpu->vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); - vcpu_ioctl(vm, 0, KVM_SET_DEVICE_ATTR, &dev); + vcpu_device_attr_set(vcpu->vm, vcpu->id, KVM_ARM_VCPU_PVTIME_CTRL, + KVM_ARM_VCPU_PVTIME_IPA, &st_ipa); } -static void test_fw_regs_before_vm_start(struct kvm_vm *vm) +static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) { uint64_t val; unsigned int i; @@ -176,18 +172,18 @@ static void test_fw_regs_before_vm_start(struct kvm_vm *vm) const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; /* First 'read' should be an upper limit of the features supported */ - vcpu_get_reg(vm, 0, reg_info->reg, &val); + vcpu_get_reg(vcpu->vm, vcpu->id, reg_info->reg, &val); TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx\n", reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); /* Test a 'write' by disabling all the features of the register map */ - ret = __vcpu_set_reg(vm, 0, reg_info->reg, 0); + ret = __vcpu_set_reg(vcpu->vm, vcpu->id, reg_info->reg, 0); TEST_ASSERT(ret == 0, "Failed to clear all the features of reg: 0x%lx; ret: %d\n", reg_info->reg, errno); - vcpu_get_reg(vm, 0, reg_info->reg, &val); + vcpu_get_reg(vcpu->vm, vcpu->id, reg_info->reg, &val); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg); @@ -196,7 +192,7 @@ static void test_fw_regs_before_vm_start(struct kvm_vm *vm) * Avoid this check if all the bits are occupied. */ if (reg_info->max_feat_bit < 63) { - ret = __vcpu_set_reg(vm, 0, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); + ret = __vcpu_set_reg(vcpu->vm, vcpu->id, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); TEST_ASSERT(ret != 0 && errno == EINVAL, "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx\n", errno, reg_info->reg); @@ -204,7 +200,7 @@ static void test_fw_regs_before_vm_start(struct kvm_vm *vm) } } -static void test_fw_regs_after_vm_start(struct kvm_vm *vm) +static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) { uint64_t val; unsigned int i; @@ -217,7 +213,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vm *vm) * Before starting the VM, the test clears all the bits. * Check if that's still the case. 
*/ - vcpu_get_reg(vm, 0, reg_info->reg, &val); + vcpu_get_reg(vcpu->vm, vcpu->id, reg_info->reg, &val); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg); @@ -227,26 +223,26 @@ static void test_fw_regs_after_vm_start(struct kvm_vm *vm) * the registers and should return EBUSY. Set the registers and check for * the expected errno. */ - ret = __vcpu_set_reg(vm, 0, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); + ret = __vcpu_set_reg(vcpu->vm, vcpu->id, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); TEST_ASSERT(ret != 0 && errno == EBUSY, "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx\n", errno, reg_info->reg); } } -static struct kvm_vm *test_vm_create(void) +static struct kvm_vm *test_vm_create(struct kvm_vcpu **vcpu) { struct kvm_vm *vm; - vm = vm_create_default(0, 0, guest_code); + vm = vm_create_with_one_vcpu(vcpu, guest_code); ucall_init(vm, NULL); - steal_time_init(vm); + steal_time_init(*vcpu); return vm; } -static void test_guest_stage(struct kvm_vm **vm) +static void test_guest_stage(struct kvm_vm **vm, struct kvm_vcpu **vcpu) { int prev_stage = stage; @@ -258,12 +254,12 @@ static void test_guest_stage(struct kvm_vm **vm) switch (prev_stage) { case TEST_STAGE_REG_IFACE: - test_fw_regs_after_vm_start(*vm); + test_fw_regs_after_vm_start(*vcpu); break; case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: /* Start a new VM so that all the features are now enabled by default */ kvm_vm_free(*vm); - *vm = test_vm_create(); + *vm = test_vm_create(vcpu); break; case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: case TEST_STAGE_HVC_IFACE_FALSE_INFO: @@ -275,20 +271,21 @@ static void test_guest_stage(struct kvm_vm **vm) static void test_run(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; bool guest_done = false; - vm = test_vm_create(); + vm = test_vm_create(&vcpu); - test_fw_regs_before_vm_start(vm); + test_fw_regs_before_vm_start(vcpu); while (!guest_done) { - vcpu_run(vm, 0); + vcpu_run(vcpu->vm, vcpu->id); - switch (get_ucall(vm, 0, &uc)) { + switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { case UCALL_SYNC: - test_guest_stage(&vm); + test_guest_stage(&vm, &vcpu); break; case UCALL_DONE: guest_done = true; -- cgit v1.2.3 From ebca1b8056dae8593bb350178d2ea454b3e123ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 12:10:40 -0800 Subject: KVM: selftests: Convert xapic_ipi_test away from *_VCPU_ID Convert xapic_ipi_test to use vm_create_with_one_vcpu() and pass around 'struct kvm_vcpu' objects instead of passing around vCPU IDs. Don't bother with macros for the HALTER versus SENDER indices, the vast majority of references don't differentiate between the vCPU roles, and the code that does either has a comment or an explicit reference to the role, e.g. to halter_guest_code() or sender_guest_code().
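The shape of the conversion, condensed into a sketch (mirrors the diff below, trimmed for brevity): the per-thread parameter block carries the vCPU object, and the worker derives both the VM and the ID from it:

        struct thread_params {
                struct test_data_page *data;
                struct kvm_vcpu *vcpu;          /* replaces the old vm + vcpu_id pair */
                uint64_t *pipis_rcvd;
        };

        static void *vcpu_thread(void *arg)
        {
                struct thread_params *params = arg;
                struct kvm_vcpu *vcpu = params->vcpu;

                /* The VM and the ID both come from the vCPU object now. */
                vcpu_run(vcpu->vm, vcpu->id);
                return NULL;
        }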
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/xapic_ipi_test.c | 48 +++++++++------------- 1 file changed, 20 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 8b366652be31..4484ee563b18 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -39,9 +39,6 @@ /* Default delay between migrate_pages calls (microseconds) */ #define DEFAULT_DELAY_USECS 500000 -#define HALTER_VCPU_ID 0 -#define SENDER_VCPU_ID 1 - /* * Vector for IPI from sender vCPU to halting vCPU. * Value is arbitrary and was chosen for the alternating bit pattern. Any @@ -79,8 +76,7 @@ struct test_data_page { struct thread_params { struct test_data_page *data; - struct kvm_vm *vm; - uint32_t vcpu_id; + struct kvm_vcpu *vcpu; uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */ }; @@ -198,6 +194,7 @@ static void sender_guest_code(struct test_data_page *data) static void *vcpu_thread(void *arg) { struct thread_params *params = (struct thread_params *)arg; + struct kvm_vcpu *vcpu = params->vcpu; struct ucall uc; int old; int r; @@ -206,17 +203,17 @@ static void *vcpu_thread(void *arg) r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); TEST_ASSERT(r == 0, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", - params->vcpu_id, r); + vcpu->id, r); - fprintf(stderr, "vCPU thread running vCPU %u\n", params->vcpu_id); - vcpu_run(params->vm, params->vcpu_id); - exit_reason = vcpu_state(params->vm, params->vcpu_id)->exit_reason; + fprintf(stderr, "vCPU thread running vCPU %u\n", vcpu->id); + vcpu_run(vcpu->vm, vcpu->id); + exit_reason = vcpu->run->exit_reason; TEST_ASSERT(exit_reason == KVM_EXIT_IO, "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", - params->vcpu_id, exit_reason, exit_reason_str(exit_reason)); + vcpu->id, exit_reason, exit_reason_str(exit_reason)); - if (get_ucall(params->vm, params->vcpu_id, &uc) == UCALL_ABORT) { + if (get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) { TEST_ASSERT(false, "vCPU %u exited with error: %s.\n" "Sending vCPU sent %lu IPIs to halting vCPU\n" @@ -224,7 +221,7 @@ static void *vcpu_thread(void *arg) "Halter TPR=%#x PPR=%#x LVR=%#x\n" "Migrations attempted: %lu\n" "Migrations completed: %lu\n", - params->vcpu_id, (const char *)uc.args[0], + vcpu->id, (const char *)uc.args[0], params->data->ipis_sent, params->data->hlt_count, params->data->wake_count, *params->pipis_rcvd, params->data->halter_tpr, @@ -236,7 +233,7 @@ static void *vcpu_thread(void *arg) return NULL; } -static void cancel_join_vcpu_thread(pthread_t thread, uint32_t vcpu_id) +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) { void *retval; int r; @@ -244,12 +241,12 @@ static void cancel_join_vcpu_thread(pthread_t thread, uint32_t vcpu_id) r = pthread_cancel(thread); TEST_ASSERT(r == 0, "pthread_cancel on vcpu_id=%d failed with errno=%d", - vcpu_id, r); + vcpu->id, r); r = pthread_join(thread, &retval); TEST_ASSERT(r == 0, "pthread_join on vcpu_id=%d failed with errno=%d", - vcpu_id, r); + vcpu->id, r); TEST_ASSERT(retval == PTHREAD_CANCELED, "expected retval=%p, got %p", PTHREAD_CANCELED, retval); @@ -415,34 +412,30 @@ int main(int argc, char *argv[]) if (delay_usecs <= 0) delay_usecs = DEFAULT_DELAY_USECS; - vm = vm_create_default(HALTER_VCPU_ID, 0, halter_guest_code); - params[0].vm = vm; - params[1].vm = 
vm; + vm = vm_create_with_one_vcpu(&params[0].vcpu, halter_guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, HALTER_VCPU_ID); + vcpu_init_descriptor_tables(vm, params[0].vcpu->id); vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - vm_vcpu_add(vm, SENDER_VCPU_ID, sender_guest_code); + params[1].vcpu = vm_vcpu_add(vm, 1, sender_guest_code); test_data_page_vaddr = vm_vaddr_alloc_page(vm); - data = - (struct test_data_page *)addr_gva2hva(vm, test_data_page_vaddr); + data = addr_gva2hva(vm, test_data_page_vaddr); memset(data, 0, sizeof(*data)); params[0].data = data; params[1].data = data; - vcpu_args_set(vm, HALTER_VCPU_ID, 1, test_data_page_vaddr); - vcpu_args_set(vm, SENDER_VCPU_ID, 1, test_data_page_vaddr); + vcpu_args_set(vm, params[0].vcpu->id, 1, test_data_page_vaddr); + vcpu_args_set(vm, params[1].vcpu->id, 1, test_data_page_vaddr); pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd); params[0].pipis_rcvd = pipis_rcvd; params[1].pipis_rcvd = pipis_rcvd; /* Start halter vCPU thread and wait for it to execute first HLT. */ - params[0].vcpu_id = HALTER_VCPU_ID; r = pthread_create(&threads[0], NULL, vcpu_thread, &params[0]); TEST_ASSERT(r == 0, "pthread_create halter failed errno=%d", errno); @@ -462,7 +455,6 @@ int main(int argc, char *argv[]) "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n", data->halter_apic_id, wait_secs); - params[1].vcpu_id = SENDER_VCPU_ID; r = pthread_create(&threads[1], NULL, vcpu_thread, &params[1]); TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno); @@ -478,8 +470,8 @@ int main(int argc, char *argv[]) /* * Cancel threads and wait for them to stop. */ - cancel_join_vcpu_thread(threads[0], HALTER_VCPU_ID); - cancel_join_vcpu_thread(threads[1], SENDER_VCPU_ID); + cancel_join_vcpu_thread(threads[0], params[0].vcpu); + cancel_join_vcpu_thread(threads[1], params[1].vcpu); fprintf(stderr, "Test successful after running for %d seconds.\n" -- cgit v1.2.3 From e5b77cdef9e3023b8156d81e005c3b7f27a1eaa5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 12:20:41 -0800 Subject: KVM: selftests: Convert sync_regs_test away from VCPU_ID Convert sync_regs_test to use vm_create_with_vcpus() and pass around a 'struct kvm_vcpu' object instead of passing around vCPU IDs. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==5. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers.
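The creation-path change, as a before/after sketch (illustrative; the real hunks follow): the hardcoded VCPU_ID disappears and the run struct comes straight from the vCPU object:

        /* Before: an arbitrary ID threaded through every call. */
        vm = vm_create_default(VCPU_ID, 0, guest_code);
        run = vcpu_state(vm, VCPU_ID);

        /* After: the helper returns the vCPU object (implicitly vcpu_id == 0). */
        vm = vm_create_with_one_vcpu(&vcpu, guest_code);
        run = vcpu->run;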
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/s390x/sync_regs_test.c | 59 +++++++++++----------- 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 9510739e226d..7c4143141ca7 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -23,8 +23,6 @@ #include "diag318_test_handler.h" #include "kselftest.h" -#define VCPU_ID 5 - static void guest_code(void) { /* @@ -75,55 +73,58 @@ static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) #define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS|KVM_SYNC_DIAG318) #define INVALID_SYNC_FIELD 0x80000000 -void test_read_invalid(struct kvm_vm *vm, struct kvm_run *run) +void test_read_invalid(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int rv; /* Request reading invalid register set from VCPU. */ run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; } -void test_set_invalid(struct kvm_vm *vm, struct kvm_run *run) +void test_set_invalid(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int rv; /* Request setting invalid register set into VCPU. */ run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; } -void test_req_and_verify_all_valid_regs(struct kvm_vm *vm, struct kvm_run *run) +void test_req_and_verify_all_valid_regs(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; struct kvm_sregs sregs; struct kvm_regs regs; int rv; /* Request and verify all valid register sets. 
*/ run->kvm_valid_regs = TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, "Unexpected exit reason: %u (%s)\n", @@ -136,15 +137,16 @@ void test_req_and_verify_all_valid_regs(struct kvm_vm *vm, struct kvm_run *run) run->s390_sieic.icptcode, run->s390_sieic.ipa, run->s390_sieic.ipb); - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu->vm, vcpu->id, &regs); compare_regs(&regs, &run->s.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); compare_sregs(&sregs, &run->s.regs); } -void test_set_and_verify_various_reg_values(struct kvm_vm *vm, struct kvm_run *run) +void test_set_and_verify_various_reg_values(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; struct kvm_sregs sregs; struct kvm_regs regs; int rv; @@ -161,7 +163,7 @@ void test_set_and_verify_various_reg_values(struct kvm_vm *vm, struct kvm_run *r run->kvm_dirty_regs |= KVM_SYNC_DIAG318; } - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, "Unexpected exit reason: %u (%s)\n", @@ -177,15 +179,16 @@ void test_set_and_verify_various_reg_values(struct kvm_vm *vm, struct kvm_run *r "diag318 sync regs value incorrect 0x%llx.", run->s.regs.diag318); - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu->vm, vcpu->id, &regs); compare_regs(&regs, &run->s.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); compare_sregs(&sregs, &run->s.regs); } -void test_clear_kvm_dirty_regs_bits(struct kvm_vm *vm, struct kvm_run *run) +void test_clear_kvm_dirty_regs_bits(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int rv; /* Clear kvm_dirty_regs bits, verify new s.regs values are @@ -195,7 +198,7 @@ void test_clear_kvm_dirty_regs_bits(struct kvm_vm *vm, struct kvm_run *run) run->kvm_dirty_regs = 0; run->s.regs.gprs[11] = 0xDEADBEEF; run->s.regs.diag318 = 0x4B1D; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu->vm, vcpu->id); TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, "Unexpected exit reason: %u (%s)\n", @@ -211,7 +214,7 @@ struct testdef { const char *name; - void (*test)(struct kvm_vm *vm, struct kvm_run *run); + void (*test)(struct kvm_vcpu *vcpu); } testlist[] = { { "read invalid", test_read_invalid }, { "set invalid", test_set_invalid }, @@ -222,8 +225,8 @@ struct testdef { int main(int argc, char *argv[]) { - static struct kvm_run *run; - static struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; int idx; /* Tell stdout not to buffer its content */ @@ -237,12 +240,10 @@ int main(int argc, char *argv[]) ksft_set_plan(ARRAY_SIZE(testlist)); /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - testlist[idx].test(vm, run); + testlist[idx].test(vcpu); ksft_test_result_pass("%s\n", testlist[idx].name); } -- cgit v1.2.3 From 371dfb2e90d942fb651c324505bc4929447b081b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 12:38:26 -0800 Subject: KVM: selftests: Convert s390's "resets" test away from VCPU_ID Pass around a 'struct kvm_vcpu' object in the "resets" test instead of referencing the
vCPU by the global VCPU_ID. Rename the #define for the vCPU's ID to ARBITRARY_NON_ZERO_VCPU_ID to make it more obvious that (a) the value matters but (b) is otherwise arbitrary. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/s390x/resets.c | 137 ++++++++++++++++------------- 1 file changed, 77 insertions(+), 60 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index 85d290454776..b2375d81571c 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -14,14 +14,12 @@ #include "kvm_util.h" #include "kselftest.h" -#define VCPU_ID 3 #define LOCAL_IRQS 32 -struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS]; +#define ARBITRARY_NON_ZERO_VCPU_ID 3 + +struct kvm_s390_irq buf[ARBITRARY_NON_ZERO_VCPU_ID + LOCAL_IRQS]; -struct kvm_vm *vm; -struct kvm_run *run; -struct kvm_sync_regs *sync_regs; static uint8_t regs_null[512]; static void guest_code_initial(void) @@ -59,22 +57,22 @@ static void guest_code_initial(void) ); } -static void test_one_reg(uint64_t id, uint64_t value) +static void test_one_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t value) { uint64_t eval_reg; - vcpu_get_reg(vm, VCPU_ID, id, &eval_reg); + vcpu_get_reg(vcpu->vm, vcpu->id, id, &eval_reg); TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); } -static void assert_noirq(void) +static void assert_noirq(struct kvm_vcpu *vcpu) { struct kvm_s390_irq_state irq_state; int irqs; irq_state.len = sizeof(buf); irq_state.buf = (unsigned long)buf; - irqs = __vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state); + irqs = __vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_GET_IRQ_STATE, &irq_state); /* * irqs contains the number of retrieved interrupts. 
Any interrupt * (notably, the emergency call interrupt we have injected) should @@ -84,19 +82,20 @@ static void assert_noirq(void) TEST_ASSERT(!irqs, "IRQ pending"); } -static void assert_clear(void) +static void assert_clear(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; struct kvm_sregs sregs; struct kvm_regs regs; struct kvm_fpu fpu; - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu->vm, vcpu->id, &regs); TEST_ASSERT(!memcmp(&regs.gprs, regs_null, sizeof(regs.gprs)), "grs == 0"); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0"); - vcpu_fpu_get(vm, VCPU_ID, &fpu); + vcpu_fpu_get(vcpu->vm, vcpu->id, &fpu); TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); /* sync regs */ @@ -110,8 +109,10 @@ static void assert_clear(void) "vrs0-15 == 0 (sync_regs)"); } -static void assert_initial_noclear(void) +static void assert_initial_noclear(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL, "gpr0 == 0xffff000000000000 (sync_regs)"); TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL, @@ -125,13 +126,14 @@ static void assert_initial_noclear(void) TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)"); } -static void assert_initial(void) +static void assert_initial(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; struct kvm_sregs sregs; struct kvm_fpu fpu; /* KVM_GET_SREGS */ - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)"); TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, "cr14 == 0xC2000000 (KVM_GET_SREGS)"); @@ -154,36 +156,38 @@ static void assert_initial(void) TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)"); /* kvm_run */ - TEST_ASSERT(run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); - TEST_ASSERT(run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); + TEST_ASSERT(vcpu->run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); + TEST_ASSERT(vcpu->run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); - vcpu_fpu_get(vm, VCPU_ID, &fpu); + vcpu_fpu_get(vcpu->vm, vcpu->id, &fpu); TEST_ASSERT(!fpu.fpc, "fpc == 0"); - test_one_reg(KVM_REG_S390_GBEA, 1); - test_one_reg(KVM_REG_S390_PP, 0); - test_one_reg(KVM_REG_S390_TODPR, 0); - test_one_reg(KVM_REG_S390_CPU_TIMER, 0); - test_one_reg(KVM_REG_S390_CLOCK_COMP, 0); + test_one_reg(vcpu, KVM_REG_S390_GBEA, 1); + test_one_reg(vcpu, KVM_REG_S390_PP, 0); + test_one_reg(vcpu, KVM_REG_S390_TODPR, 0); + test_one_reg(vcpu, KVM_REG_S390_CPU_TIMER, 0); + test_one_reg(vcpu, KVM_REG_S390_CLOCK_COMP, 0); } -static void assert_normal_noclear(void) +static void assert_normal_noclear(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 10 (sync_regs)"); TEST_ASSERT(sync_regs->crs[8] == 1, "cr10 == 1 (sync_regs)"); TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)"); TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)"); } -static void assert_normal(void) +static void assert_normal(struct kvm_vcpu *vcpu) { - test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); - TEST_ASSERT(sync_regs->pft == KVM_S390_PFAULT_TOKEN_INVALID, + test_one_reg(vcpu, KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); + TEST_ASSERT(vcpu->run->s.regs.pft == KVM_S390_PFAULT_TOKEN_INVALID, "pft == 0xff.....
(sync_regs)"); - assert_noirq(); + assert_noirq(vcpu); } -static void inject_irq(int cpu_id) +static void inject_irq(struct kvm_vcpu *vcpu) { struct kvm_s390_irq_state irq_state; struct kvm_s390_irq *irq = &buf[0]; @@ -193,73 +197,86 @@ static void inject_irq(int cpu_id) irq_state.len = sizeof(struct kvm_s390_irq); irq_state.buf = (unsigned long)buf; irq->type = KVM_S390_INT_EMERGENCY; - irq->u.emerg.code = cpu_id; - irqs = __vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state); + irq->u.emerg.code = vcpu->id; + irqs = __vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_SET_IRQ_STATE, &irq_state); TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno); } +static struct kvm_vm *create_vm(struct kvm_vcpu **vcpu) +{ + struct kvm_vm *vm; + + vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + + *vcpu = vm_vcpu_add(vm, ARBITRARY_NON_ZERO_VCPU_ID, guest_code_initial); + + return vm; +} + static void test_normal(void) { + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + ksft_print_msg("Testing normal reset\n"); - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code_initial); - run = vcpu_state(vm, VCPU_ID); - sync_regs = &run->s.regs; + vm = create_vm(&vcpu); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - inject_irq(VCPU_ID); + inject_irq(vcpu); - vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0); + vcpu_ioctl(vm, vcpu->id, KVM_S390_NORMAL_RESET, 0); /* must clears */ - assert_normal(); + assert_normal(vcpu); /* must not clears */ - assert_normal_noclear(); - assert_initial_noclear(); + assert_normal_noclear(vcpu); + assert_initial_noclear(vcpu); kvm_vm_free(vm); } static void test_initial(void) { + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + ksft_print_msg("Testing initial reset\n"); - vm = vm_create_default(VCPU_ID, 0, guest_code_initial); - run = vcpu_state(vm, VCPU_ID); - sync_regs = &run->s.regs; + vm = create_vm(&vcpu); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - inject_irq(VCPU_ID); + inject_irq(vcpu); - vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0); + vcpu_ioctl(vm, vcpu->id, KVM_S390_INITIAL_RESET, 0); /* must clears */ - assert_normal(); - assert_initial(); + assert_normal(vcpu); + assert_initial(vcpu); /* must not clears */ - assert_initial_noclear(); + assert_initial_noclear(vcpu); kvm_vm_free(vm); } static void test_clear(void) { + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + ksft_print_msg("Testing clear reset\n"); - vm = vm_create_default(VCPU_ID, 0, guest_code_initial); - run = vcpu_state(vm, VCPU_ID); - sync_regs = &run->s.regs; + vm = create_vm(&vcpu); - vcpu_run(vm, VCPU_ID); + vcpu_run(vm, vcpu->id); - inject_irq(VCPU_ID); + inject_irq(vcpu); - vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0); + vcpu_ioctl(vm, vcpu->id, KVM_S390_CLEAR_RESET, 0); /* must clears */ - assert_normal(); - assert_initial(); - assert_clear(); + assert_normal(vcpu); + assert_initial(vcpu); + assert_clear(vcpu); kvm_vm_free(vm); } -- cgit v1.2.3 From 5241904f2eb6289dd1e7f7c80f612016afcb00cd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 12:39:34 -0800 Subject: KVM: selftests: Convert memop away from VCPU_ID Pass around a 'struct kvm_vcpu' object instead of a vCPU ID in s390's memop test. Pass NULL for the vCPU instead of a magic '-1' ID to indicate that an ioctl/test should be done at VM scope. Rename "struct test_vcpu vcpu" to "struct test_info info" in order to avoid naming collisions (this is the bulk of the diff :-( ). 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/s390x/memop.c | 82 ++++++++++++++++--------------- 1 file changed, 42 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 230d4b6301e6..703bf137d6ac 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -99,21 +99,18 @@ static struct kvm_s390_mem_op ksmo_from_desc(struct mop_desc desc) return ksmo; } -/* vcpu dummy id signifying that vm instead of vcpu ioctl is to occur */ -const uint32_t VM_VCPU_ID = (uint32_t)-1; - -struct test_vcpu { +struct test_info { struct kvm_vm *vm; - uint32_t id; + struct kvm_vcpu *vcpu; }; #define PRINT_MEMOP false -static void print_memop(uint32_t vcpu_id, const struct kvm_s390_mem_op *ksmo) +static void print_memop(struct kvm_vcpu *vcpu, const struct kvm_s390_mem_op *ksmo) { if (!PRINT_MEMOP) return; - if (vcpu_id == VM_VCPU_ID) + if (!vcpu) printf("vm memop("); else printf("vcpu memop("); @@ -148,25 +145,29 @@ static void print_memop(uint32_t vcpu_id, const struct kvm_s390_mem_op *ksmo) puts(")"); } -static void memop_ioctl(struct test_vcpu vcpu, struct kvm_s390_mem_op *ksmo) +static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo) { - if (vcpu.id == VM_VCPU_ID) - vm_ioctl(vcpu.vm, KVM_S390_MEM_OP, ksmo); + struct kvm_vcpu *vcpu = info.vcpu; + + if (!vcpu) + vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo); else - vcpu_ioctl(vcpu.vm, vcpu.id, KVM_S390_MEM_OP, ksmo); + vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_MEM_OP, ksmo); } -static int err_memop_ioctl(struct test_vcpu vcpu, struct kvm_s390_mem_op *ksmo) +static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo) { - if (vcpu.id == VM_VCPU_ID) - return __vm_ioctl(vcpu.vm, KVM_S390_MEM_OP, ksmo); + struct kvm_vcpu *vcpu = info.vcpu; + + if (!vcpu) + return __vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo); else - return __vcpu_ioctl(vcpu.vm, vcpu.id, KVM_S390_MEM_OP, ksmo); + return __vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_MEM_OP, ksmo); } -#define MEMOP(err, vcpu_p, mop_target_p, access_mode_p, buf_p, size_p, ...) \ +#define MEMOP(err, info_p, mop_target_p, access_mode_p, buf_p, size_p, ...) \ ({ \ - struct test_vcpu __vcpu = (vcpu_p); \ + struct test_info __info = (info_p); \ struct mop_desc __desc = { \ .target = (mop_target_p), \ .mode = (access_mode_p), \ @@ -178,13 +179,13 @@ static int err_memop_ioctl(struct test_vcpu vcpu, struct kvm_s390_mem_op *ksmo) \ if (__desc._gaddr_v) { \ if (__desc.target == ABSOLUTE) \ - __desc.gaddr = addr_gva2gpa(__vcpu.vm, __desc.gaddr_v); \ + __desc.gaddr = addr_gva2gpa(__info.vm, __desc.gaddr_v); \ else \ __desc.gaddr = __desc.gaddr_v; \ } \ __ksmo = ksmo_from_desc(__desc); \ - print_memop(__vcpu.id, &__ksmo); \ - err##memop_ioctl(__vcpu, &__ksmo); \ + print_memop(__info.vcpu, &__ksmo); \ + err##memop_ioctl(__info, &__ksmo); \ }) #define MOP(...) MEMOP(, __VA_ARGS__) @@ -201,7 +202,6 @@ static int err_memop_ioctl(struct test_vcpu vcpu, struct kvm_s390_mem_op *ksmo) #define CHECK_N_DO(f, ...) 
({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) -#define VCPU_ID 1 #define PAGE_SHIFT 12 #define PAGE_SIZE (1ULL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE - 1)) @@ -213,21 +213,22 @@ static uint8_t mem2[65536]; struct test_default { struct kvm_vm *kvm_vm; - struct test_vcpu vm; - struct test_vcpu vcpu; + struct test_info vm; + struct test_info vcpu; struct kvm_run *run; int size; }; static struct test_default test_default_init(void *guest_code) { + struct kvm_vcpu *vcpu; struct test_default t; t.size = min((size_t)kvm_check_cap(KVM_CAP_S390_MEM_OP), sizeof(mem1)); - t.kvm_vm = vm_create_default(VCPU_ID, 0, guest_code); - t.vm = (struct test_vcpu) { t.kvm_vm, VM_VCPU_ID }; - t.vcpu = (struct test_vcpu) { t.kvm_vm, VCPU_ID }; - t.run = vcpu_state(t.kvm_vm, VCPU_ID); + t.kvm_vm = vm_create_with_one_vcpu(&vcpu, guest_code); + t.vm = (struct test_info) { t.kvm_vm, NULL }; + t.vcpu = (struct test_info) { t.kvm_vm, vcpu }; + t.run = vcpu->run; return t; } @@ -242,14 +243,15 @@ enum stage { STAGE_COPIED, }; -#define HOST_SYNC(vcpu_p, stage) \ +#define HOST_SYNC(info_p, stage) \ ({ \ - struct test_vcpu __vcpu = (vcpu_p); \ + struct test_info __info = (info_p); \ + struct kvm_vcpu *__vcpu = __info.vcpu; \ struct ucall uc; \ int __stage = (stage); \ \ - vcpu_run(__vcpu.vm, __vcpu.id); \ - get_ucall(__vcpu.vm, __vcpu.id, &uc); \ + vcpu_run(__vcpu->vm, __vcpu->id); \ + get_ucall(__vcpu->vm, __vcpu->id, &uc); \ ASSERT_EQ(uc.cmd, UCALL_SYNC); \ ASSERT_EQ(uc.args[1], __stage); \ }) \ @@ -268,7 +270,7 @@ static void prepare_mem12(void) #define DEFAULT_WRITE_READ(copy_cpu, mop_cpu, mop_target_p, size, ...) \ ({ \ - struct test_vcpu __copy_cpu = (copy_cpu), __mop_cpu = (mop_cpu); \ + struct test_info __copy_cpu = (copy_cpu), __mop_cpu = (mop_cpu); \ enum mop_target __target = (mop_target_p); \ uint32_t __size = (size); \ \ @@ -283,7 +285,7 @@ static void prepare_mem12(void) #define DEFAULT_READ(copy_cpu, mop_cpu, mop_target_p, size, ...) 
\ ({ \ - struct test_vcpu __copy_cpu = (copy_cpu), __mop_cpu = (mop_cpu); \ + struct test_info __copy_cpu = (copy_cpu), __mop_cpu = (mop_cpu); \ enum mop_target __target = (mop_target_p); \ uint32_t __size = (size); \ \ @@ -624,34 +626,34 @@ static void guest_idle(void) GUEST_SYNC(STAGE_IDLED); } -static void _test_errors_common(struct test_vcpu vcpu, enum mop_target target, int size) +static void _test_errors_common(struct test_info info, enum mop_target target, int size) { int rv; /* Bad size: */ - rv = ERR_MOP(vcpu, target, WRITE, mem1, -1, GADDR_V(mem1)); + rv = ERR_MOP(info, target, WRITE, mem1, -1, GADDR_V(mem1)); TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes"); /* Zero size: */ - rv = ERR_MOP(vcpu, target, WRITE, mem1, 0, GADDR_V(mem1)); + rv = ERR_MOP(info, target, WRITE, mem1, 0, GADDR_V(mem1)); TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM), "ioctl allows 0 as size"); /* Bad flags: */ - rv = ERR_MOP(vcpu, target, WRITE, mem1, size, GADDR_V(mem1), SET_FLAGS(-1)); + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), SET_FLAGS(-1)); TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags"); /* Bad guest address: */ - rv = ERR_MOP(vcpu, target, WRITE, mem1, size, GADDR((void *)~0xfffUL), CHECK_ONLY); + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL), CHECK_ONLY); TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory access"); /* Bad host address: */ - rv = ERR_MOP(vcpu, target, WRITE, 0, size, GADDR_V(mem1)); + rv = ERR_MOP(info, target, WRITE, 0, size, GADDR_V(mem1)); TEST_ASSERT(rv == -1 && errno == EFAULT, "ioctl does not report bad host memory address"); /* Bad key: */ - rv = ERR_MOP(vcpu, target, WRITE, mem1, size, GADDR_V(mem1), KEY(17)); + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), KEY(17)); TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows invalid key"); } -- cgit v1.2.3 From 7cdcdfe50d8d68b7b9ba2e1b0345ff47fdda390f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 12:40:18 -0800 Subject: KVM: selftests: Convert s390x/diag318_test_handler away from VCPU_ID Convert diag318_test_handler to use vm_create_with_vcpus() and pass around a 'struct kvm_vcpu' object instead of passing around vCPU IDs. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==6. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c index 86b9e611ad87..21c31fe10c1a 100644 --- a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c +++ b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c @@ -8,8 +8,6 @@ #include "test_util.h" #include "kvm_util.h" -#define VCPU_ID 6 - #define ICPT_INSTRUCTION 0x04 #define IPA0_DIAG 0x8300 @@ -27,14 +25,15 @@ static void guest_code(void) */ static uint64_t diag318_handler(void) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; uint64_t reg; uint64_t diag318_info; - vm = vm_create_default(VCPU_ID, 0, guest_code); - vcpu_run(vm, VCPU_ID); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_run(vm, vcpu->id); + run = vcpu->run; TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, "DIAGNOSE 0x0318 instruction was not intercepted"); -- cgit v1.2.3 From 6a9d37efa2cf3e2ba551c660fe52496742cbfbc4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 21 Apr 2022 08:34:06 -0700 Subject: KVM: selftests: Convert tprot away from VCPU_ID Convert tprot to use vm_create_with_vcpus() and pass around a 'struct kvm_vcpu' object instead of passing around vCPU IDs. Note, this is a "functional" change in the sense that the test now creates a vCPU with vcpu_id==0 instead of vcpu_id==1. The non-zero VCPU_ID was 100% arbitrary and added little to no validation coverage. If testing non-zero vCPU IDs is desirable for generic tests, that can be done in the future by tweaking the VM creation helpers. 
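One mechanism in the tprot diff below that is easy to miss (a sketch of pre-existing code, unchanged by this patch): the protection-override bits are flipped in guest CR0 through the sync_regs interface, by marking the control registers dirty before the next KVM_RUN:

        /* Enable fetch-protection override in guest CR0 via sync_regs. */
        run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE;
        run->kvm_dirty_regs = KVM_SYNC_CRS;
        /* Takes effect on the next vcpu_run(), i.e. the next HOST_SYNC() stage. */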
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/s390x/tprot.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index 14d74a9e7b3d..a82a8d1dc6c8 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -15,8 +15,6 @@ #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) -#define VCPU_ID 1 - static __aligned(PAGE_SIZE) uint8_t pages[2][PAGE_SIZE]; static uint8_t *const page_store_prot = pages[0]; static uint8_t *const page_fetch_prot = pages[1]; @@ -183,14 +181,14 @@ static void guest_code(void) GUEST_SYNC(perform_next_stage(&i, mapped_0)); } -#define HOST_SYNC_NO_TAP(vmp, stage) \ +#define HOST_SYNC_NO_TAP(vcpup, stage) \ ({ \ - struct kvm_vm *__vm = (vmp); \ + struct kvm_vcpu *__vcpu = (vcpup); \ struct ucall uc; \ int __stage = (stage); \ \ - vcpu_run(__vm, VCPU_ID); \ - get_ucall(__vm, VCPU_ID, &uc); \ + vcpu_run(__vcpu->vm, __vcpu->id); \ + get_ucall(__vcpu->vm, __vcpu->id, &uc); \ if (uc.cmd == UCALL_ABORT) { \ TEST_FAIL("line %lu: %s, hints: %lu, %lu", uc.args[1], \ (const char *)uc.args[0], uc.args[2], uc.args[3]); \ @@ -199,14 +197,15 @@ static void guest_code(void) ASSERT_EQ(uc.args[1], __stage); \ }) -#define HOST_SYNC(vmp, stage) \ +#define HOST_SYNC(vcpu, stage) \ ({ \ - HOST_SYNC_NO_TAP(vmp, stage); \ + HOST_SYNC_NO_TAP(vcpu, stage); \ ksft_test_result_pass("" #stage "\n"); \ }) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; vm_vaddr_t guest_0_page; @@ -214,31 +213,31 @@ int main(int argc, char *argv[]) ksft_print_header(); ksft_set_plan(STAGE_END); - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; - HOST_SYNC(vm, STAGE_INIT_SIMPLE); + HOST_SYNC(vcpu, STAGE_INIT_SIMPLE); mprotect(addr_gva2hva(vm, (vm_vaddr_t)pages), PAGE_SIZE * 2, PROT_READ); - HOST_SYNC(vm, TEST_SIMPLE); + HOST_SYNC(vcpu, TEST_SIMPLE); guest_0_page = vm_vaddr_alloc(vm, PAGE_SIZE, 0); if (guest_0_page != 0) { /* Use NO_TAP so we don't get a PASS print */ - HOST_SYNC_NO_TAP(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + HOST_SYNC_NO_TAP(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); ksft_test_result_skip("STAGE_INIT_FETCH_PROT_OVERRIDE - " "Did not allocate page at 0\n"); } else { - HOST_SYNC(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + HOST_SYNC(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); } if (guest_0_page == 0) mprotect(addr_gva2hva(vm, (vm_vaddr_t)0), PAGE_SIZE, PROT_READ); run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(vm, TEST_FETCH_PROT_OVERRIDE); + HOST_SYNC(vcpu, TEST_FETCH_PROT_OVERRIDE); run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; run->kvm_dirty_regs = KVM_SYNC_CRS; - HOST_SYNC(vm, TEST_STORAGE_PROT_OVERRIDE); + HOST_SYNC(vcpu, TEST_STORAGE_PROT_OVERRIDE); kvm_vm_free(vm); -- cgit v1.2.3 From 46647c65e1e618cc75961ef4d887f90ba7e18431 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Apr 2022 17:35:33 -0700 Subject: KVM: selftests: Use vm_create() in tsc_scaling_sync Use vm_create() instead of vm_create_default_with_vcpus() in tsc_scaling_sync. The existing call doesn't create any vCPUs, and the guest_code() entry point is set when vm_vcpu_add_default() is invoked. 
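The page math is the crux of this change (sketch; the constants are existing selftests defines visible in the hunk below): vm_create() adds no vCPUs, so the caller must reserve stack pages up front for the vCPUs it adds later via vm_vcpu_add_default():

        /* Reserve guest memory for the stacks of NR_TEST_VCPUS added later. */
        vm = vm_create(DEFAULT_GUEST_PHY_PAGES + DEFAULT_STACK_PGS * NR_TEST_VCPUS);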
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index 2411215e7ae8..728b252597cc 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vm = vm_create_default_with_vcpus(0, DEFAULT_STACK_PGS * NR_TEST_VCPUS, 0, guest_code, NULL); + vm = vm_create(DEFAULT_GUEST_PHY_PAGES + DEFAULT_STACK_PGS * NR_TEST_VCPUS); vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ); pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE); -- cgit v1.2.3 From 3468fd7d883110e481dfb8c8c7b802dc252ab186 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Apr 2022 11:35:28 -0700 Subject: KVM: selftests: Use vm_create_with_vcpus() in max_guest_memory_test Use vm_create_with_vcpus() in max_guest_memory_test and reference vCPUs by their 'struct kvm_vcpu' object instead of their ID. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/max_guest_memory_test.c | 26 +++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 15f046e19cb2..d59918d5cbe2 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -28,8 +28,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) } struct vcpu_info { - struct kvm_vm *vm; - uint32_t id; + struct kvm_vcpu *vcpu; uint64_t start_gpa; uint64_t end_gpa; }; @@ -60,12 +59,13 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpu_id) static void *vcpu_worker(void *data) { - struct vcpu_info *vcpu = data; + struct vcpu_info *info = data; + struct kvm_vcpu *vcpu = info->vcpu; struct kvm_vm *vm = vcpu->vm; struct kvm_sregs sregs; struct kvm_regs regs; - vcpu_args_set(vm, vcpu->id, 3, vcpu->start_gpa, vcpu->end_gpa, + vcpu_args_set(vm, vcpu->id, 3, info->start_gpa, info->end_gpa, vm_get_page_size(vm)); /* Snapshot regs before the first run. 
*/ @@ -89,8 +89,8 @@ static void *vcpu_worker(void *data) return NULL; } -static pthread_t *spawn_workers(struct kvm_vm *vm, uint64_t start_gpa, - uint64_t end_gpa) +static pthread_t *spawn_workers(struct kvm_vm *vm, struct kvm_vcpu **vcpus, + uint64_t start_gpa, uint64_t end_gpa) { struct vcpu_info *info; uint64_t gpa, nr_bytes; @@ -108,8 +108,7 @@ static pthread_t *spawn_workers(struct kvm_vm *vm, uint64_t start_gpa, TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus); for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) { - info[i].vm = vm; - info[i].id = i; + info[i].vcpu = vcpus[i]; info[i].start_gpa = gpa; info[i].end_gpa = gpa + nr_bytes; pthread_create(&threads[i], NULL, vcpu_worker, &info[i]); @@ -172,6 +171,7 @@ int main(int argc, char *argv[]) uint64_t max_gpa, gpa, slot_size, max_mem, i; int max_slots, slot, opt, fd; bool hugepages = false; + struct kvm_vcpu **vcpus; pthread_t *threads; struct kvm_vm *vm; void *mem; @@ -215,7 +215,10 @@ int main(int argc, char *argv[]) } } - vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); + vcpus = malloc(nr_vcpus * sizeof(*vcpus)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); max_gpa = vm_get_max_gfn(vm) << vm_get_page_shift(vm); TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb "); @@ -252,7 +255,10 @@ int main(int argc, char *argv[]) } atomic_set(&rendezvous, nr_vcpus + 1); - threads = spawn_workers(vm, start_gpa, gpa); + threads = spawn_workers(vm, vcpus, start_gpa, gpa); + + free(vcpus); + vcpus = NULL; pr_info("Running with %lugb of guest memory and %u vCPUs\n", (gpa - start_gpa) / size_1gb, nr_vcpus); -- cgit v1.2.3 From 82ba83cbb76aa6480729f98d3fdeeb162a5cea30 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 12:45:22 -0800 Subject: KVM: selftests: Drop vm_create_default* helpers Drop all vm_create_default*() helpers, the "default" naming turned out to be terrible: it wasn't extensible (hard to have multiple defaults), was a lie (half the settings were default, half weren't), and failed to capture relationships between helpers, e.g. compared with the kernel's standard underscores pattern. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 23 ---------------------- tools/testing/selftests/kvm/lib/kvm_util.c | 23 ++++++----------------- 2 files changed, 6 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 9c29b6797ce8..90521c5716b1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -559,29 +559,6 @@ static inline struct kvm_vm *vm_create(uint64_t nr_pages) return __vm_create(VM_MODE_DEFAULT, nr_pages); } -/* - * Create a VM with reasonable defaults - * - * Input Args: - * vcpuid - The id of the single VCPU to add to the VM. - * extra_mem_pages - The number of extra pages to add (this will - * decide how much extra space we will need to - * setup the page tables using memslot 0) - * guest_code - The vCPU's entry point - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM.
- */ -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code); - -/* Same as vm_create_default, but can be used for more than one vcpu */ -struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_mem_pages, - uint32_t num_percpu_pages, void *guest_code, - uint32_t vcpuids[]); - /* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, uint64_t slot0_mem_pages, uint64_t extra_mem_pages, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 3015d490a8f6..137ffa1bc1f2 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -337,28 +337,17 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus return vm; } -struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_mem_pages, - uint32_t num_percpu_pages, void *guest_code, - uint32_t vcpuids[]) -{ - return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, - extra_mem_pages, num_percpu_pages, guest_code, vcpuids, NULL); -} - -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code) -{ - return vm_create_default_with_vcpus(1, extra_mem_pages, 0, guest_code, - (uint32_t []){ vcpuid }); -} - struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, uint64_t extra_mem_pages, void *guest_code) { - struct kvm_vm *vm = vm_create_default(0, extra_mem_pages, guest_code); + struct kvm_vcpu *vcpus[1]; + struct kvm_vm *vm; + + vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, DEFAULT_GUEST_PHY_PAGES, + extra_mem_pages, 0, guest_code, NULL, vcpus); - *vcpu = vcpu_get(vm, 0); + *vcpu = vcpus[0]; return vm; } -- cgit v1.2.3 From 5114c3e2f1b90ca5417c42d3eb17fcf5ac4dc724 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 12:49:13 -0800 Subject: KVM: selftests: Drop @vcpuids param from VM creators Drop the @vcpuids parameter from VM creators now that there are no users. Allowing tests to specify IDs was a gigantic mistake as it resulted in tests with arbitrary and ultimately meaningless IDs that differed only because the author used test X instead of test Y as the source for copy+paste (the de facto standard way to create a KVM selftest). Except for literally two tests, x86's set_boot_cpu_id and s390's resets, tests do not and should not care about the vCPU ID.
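With @vcpuids gone, the canonical way to create a multi-vCPU VM is an output array plus sequentially assigned IDs. A sketch against the post-series helpers; NR_VCPUS and guest_code() are illustrative, not taken from any test:

#include "kvm_util.h"

#define NR_VCPUS 4	/* illustrative */

static void guest_code(void)
{
	GUEST_DONE();
}

int main(void)
{
	struct kvm_vcpu *vcpus[NR_VCPUS];
	struct kvm_vm *vm;
	int i;

	/* vCPU IDs are simply 0..NR_VCPUS-1; there is no vcpuids[] to plumb. */
	vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus);

	for (i = 0; i < NR_VCPUS; i++)
		vcpu_run(vm, vcpus[i]->id);

	/*
	 * The two tests that genuinely care about IDs (x86's set_boot_cpu_id,
	 * s390's resets) can still choose them via __vm_vcpu_add(vm, id).
	 */
	kvm_vm_free(vm);
	return 0;
}
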
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 4 ++-- tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 8 +++----- tools/testing/selftests/kvm/lib/perf_test_util.c | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 90521c5716b1..f409bae336d5 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -563,7 +563,7 @@ static inline struct kvm_vm *vm_create(uint64_t nr_pages) struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, uint64_t slot0_mem_pages, uint64_t extra_mem_pages, uint32_t num_percpu_pages, void *guest_code, - uint32_t vcpuids[], struct kvm_vcpu *vcpus[]); + struct kvm_vcpu *vcpus[]); static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, void *guest_code, @@ -571,7 +571,7 @@ static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, { return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, 0, 0, - guest_code, NULL, vcpus); + guest_code, vcpus); } /* diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index e91bc7f1400d..76031be195fa 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -269,7 +269,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; vm = __vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, - guest_num_pages, 0, guest_code, NULL, NULL); + guest_num_pages, 0, guest_code, NULL); /* Align down GPA of the testing memslot */ if (!p->phys_offset) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 137ffa1bc1f2..c338713d8245 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -299,7 +299,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, uint64_t slot0_mem_pages, uint64_t extra_mem_pages, uint32_t num_percpu_pages, void *guest_code, - uint32_t vcpuids[], struct kvm_vcpu *vcpus[]) + struct kvm_vcpu *vcpus[]) { uint64_t vcpu_pages, extra_pg_pages, pages; struct kvm_vcpu *vcpu; @@ -327,9 +327,7 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus vm = __vm_create(mode, pages); for (i = 0; i < nr_vcpus; ++i) { - uint32_t vcpuid = vcpuids ? 
vcpuids[i] : i; - - vcpu = vm_vcpu_add(vm, vcpuid, guest_code); + vcpu = vm_vcpu_add(vm, i, guest_code); if (vcpus) vcpus[i] = vcpu; } @@ -345,7 +343,7 @@ struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vm *vm; vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, DEFAULT_GUEST_PHY_PAGES, - extra_mem_pages, 0, guest_code, NULL, vcpus); + extra_mem_pages, 0, guest_code, vcpus); *vcpu = vcpus[0]; return vm; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index d3ff05f00653..05f65c9292eb 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -149,7 +149,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, */ vm = __vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, slot0_pages + guest_num_pages, 0, - perf_test_guest_code, NULL, NULL); + perf_test_guest_code, NULL); pta->vm = vm; -- cgit v1.2.3 From 0f678e732099f1cd9b42461708374e14cc63847f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 13:06:18 -0800 Subject: KVM: selftests: Convert kvm_page_table_test away from reliance on vcpu_id Reference vCPUs by their 'struct kvm_vcpu' object in kvm_page_table_test instead of by their ID. This moves selftests one step closer towards taking a 'struct kvm_vcpu *' instead of VM+vcpu_id for vCPU helpers. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_page_table_test.c | 62 ++++++++--------------- 1 file changed, 21 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 76031be195fa..b577b5999c95 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -46,11 +46,6 @@ static const char * const test_stage_string[] = { "KVM_ADJUST_MAPPINGS", }; -struct vcpu_args { - int vcpu_id; - bool vcpu_write; -}; - struct test_args { struct kvm_vm *vm; uint64_t guest_test_virt_mem; @@ -60,7 +55,7 @@ struct test_args { uint64_t large_num_pages; uint64_t host_pages_per_lpage; enum vm_mem_backing_src_type src_type; - struct vcpu_args vcpu_args[KVM_MAX_VCPUS]; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; }; /* @@ -92,17 +87,13 @@ static uint64_t guest_test_phys_mem; */ static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; -static void guest_code(int vcpu_id) +static void guest_code(bool do_write) { struct test_args *p = &test_args; - struct vcpu_args *vcpu_args = &p->vcpu_args[vcpu_id]; enum test_stage *current_stage = &guest_test_stage; uint64_t addr; int i, j; - /* Make sure vCPU args data structure is not corrupt */ - GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); - while (true) { addr = p->guest_test_virt_mem; @@ -123,7 +114,7 @@ static void guest_code(int vcpu_id) */ case KVM_CREATE_MAPPINGS: for (i = 0; i < p->large_num_pages; i++) { - if (vcpu_args->vcpu_write) + if (do_write) *(uint64_t *)addr = 0x0123456789ABCDEF; else READ_ONCE(*(uint64_t *)addr); @@ -193,17 +184,15 @@ static void guest_code(int vcpu_id) static void *vcpu_worker(void *data) { - int ret; - struct vcpu_args *vcpu_args = data; struct kvm_vm *vm = test_args.vm; - int vcpu_id = vcpu_args->vcpu_id; - struct kvm_run *run; + struct kvm_vcpu *vcpu = data; + bool do_write = !(vcpu->id % 2); struct timespec start; struct timespec ts_diff; enum test_stage stage; + int ret; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); - run = 
vcpu_state(vm, vcpu_id); + vcpu_args_set(vm, vcpu->id, 1, do_write); while (!READ_ONCE(host_quit)) { ret = sem_wait(&test_stage_updated); @@ -213,15 +202,15 @@ static void *vcpu_worker(void *data) return NULL; clock_gettime(CLOCK_MONOTONIC_RAW, &start); - ret = _vcpu_run(vm, vcpu_id); + ret = _vcpu_run(vm, vcpu->id); ts_diff = timespec_elapsed(start); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC, + TEST_ASSERT(get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC, "Invalid guest sync status: exit_reason=%s\n", - exit_reason_str(run->exit_reason)); + exit_reason_str(vcpu->run->exit_reason)); - pr_debug("Got sync event from vCPU %d\n", vcpu_id); + pr_debug("Got sync event from vCPU %d\n", vcpu->id); stage = READ_ONCE(*current_stage); /* @@ -230,7 +219,7 @@ static void *vcpu_worker(void *data) */ pr_debug("vCPU %d has completed stage %s\n" "execution time is: %ld.%.9lds\n\n", - vcpu_id, test_stage_string[stage], + vcpu->id, test_stage_string[stage], ts_diff.tv_sec, ts_diff.tv_nsec); ret = sem_post(&test_stage_completed); @@ -250,7 +239,6 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) { int ret; struct test_params *p = arg; - struct vcpu_args *vcpu_args; enum vm_mem_backing_src_type src_type = p->src_type; uint64_t large_page_size = get_backing_src_pagesz(src_type); uint64_t guest_page_size = vm_guest_mode_params[mode].page_size; @@ -260,7 +248,6 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) uint64_t alignment; void *host_test_mem; struct kvm_vm *vm; - int vcpu_id; /* Align up the test memory size */ alignment = max(large_page_size, guest_page_size); @@ -269,7 +256,8 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; vm = __vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, - guest_num_pages, 0, guest_code, NULL); + guest_num_pages, 0, guest_code, + test_args.vcpus); /* Align down GPA of the testing memslot */ if (!p->phys_offset) @@ -292,12 +280,6 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) test_args.host_pages_per_lpage = large_page_size / host_page_size; test_args.src_type = src_type; - for (vcpu_id = 0; vcpu_id < KVM_MAX_VCPUS; vcpu_id++) { - vcpu_args = &test_args.vcpu_args[vcpu_id]; - vcpu_args->vcpu_id = vcpu_id; - vcpu_args->vcpu_write = !(vcpu_id % 2); - } - /* Add an extra memory slot with specified backing src type */ vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem, TEST_MEM_SLOT_INDEX, guest_num_pages, 0); @@ -363,12 +345,11 @@ static void vcpus_complete_new_stage(enum test_stage stage) static void run_test(enum vm_guest_mode mode, void *arg) { - int ret; pthread_t *vcpu_threads; struct kvm_vm *vm; - int vcpu_id; struct timespec start; struct timespec ts_diff; + int ret, i; /* Create VM with vCPUs and make some pre-initialization */ vm = pre_init_before_test(mode, arg); @@ -379,10 +360,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) host_quit = false; *current_stage = KVM_BEFORE_MAPPINGS; - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, - &test_args.vcpu_args[vcpu_id]); - } + for (i = 0; i < nr_vcpus; i++) + pthread_create(&vcpu_threads[i], NULL, vcpu_worker, + test_args.vcpus[i]); vcpus_complete_new_stage(*current_stage); pr_info("Started all vCPUs successfully\n"); @@ -424,13 +404,13 @@ static 
void run_test(enum vm_guest_mode mode, void *arg) /* Tell the vcpu thread to quit */ host_quit = true; - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + for (i = 0; i < nr_vcpus; i++) { ret = sem_post(&test_stage_updated); TEST_ASSERT(ret == 0, "Error in sem_post"); } - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) - pthread_join(vcpu_threads[vcpu_id], NULL); + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i], NULL); ret = sem_destroy(&test_stage_updated); TEST_ASSERT(ret == 0, "Error in sem_destroy"); -- cgit v1.2.3 From e813129a3dea6588e57ebfebae75cef6946a89d1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 16:16:32 -0800 Subject: KVM: selftests: Convert kvm_binary_stats_test away from vCPU IDs Track vCPUs by their 'struct kvm_vcpu' object in kvm_binary_stats_test, not by their ID. The per-vCPU helpers will soon take a vCPU instead of a VM+vcpu_id pair. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 407e9ea8e6f3..dfc3cf531ced 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -172,9 +172,9 @@ static void vm_stats_test(struct kvm_vm *vm) TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); } -static void vcpu_stats_test(struct kvm_vm *vm, int vcpu_id) +static void vcpu_stats_test(struct kvm_vcpu *vcpu) { - int stats_fd = vcpu_get_stats_fd(vm, vcpu_id); + int stats_fd = vcpu_get_stats_fd(vcpu->vm, vcpu->id); stats_test(stats_fd); close(stats_fd); @@ -195,6 +195,7 @@ static void vcpu_stats_test(struct kvm_vm *vm, int vcpu_id) int main(int argc, char *argv[]) { int i, j; + struct kvm_vcpu **vcpus; struct kvm_vm **vms; int max_vm = DEFAULT_NUM_VM; int max_vcpu = DEFAULT_NUM_VCPU; @@ -220,17 +221,21 @@ int main(int argc, char *argv[]) /* Create VMs and VCPUs */ vms = malloc(sizeof(vms[0]) * max_vm); TEST_ASSERT(vms, "Allocate memory for storing VM pointers"); + + vcpus = malloc(sizeof(struct kvm_vcpu *) * max_vm * max_vcpu); + TEST_ASSERT(vcpus, "Allocate memory for storing vCPU pointers"); + for (i = 0; i < max_vm; ++i) { vms[i] = vm_create_barebones(); for (j = 0; j < max_vcpu; ++j) - __vm_vcpu_add(vms[i], j); + vcpus[j * max_vcpu + i] = __vm_vcpu_add(vms[i], j); } /* Check stats read for every VM and VCPU */ for (i = 0; i < max_vm; ++i) { vm_stats_test(vms[i]); for (j = 0; j < max_vcpu; ++j) - vcpu_stats_test(vms[i], j); + vcpu_stats_test(vcpus[j * max_vcpu + i]); } for (i = 0; i < max_vm; ++i) -- cgit v1.2.3 From 3cc3eeb165a0afa91365fcf7fa284cd766d2f0bf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Feb 2022 17:01:58 -0800 Subject: KVM: selftests: Convert get-reg-list away from its "VCPU_ID" Track the vCPU's 'struct kvm_vcpu' object in get-reg-list instead of hardcoding '0' everywhere. 
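The barebones flavor of the same conversion is to keep the object returned by __vm_vcpu_add() instead of re-deriving the vCPU from a hardcoded '0'. A condensed sketch of the pattern the diff below applies; the function name is hypothetical, the init handling is simplified (the real test fills it via prepare_vcpu_init()), and vcpu_get_reg_list() is assumed to return a heap-allocated list, as its usage in the test suggests:

#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"

static struct kvm_reg_list *get_reg_list_for_default_target(void)
{
	struct kvm_vcpu_init init = { .target = -1, };
	struct kvm_reg_list *reg_list;
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;

	vm = vm_create_barebones();

	/* Keep the object __vm_vcpu_add() returns; no more bare '0' literals. */
	vcpu = __vm_vcpu_add(vm, 0);
	aarch64_vcpu_setup(vm, vcpu->id, &init);

	reg_list = vcpu_get_reg_list(vm, vcpu->id);
	kvm_vm_free(vm);
	return reg_list;
}
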
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 22f5e4124304..f0aec117faae 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -411,6 +411,7 @@ static void run_test(struct vcpu_config *c) struct kvm_vcpu_init init = { .target = -1, }; int new_regs = 0, missing_regs = 0, i, n; int failed_get = 0, failed_set = 0, failed_reject = 0; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct reg_sublist *s; @@ -418,11 +419,11 @@ static void run_test(struct vcpu_config *c) vm = vm_create_barebones(); prepare_vcpu_init(c, &init); - __vm_vcpu_add(vm, 0); - aarch64_vcpu_setup(vm, 0, &init); - finalize_vcpu(vm, 0, c); + vcpu = __vm_vcpu_add(vm, 0); + aarch64_vcpu_setup(vm, vcpu->id, &init); + finalize_vcpu(vm, vcpu->id, c); - reg_list = vcpu_get_reg_list(vm, 0); + reg_list = vcpu_get_reg_list(vm, vcpu->id); if (fixup_core_regs) core_reg_fixup(); @@ -458,7 +459,7 @@ static void run_test(struct vcpu_config *c) bool reject_reg = false; int ret; - ret = __vcpu_get_reg(vm, 0, reg_list->reg[i], &addr); + ret = __vcpu_get_reg(vm, vcpu->id, reg_list->reg[i], &addr); if (ret) { printf("%s: Failed to get ", config_name(c)); print_reg(c, reg.id); @@ -470,7 +471,7 @@ static void run_test(struct vcpu_config *c) for_each_sublist(c, s) { if (s->rejects_set && find_reg(s->rejects_set, s->rejects_set_n, reg.id)) { reject_reg = true; - ret = __vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_ONE_REG, ®); if (ret != -1 || errno != EPERM) { printf("%s: Failed to reject (ret=%d, errno=%d) ", config_name(c), ret, errno); print_reg(c, reg.id); @@ -482,7 +483,7 @@ static void run_test(struct vcpu_config *c) } if (!reject_reg) { - ret = __vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_ONE_REG, ®); if (ret) { printf("%s: Failed to set ", config_name(c)); print_reg(c, reg.id); -- cgit v1.2.3 From 376851f8953a28be237b0a99adef91f422fa8f19 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 20 Apr 2022 12:15:50 -0700 Subject: KVM: selftests: Stop hardcoding vCPU IDs in vcpu_width_config In preparation for taking a vCPU pointer in vCPU-scoped functions, grab the vCPU(s) created by __vm_vcpu_add() and use the ID from the vCPU object instead of hardcoding the ID in ioctl() invocations. Rename init1/init2 => init0/init1 to avoid having odd/confusing code where vcpu0 consumes init1 and vcpu1 consumes init2. Note, this change could easily be done when the functions are converted in the future, and/or the vcpu{0,1} vs. init{1,2} discrepancy could be ignored, but then there would be no opportunity to poke fun at the 1-based counting scheme. 
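Spelled out, the renaming makes the pairing mechanical: vcpu0 consumes init0 and vcpu1 consumes init1. A condensed restatement of the diff's first helper, with the error paths trimmed purely to show the pairing:

#include "kvm_util.h"

static int add_init_2vcpus(struct kvm_vcpu_init *init0,
			   struct kvm_vcpu_init *init1)
{
	struct kvm_vcpu *vcpu0, *vcpu1;
	struct kvm_vm *vm;
	int ret;

	vm = vm_create_barebones();

	/* vcpuN consumes initN, so 64-bit/32-bit mixes read naturally. */
	vcpu0 = __vm_vcpu_add(vm, 0);
	ret = __vcpu_ioctl(vm, vcpu0->id, KVM_ARM_VCPU_INIT, init0);
	if (!ret) {
		vcpu1 = __vm_vcpu_add(vm, 1);
		ret = __vcpu_ioctl(vm, vcpu1->id, KVM_ARM_VCPU_INIT, init1);
	}

	kvm_vm_free(vm);
	return ret;
}
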
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/aarch64/vcpu_width_config.c | 60 +++++++++++----------- 1 file changed, 31 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index 1dd856a58f5d..e4e66632f05c 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -15,24 +15,25 @@ /* - * Add a vCPU, run KVM_ARM_VCPU_INIT with @init1, and then - * add another vCPU, and run KVM_ARM_VCPU_INIT with @init2. + * Add a vCPU, run KVM_ARM_VCPU_INIT with @init0, and then + * add another vCPU, and run KVM_ARM_VCPU_INIT with @init1. */ -static int add_init_2vcpus(struct kvm_vcpu_init *init1, - struct kvm_vcpu_init *init2) +static int add_init_2vcpus(struct kvm_vcpu_init *init0, + struct kvm_vcpu_init *init1) { + struct kvm_vcpu *vcpu0, *vcpu1; struct kvm_vm *vm; int ret; vm = vm_create_barebones(); - __vm_vcpu_add(vm, 0); - ret = __vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); + vcpu0 = __vm_vcpu_add(vm, 0); + ret = __vcpu_ioctl(vm, vcpu0->id, KVM_ARM_VCPU_INIT, init0); if (ret) goto free_exit; - __vm_vcpu_add(vm, 1); - ret = __vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); + vcpu1 = __vm_vcpu_add(vm, 1); + ret = __vcpu_ioctl(vm, vcpu1->id, KVM_ARM_VCPU_INIT, init1); free_exit: kvm_vm_free(vm); @@ -40,25 +41,26 @@ free_exit: } /* - * Add two vCPUs, then run KVM_ARM_VCPU_INIT for one vCPU with @init1, - * and run KVM_ARM_VCPU_INIT for another vCPU with @init2. + * Add two vCPUs, then run KVM_ARM_VCPU_INIT for one vCPU with @init0, + * and run KVM_ARM_VCPU_INIT for another vCPU with @init1. */ -static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init1, - struct kvm_vcpu_init *init2) +static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init0, + struct kvm_vcpu_init *init1) { + struct kvm_vcpu *vcpu0, *vcpu1; struct kvm_vm *vm; int ret; vm = vm_create_barebones(); - __vm_vcpu_add(vm, 0); - __vm_vcpu_add(vm, 1); + vcpu0 = __vm_vcpu_add(vm, 0); + vcpu1 = __vm_vcpu_add(vm, 1); - ret = __vcpu_ioctl(vm, 0, KVM_ARM_VCPU_INIT, init1); + ret = __vcpu_ioctl(vm, vcpu0->id, KVM_ARM_VCPU_INIT, init0); if (ret) goto free_exit; - ret = __vcpu_ioctl(vm, 1, KVM_ARM_VCPU_INIT, init2); + ret = __vcpu_ioctl(vm, vcpu1->id, KVM_ARM_VCPU_INIT, init1); free_exit: kvm_vm_free(vm); @@ -76,7 +78,7 @@ free_exit: */ int main(void) { - struct kvm_vcpu_init init1, init2; + struct kvm_vcpu_init init0, init1; struct kvm_vm *vm; int ret; @@ -85,36 +87,36 @@ int main(void) exit(KSFT_SKIP); } - /* Get the preferred target type and copy that to init2 for later use */ + /* Get the preferred target type and copy that to init1 for later use */ vm = vm_create_barebones(); - vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init1); + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init0); kvm_vm_free(vm); - init2 = init1; + init1 = init0; /* Test with 64bit vCPUs */ - ret = add_init_2vcpus(&init1, &init1); + ret = add_init_2vcpus(&init0, &init0); TEST_ASSERT(ret == 0, "Configuring 64bit EL1 vCPUs failed unexpectedly"); - ret = add_2vcpus_init_2vcpus(&init1, &init1); + ret = add_2vcpus_init_2vcpus(&init0, &init0); TEST_ASSERT(ret == 0, "Configuring 64bit EL1 vCPUs failed unexpectedly"); /* Test with 32bit vCPUs */ - init1.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); - ret = add_init_2vcpus(&init1, &init1); + init0.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init0, &init0); TEST_ASSERT(ret == 0, 
"Configuring 32bit EL1 vCPUs failed unexpectedly"); - ret = add_2vcpus_init_2vcpus(&init1, &init1); + ret = add_2vcpus_init_2vcpus(&init0, &init0); TEST_ASSERT(ret == 0, "Configuring 32bit EL1 vCPUs failed unexpectedly"); /* Test with mixed-width vCPUs */ - init1.features[0] = 0; - init2.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); - ret = add_init_2vcpus(&init1, &init2); + init0.features[0] = 0; + init1.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init0, &init1); TEST_ASSERT(ret != 0, "Configuring mixed-width vCPUs worked unexpectedly"); - ret = add_2vcpus_init_2vcpus(&init1, &init2); + ret = add_2vcpus_init_2vcpus(&init0, &init1); TEST_ASSERT(ret != 0, "Configuring mixed-width vCPUs worked unexpectedly"); -- cgit v1.2.3 From df84cef531ca481cbc1dbce84eb13efc6623e4e1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 13:38:12 -0800 Subject: KVM: selftests: Stop conflating vCPU index and ID in perf tests Track vCPUs by their 'struct kvm_vcpu' object, and stop assuming that a vCPU's ID is the same as its index when referencing a vCPU's metadata. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/access_tracking_perf_test.c | 81 ++++++++++---------- tools/testing/selftests/kvm/demand_paging_test.c | 36 +++++---- tools/testing/selftests/kvm/dirty_log_perf_test.c | 39 +++++----- .../testing/selftests/kvm/include/perf_test_util.h | 7 +- tools/testing/selftests/kvm/lib/perf_test_util.c | 88 ++++++++++++---------- .../selftests/kvm/lib/x86_64/perf_test_util.c | 8 +- .../kvm/memslot_modification_stress_test.c | 10 +-- 7 files changed, 138 insertions(+), 131 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index d8909032317a..86a90222f913 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -74,7 +74,7 @@ struct test_params { uint64_t vcpu_memory_bytes; /* The number of vCPUs to create in the VM. */ - int vcpus; + int nr_vcpus; }; static uint64_t pread_uint64(int fd, const char *filename, uint64_t index) @@ -127,10 +127,12 @@ static void mark_page_idle(int page_idle_fd, uint64_t pfn) "Set page_idle bits for PFN 0x%" PRIx64, pfn); } -static void mark_vcpu_memory_idle(struct kvm_vm *vm, int vcpu_id) +static void mark_vcpu_memory_idle(struct kvm_vm *vm, + struct perf_test_vcpu_args *vcpu_args) { - uint64_t base_gva = perf_test_args.vcpu_args[vcpu_id].gva; - uint64_t pages = perf_test_args.vcpu_args[vcpu_id].pages; + int vcpu_idx = vcpu_args->vcpu_idx; + uint64_t base_gva = vcpu_args->gva; + uint64_t pages = vcpu_args->pages; uint64_t page; uint64_t still_idle = 0; uint64_t no_pfn = 0; @@ -138,7 +140,7 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, int vcpu_id) int pagemap_fd; /* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. 
*/ - if (overlap_memory_access && vcpu_id) + if (overlap_memory_access && vcpu_idx) return; page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); @@ -170,7 +172,7 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, int vcpu_id) */ TEST_ASSERT(no_pfn < pages / 100, "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.", - vcpu_id, no_pfn, pages); + vcpu_idx, no_pfn, pages); /* * Test that at least 90% of memory has been marked idle (the rest might @@ -183,17 +185,16 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, int vcpu_id) TEST_ASSERT(still_idle < pages / 10, "vCPU%d: Too many pages still idle (%"PRIu64 " out of %" PRIu64 ").\n", - vcpu_id, still_idle, pages); + vcpu_idx, still_idle, pages); close(page_idle_fd); close(pagemap_fd); } -static void assert_ucall(struct kvm_vm *vm, uint32_t vcpu_id, - uint64_t expected_ucall) +static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall) { struct ucall uc; - uint64_t actual_ucall = get_ucall(vm, vcpu_id, &uc); + uint64_t actual_ucall = get_ucall(vcpu->vm, vcpu->id, &uc); TEST_ASSERT(expected_ucall == actual_ucall, "Guest exited unexpectedly (expected ucall %" PRIu64 @@ -217,28 +218,29 @@ static bool spin_wait_for_next_iteration(int *current_iteration) static void vcpu_thread_main(struct perf_test_vcpu_args *vcpu_args) { + struct kvm_vcpu *vcpu = vcpu_args->vcpu; struct kvm_vm *vm = perf_test_args.vm; - int vcpu_id = vcpu_args->vcpu_id; + int vcpu_idx = vcpu_args->vcpu_idx; int current_iteration = 0; while (spin_wait_for_next_iteration(¤t_iteration)) { switch (READ_ONCE(iteration_work)) { case ITERATION_ACCESS_MEMORY: - vcpu_run(vm, vcpu_id); - assert_ucall(vm, vcpu_id, UCALL_SYNC); + vcpu_run(vm, vcpu->id); + assert_ucall(vcpu, UCALL_SYNC); break; case ITERATION_MARK_IDLE: - mark_vcpu_memory_idle(vm, vcpu_id); + mark_vcpu_memory_idle(vm, vcpu_args); break; }; - vcpu_last_completed_iteration[vcpu_id] = current_iteration; + vcpu_last_completed_iteration[vcpu_idx] = current_iteration; } } -static void spin_wait_for_vcpu(int vcpu_id, int target_iteration) +static void spin_wait_for_vcpu(int vcpu_idx, int target_iteration) { - while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_idx]) != target_iteration) { continue; } @@ -250,12 +252,11 @@ enum access_type { ACCESS_WRITE, }; -static void run_iteration(struct kvm_vm *vm, int vcpus, const char *description) +static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description) { struct timespec ts_start; struct timespec ts_elapsed; - int next_iteration; - int vcpu_id; + int next_iteration, i; /* Kick off the vCPUs by incrementing iteration. */ next_iteration = ++iteration; @@ -263,23 +264,23 @@ static void run_iteration(struct kvm_vm *vm, int vcpus, const char *description) clock_gettime(CLOCK_MONOTONIC, &ts_start); /* Wait for all vCPUs to finish the iteration. */ - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) - spin_wait_for_vcpu(vcpu_id, next_iteration); + for (i = 0; i < nr_vcpus; i++) + spin_wait_for_vcpu(i, next_iteration); ts_elapsed = timespec_elapsed(ts_start); pr_info("%-30s: %ld.%09lds\n", description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec); } -static void access_memory(struct kvm_vm *vm, int vcpus, enum access_type access, - const char *description) +static void access_memory(struct kvm_vm *vm, int nr_vcpus, + enum access_type access, const char *description) { perf_test_set_wr_fract(vm, (access == ACCESS_READ) ? 
INT_MAX : 1); iteration_work = ITERATION_ACCESS_MEMORY; - run_iteration(vm, vcpus, description); + run_iteration(vm, nr_vcpus, description); } -static void mark_memory_idle(struct kvm_vm *vm, int vcpus) +static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus) { /* * Even though this parallelizes the work across vCPUs, this is still a @@ -289,37 +290,37 @@ static void mark_memory_idle(struct kvm_vm *vm, int vcpus) */ pr_debug("Marking VM memory idle (slow)...\n"); iteration_work = ITERATION_MARK_IDLE; - run_iteration(vm, vcpus, "Mark memory idle"); + run_iteration(vm, nr_vcpus, "Mark memory idle"); } static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *params = arg; struct kvm_vm *vm; - int vcpus = params->vcpus; + int nr_vcpus = params->nr_vcpus; - vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, 1, + vm = perf_test_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1, params->backing_src, !overlap_memory_access); - perf_test_start_vcpu_threads(vcpus, vcpu_thread_main); + perf_test_start_vcpu_threads(nr_vcpus, vcpu_thread_main); pr_info("\n"); - access_memory(vm, vcpus, ACCESS_WRITE, "Populating memory"); + access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory"); /* As a control, read and write to the populated memory first. */ - access_memory(vm, vcpus, ACCESS_WRITE, "Writing to populated memory"); - access_memory(vm, vcpus, ACCESS_READ, "Reading from populated memory"); + access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory"); + access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory"); /* Repeat on memory that has been marked as idle. */ - mark_memory_idle(vm, vcpus); - access_memory(vm, vcpus, ACCESS_WRITE, "Writing to idle memory"); - mark_memory_idle(vm, vcpus); - access_memory(vm, vcpus, ACCESS_READ, "Reading from idle memory"); + mark_memory_idle(vm, nr_vcpus); + access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to idle memory"); + mark_memory_idle(vm, nr_vcpus); + access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory"); /* Set done to signal the vCPU threads to exit */ done = true; - perf_test_join_vcpu_threads(vcpus); + perf_test_join_vcpu_threads(nr_vcpus); perf_test_destroy_vm(vm); } @@ -347,7 +348,7 @@ int main(int argc, char *argv[]) struct test_params params = { .backing_src = DEFAULT_VM_MEM_SRC, .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, - .vcpus = 1, + .nr_vcpus = 1, }; int page_idle_fd; int opt; @@ -363,7 +364,7 @@ int main(int argc, char *argv[]) params.vcpu_memory_bytes = parse_size(optarg); break; case 'v': - params.vcpus = atoi(optarg); + params.nr_vcpus = atoi(optarg); break; case 'o': overlap_memory_access = true; diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index d8db0a37e973..c46110721088 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -44,28 +44,27 @@ static char *guest_data_prototype; static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { - int ret; - int vcpu_id = vcpu_args->vcpu_id; + struct kvm_vcpu *vcpu = vcpu_args->vcpu; struct kvm_vm *vm = perf_test_args.vm; - struct kvm_run *run; + int vcpu_idx = vcpu_args->vcpu_idx; + struct kvm_run *run = vcpu->run; struct timespec start; struct timespec ts_diff; - - run = vcpu_state(vm, vcpu_id); + int ret; clock_gettime(CLOCK_MONOTONIC, &start); /* Let the guest access its memory */ - ret = _vcpu_run(vm, vcpu_id); + ret = _vcpu_run(vm, vcpu->id); TEST_ASSERT(ret 
== 0, "vcpu_run failed: %d\n", ret); - if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) { + if (get_ucall(vm, vcpu->id, NULL) != UCALL_SYNC) { TEST_ASSERT(false, "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); } ts_diff = timespec_elapsed(start); - PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, + PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_idx, ts_diff.tv_sec, ts_diff.tv_nsec); } @@ -285,8 +284,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec ts_diff; int *pipefds = NULL; struct kvm_vm *vm; - int vcpu_id; - int r; + int r, i; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); @@ -309,12 +307,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) pipefds = malloc(sizeof(int) * nr_vcpus * 2); TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + for (i = 0; i < nr_vcpus; i++) { struct perf_test_vcpu_args *vcpu_args; void *vcpu_hva; void *vcpu_alias; - vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + vcpu_args = &perf_test_args.vcpu_args[i]; /* Cache the host addresses of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa); @@ -324,13 +322,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) * Set up user fault fd to handle demand paging * requests. */ - r = pipe2(&pipefds[vcpu_id * 2], + r = pipe2(&pipefds[i * 2], O_CLOEXEC | O_NONBLOCK); TEST_ASSERT(!r, "Failed to set up pipefd"); - setup_demand_paging(vm, &uffd_handler_threads[vcpu_id], - pipefds[vcpu_id * 2], p->uffd_mode, - p->uffd_delay, &uffd_args[vcpu_id], + setup_demand_paging(vm, &uffd_handler_threads[i], + pipefds[i * 2], p->uffd_mode, + p->uffd_delay, &uffd_args[i], vcpu_hva, vcpu_alias, vcpu_args->pages * perf_test_args.guest_page_size); } @@ -350,11 +348,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) char c; /* Tell the user fault fd handler threads to quit */ - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - r = write(pipefds[vcpu_id * 2 + 1], &c, 1); + for (i = 0; i < nr_vcpus; i++) { + r = write(pipefds[i * 2 + 1], &c, 1); TEST_ASSERT(r == 1, "Unable to write to pipefd"); - pthread_join(uffd_handler_threads[vcpu_id], NULL); + pthread_join(uffd_handler_threads[i], NULL); } } diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 8aaa0949707a..24f6d836499b 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -68,44 +68,45 @@ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { - int ret; + struct kvm_vcpu *vcpu = vcpu_args->vcpu; struct kvm_vm *vm = perf_test_args.vm; + int vcpu_idx = vcpu_args->vcpu_idx; uint64_t pages_count = 0; struct kvm_run *run; struct timespec start; struct timespec ts_diff; struct timespec total = (struct timespec){0}; struct timespec avg; - int vcpu_id = vcpu_args->vcpu_id; + int ret; - run = vcpu_state(vm, vcpu_id); + run = vcpu->run; while (!READ_ONCE(host_quit)) { int current_iteration = READ_ONCE(iteration); clock_gettime(CLOCK_MONOTONIC, &start); - ret = _vcpu_run(vm, vcpu_id); + ret = _vcpu_run(vm, vcpu->id); ts_diff = timespec_elapsed(start); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC, + TEST_ASSERT(get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC, "Invalid guest sync 
status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); - pr_debug("Got sync event from vCPU %d\n", vcpu_id); - vcpu_last_completed_iteration[vcpu_id] = current_iteration; + pr_debug("Got sync event from vCPU %d\n", vcpu_idx); + vcpu_last_completed_iteration[vcpu_idx] = current_iteration; pr_debug("vCPU %d updated last completed iteration to %d\n", - vcpu_id, vcpu_last_completed_iteration[vcpu_id]); + vcpu->id, vcpu_last_completed_iteration[vcpu_idx]); if (current_iteration) { pages_count += vcpu_args->pages; total = timespec_add(total, ts_diff); pr_debug("vCPU %d iteration %d dirty memory time: %ld.%.9lds\n", - vcpu_id, current_iteration, ts_diff.tv_sec, + vcpu_idx, current_iteration, ts_diff.tv_sec, ts_diff.tv_nsec); } else { pr_debug("vCPU %d iteration %d populate memory time: %ld.%.9lds\n", - vcpu_id, current_iteration, ts_diff.tv_sec, + vcpu_idx, current_iteration, ts_diff.tv_sec, ts_diff.tv_nsec); } @@ -113,9 +114,9 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) !READ_ONCE(host_quit)) {} } - avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]); + avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_idx]); pr_debug("\nvCPU %d dirtied 0x%lx pages over %d iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", - vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id], + vcpu_idx, pages_count, vcpu_last_completed_iteration[vcpu_idx], total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec); } @@ -207,13 +208,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) uint64_t guest_num_pages; uint64_t host_num_pages; uint64_t pages_per_slot; - int vcpu_id; struct timespec start; struct timespec ts_diff; struct timespec get_dirty_log_total = (struct timespec){0}; struct timespec vcpu_dirty_total = (struct timespec){0}; struct timespec avg; struct timespec clear_dirty_log_total = (struct timespec){0}; + int i; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, p->slots, p->backing_src, @@ -239,15 +240,15 @@ static void run_test(enum vm_guest_mode mode, void *arg) host_quit = false; clock_gettime(CLOCK_MONOTONIC, &start); - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) - vcpu_last_completed_iteration[vcpu_id] = -1; + for (i = 0; i < nr_vcpus; i++) + vcpu_last_completed_iteration[i] = -1; perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); /* Allow the vCPUs to populate memory */ pr_debug("Starting iteration %d - Populating\n", iteration); - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != + for (i = 0; i < nr_vcpus; i++) { + while (READ_ONCE(vcpu_last_completed_iteration[i]) != iteration) ; } @@ -272,8 +273,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration++; pr_debug("Starting iteration %d\n", iteration); - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) + for (i = 0; i < nr_vcpus; i++) { + while (READ_ONCE(vcpu_last_completed_iteration[i]) != iteration) ; } diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index d822cb670f1c..eaa88df0555a 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -25,7 +25,8 @@ struct perf_test_vcpu_args { uint64_t pages; /* Only used by the host userspace part of the vCPU thread */ - int vcpu_id; + struct kvm_vcpu *vcpu; + int vcpu_idx; }; struct perf_test_args { @@ -44,7 +45,7 @@ struct perf_test_args 
{ extern struct perf_test_args perf_test_args; -struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, +struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src, bool partition_vcpu_memory_access); @@ -57,6 +58,6 @@ void perf_test_join_vcpu_threads(int vcpus); void perf_test_guest_code(uint32_t vcpu_id); uint64_t perf_test_nested_pages(int nr_vcpus); -void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus); +void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]); #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 05f65c9292eb..91a89553afdd 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -17,8 +17,8 @@ struct perf_test_args perf_test_args; static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; struct vcpu_thread { - /* The id of the vCPU. */ - int vcpu_id; + /* The index of the vCPU. */ + int vcpu_idx; /* The pthread backing the vCPU. */ pthread_t thread; @@ -36,24 +36,26 @@ static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *); /* Set to true once all vCPU threads are up and running. */ static bool all_vcpu_threads_running; +static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + /* * Continuously write to the first 8 bytes of each page in the * specified region. */ -void perf_test_guest_code(uint32_t vcpu_id) +void perf_test_guest_code(uint32_t vcpu_idx) { struct perf_test_args *pta = &perf_test_args; - struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_id]; + struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_idx]; uint64_t gva; uint64_t pages; int i; - /* Make sure vCPU args data structure is not corrupt. */ - GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); - gva = vcpu_args->gva; pages = vcpu_args->pages; + /* Make sure vCPU args data structure is not corrupt. 
*/ + GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx); + while (true) { for (i = 0; i < pages; i++) { uint64_t addr = gva + (i * pta->guest_page_size); @@ -68,40 +70,43 @@ void perf_test_guest_code(uint32_t vcpu_id) } } -void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, +void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, + struct kvm_vcpu *vcpus[], uint64_t vcpu_memory_bytes, bool partition_vcpu_memory_access) { struct perf_test_args *pta = &perf_test_args; struct perf_test_vcpu_args *vcpu_args; - int vcpu_id; + int i; + + for (i = 0; i < nr_vcpus; i++) { + vcpu_args = &pta->vcpu_args[i]; - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - vcpu_args = &pta->vcpu_args[vcpu_id]; + vcpu_args->vcpu = vcpus[i]; + vcpu_args->vcpu_idx = i; - vcpu_args->vcpu_id = vcpu_id; if (partition_vcpu_memory_access) { vcpu_args->gva = guest_test_virt_mem + - (vcpu_id * vcpu_memory_bytes); + (i * vcpu_memory_bytes); vcpu_args->pages = vcpu_memory_bytes / pta->guest_page_size; - vcpu_args->gpa = pta->gpa + (vcpu_id * vcpu_memory_bytes); + vcpu_args->gpa = pta->gpa + (i * vcpu_memory_bytes); } else { vcpu_args->gva = guest_test_virt_mem; - vcpu_args->pages = (vcpus * vcpu_memory_bytes) / + vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) / pta->guest_page_size; vcpu_args->gpa = pta->gpa; } - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + vcpu_args_set(vm, vcpus[i]->id, 1, i); pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_args->gpa, vcpu_args->gpa + + i, vcpu_args->gpa, vcpu_args->gpa + (vcpu_args->pages * pta->guest_page_size)); } } -struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, +struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src, bool partition_vcpu_memory_access) @@ -125,7 +130,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, pta->guest_page_size = vm_guest_mode_params[mode].page_size; guest_num_pages = vm_adjust_num_guest_pages(mode, - (vcpus * vcpu_memory_bytes) / pta->guest_page_size); + (nr_vcpus * vcpu_memory_bytes) / pta->guest_page_size); TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0, "Guest memory size is not host page size aligned."); @@ -140,16 +145,16 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, * in-memory data structures. */ if (pta->nested) - slot0_pages += perf_test_nested_pages(vcpus); + slot0_pages += perf_test_nested_pages(nr_vcpus); /* * Pass guest_num_pages to populate the page tables for test memory. * The memory is also added to memslot 0, but that's a benign side * effect as KVM allows aliasing HVAs in meslots. 
*/ - vm = __vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, + vm = __vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, slot0_pages + guest_num_pages, 0, - perf_test_guest_code, NULL); + perf_test_guest_code, vcpus); pta->vm = vm; @@ -171,11 +176,10 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, TEST_ASSERT(guest_num_pages < region_end_gfn, "Requested more guest memory than address space allows.\n" " guest pages: %" PRIx64 " max gfn: %" PRIx64 - " vcpus: %d wss: %" PRIx64 "]\n", - guest_num_pages, region_end_gfn - 1, vcpus, - vcpu_memory_bytes); + " nr_vcpus: %d wss: %" PRIx64 "]\n", + guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes); - pta->gpa = (region_end_gfn - guest_num_pages) * pta->guest_page_size; + pta->gpa = (region_end_gfn - guest_num_pages - 1) * pta->guest_page_size; pta->gpa = align_down(pta->gpa, backing_src_pagesz); #ifdef __s390x__ /* Align to 1M (segment size) */ @@ -198,11 +202,12 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, /* Do mapping for the demand paging memory slot */ virt_map(vm, guest_test_virt_mem, pta->gpa, guest_num_pages); - perf_test_setup_vcpus(vm, vcpus, vcpu_memory_bytes, partition_vcpu_memory_access); + perf_test_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes, + partition_vcpu_memory_access); if (pta->nested) { pr_info("Configuring vCPUs to run in L2 (nested).\n"); - perf_test_setup_nested(vm, vcpus); + perf_test_setup_nested(vm, nr_vcpus, vcpus); } ucall_init(vm, NULL); @@ -230,7 +235,7 @@ uint64_t __weak perf_test_nested_pages(int nr_vcpus) return 0; } -void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus) +void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus) { pr_info("%s() not support on this architecture, skipping.\n", __func__); exit(KSFT_SKIP); @@ -251,39 +256,40 @@ static void *vcpu_thread_main(void *data) while (!READ_ONCE(all_vcpu_threads_running)) ; - vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]); + vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_idx]); return NULL; } -void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)) +void perf_test_start_vcpu_threads(int nr_vcpus, + void (*vcpu_fn)(struct perf_test_vcpu_args *)) { - int vcpu_id; + int i; vcpu_thread_fn = vcpu_fn; WRITE_ONCE(all_vcpu_threads_running, false); - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id]; + for (i = 0; i < nr_vcpus; i++) { + struct vcpu_thread *vcpu = &vcpu_threads[i]; - vcpu->vcpu_id = vcpu_id; + vcpu->vcpu_idx = i; WRITE_ONCE(vcpu->running, false); pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu); } - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - while (!READ_ONCE(vcpu_threads[vcpu_id].running)) + for (i = 0; i < nr_vcpus; i++) { + while (!READ_ONCE(vcpu_threads[i].running)) ; } WRITE_ONCE(all_vcpu_threads_running, true); } -void perf_test_join_vcpu_threads(int vcpus) +void perf_test_join_vcpu_threads(int nr_vcpus) { - int vcpu_id; + int i; - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) - pthread_join(vcpu_threads[vcpu_id].thread, NULL); + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i].thread, NULL); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c index f525427a37c4..446820a549ba 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c +++ 
b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c @@ -77,7 +77,7 @@ void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) nested_identity_map_1g(vmx, vm, start, end - start); } -void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus) +void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { struct vmx_pages *vmx, *vmx0 = NULL; struct kvm_regs regs; @@ -103,9 +103,9 @@ void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus) * Override the vCPU to run perf_test_l1_guest_code() which will * bounce it into L2 before calling perf_test_guest_code(). */ - vcpu_regs_get(vm, vcpu_id, ®s); + vcpu_regs_get(vm, vcpus[vcpu_id]->id, ®s); regs.rip = (unsigned long) perf_test_l1_guest_code; - vcpu_regs_set(vm, vcpu_id, ®s); - vcpu_args_set(vm, vcpu_id, 2, vmx_gva, vcpu_id); + vcpu_regs_set(vm, vcpus[vcpu_id]->id, ®s); + vcpu_args_set(vm, vcpus[vcpu_id]->id, 2, vmx_gva, vcpu_id); } } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 1410d0a9141a..a3efb3182119 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -38,19 +38,19 @@ static bool run_vcpus = true; static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { - int ret; - int vcpu_id = vcpu_args->vcpu_id; + struct kvm_vcpu *vcpu = vcpu_args->vcpu; struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; + int ret; - run = vcpu_state(vm, vcpu_id); + run = vcpu->run; /* Let the guest access its memory until a stop signal is received */ while (READ_ONCE(run_vcpus)) { - ret = _vcpu_run(vm, vcpu_id); + ret = _vcpu_run(vm, vcpu->id); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - if (get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC) + if (get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC) continue; TEST_ASSERT(false, -- cgit v1.2.3 From 64a1aacc89700f675c7648b0962519f57630b62a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 16:44:34 -0800 Subject: KVM: selftests: Remove vcpu_get() usage from dirty_log_test Grab the vCPU from vm_vcpu_add() directly instead of doing vcpu_get() after the fact. This will allow removing vcpu_get() entirely. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 1a5c01c65044..5db56140a995 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -665,7 +665,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) } } -static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, +static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu, uint64_t extra_mem_pages, void *guest_code) { struct kvm_vm *vm; @@ -676,7 +676,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, vm = __vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); log_mode_create_vm_done(vm); - vm_vcpu_add(vm, vcpuid, guest_code); + *vcpu = vm_vcpu_add(vm, 0, guest_code); return vm; } @@ -710,10 +710,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) * (e.g., 64K page size guest will need even less memory for * page tables). 
*/ - vm = create_vm(mode, 0, - 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), - guest_code); - vcpu = vcpu_get(vm, 0); + vm = create_vm(mode, &vcpu, + 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code); guest_page_size = vm_get_page_size(vm); /* -- cgit v1.2.3 From 5260db3eb8f96c0dc631b0f41035a5f1957d9a58 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 13:53:23 -0800 Subject: KVM: selftests: Require vCPU output array when creating VM with vCPUs Require the caller of __vm_create_with_vcpus() to provide a non-NULL array of vCPUs now that all callers do so. It's extremely unlikely a test will have a legitimate use case for creating a VM with vCPUs without wanting to do something with those vCPUs, and if there is such a use case, requiring that one-off test to provide a dummy array is a minor annoyance. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index c338713d8245..351c167e7b99 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -302,10 +302,11 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus struct kvm_vcpu *vcpus[]) { uint64_t vcpu_pages, extra_pg_pages, pages; - struct kvm_vcpu *vcpu; struct kvm_vm *vm; int i; + TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); + /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; @@ -326,11 +327,8 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus vm = __vm_create(mode, pages); - for (i = 0; i < nr_vcpus; ++i) { - vcpu = vm_vcpu_add(vm, i, guest_code); - if (vcpus) - vcpus[i] = vcpu; - } + for (i = 0; i < nr_vcpus; ++i) + vcpus[i] = vm_vcpu_add(vm, i, guest_code); return vm; } -- cgit v1.2.3 From 768e9a61856b75de08f5efa5813bb3e7f16ec271 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jun 2022 13:41:33 -0700 Subject: KVM: selftests: Purge vm+vcpu_id == vcpu silliness Take a vCPU directly instead of a VM+vcpu pair in all vCPU-scoped helpers and ioctls. 
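The sweep is mechanical: every vCPU-scoped helper that took a (vm, vcpu_id) pair now takes the vCPU object, and the VM rides along as vcpu->vm. A minimal before/after sketch with a hypothetical function name, using only helpers visible in the diffs below:

#include "test_util.h"
#include "kvm_util.h"

static void run_and_expect_done(struct kvm_vcpu *vcpu)
{
	struct ucall uc;

	/* Before this commit: vcpu_run(vcpu->vm, vcpu->id); */
	vcpu_run(vcpu);

	/* Before this commit: get_ucall(vcpu->vm, vcpu->id, &uc); */
	switch (get_ucall(vcpu, &uc)) {
	case UCALL_SYNC:
	case UCALL_DONE:
		break;
	default:
		TEST_FAIL("Unexpected ucall from guest");
	}
}
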
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 10 +- .../selftests/kvm/aarch64/debug-exceptions.c | 8 +- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 16 +- tools/testing/selftests/kvm/aarch64/hypercalls.c | 18 +- tools/testing/selftests/kvm/aarch64/psci_test.c | 16 +- .../selftests/kvm/aarch64/vcpu_width_config.c | 8 +- tools/testing/selftests/kvm/aarch64/vgic_init.c | 2 +- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 8 +- .../selftests/kvm/access_tracking_perf_test.c | 4 +- tools/testing/selftests/kvm/demand_paging_test.c | 5 +- tools/testing/selftests/kvm/dirty_log_perf_test.c | 7 +- tools/testing/selftests/kvm/dirty_log_test.c | 10 +- .../testing/selftests/kvm/hardware_disable_test.c | 2 +- .../selftests/kvm/include/aarch64/processor.h | 4 +- .../testing/selftests/kvm/include/kvm_util_base.h | 216 ++++++++++----------- tools/testing/selftests/kvm/include/ucall_common.h | 2 +- tools/testing/selftests/kvm/include/x86_64/evmcs.h | 2 +- .../selftests/kvm/include/x86_64/processor.h | 77 ++++---- .../testing/selftests/kvm/kvm_binary_stats_test.c | 2 +- tools/testing/selftests/kvm/kvm_page_table_test.c | 7 +- .../testing/selftests/kvm/lib/aarch64/processor.c | 47 ++--- tools/testing/selftests/kvm/lib/aarch64/ucall.c | 8 +- tools/testing/selftests/kvm/lib/kvm_util.c | 162 ++++------------ tools/testing/selftests/kvm/lib/perf_test_util.c | 2 +- tools/testing/selftests/kvm/lib/riscv/processor.c | 94 +++++---- tools/testing/selftests/kvm/lib/riscv/ucall.c | 13 +- .../selftests/kvm/lib/s390x/diag318_test_handler.c | 2 +- tools/testing/selftests/kvm/lib/s390x/processor.c | 22 +-- tools/testing/selftests/kvm/lib/s390x/ucall.c | 8 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 143 +++++++------- tools/testing/selftests/kvm/lib/x86_64/ucall.c | 10 +- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 4 +- .../testing/selftests/kvm/max_guest_memory_test.c | 20 +- .../kvm/memslot_modification_stress_test.c | 5 +- tools/testing/selftests/kvm/memslot_perf_test.c | 4 +- tools/testing/selftests/kvm/rseq_test.c | 4 +- tools/testing/selftests/kvm/s390x/memop.c | 8 +- tools/testing/selftests/kvm/s390x/resets.c | 28 +-- tools/testing/selftests/kvm/s390x/sync_regs_test.c | 22 +-- tools/testing/selftests/kvm/s390x/tprot.c | 4 +- .../testing/selftests/kvm/set_memory_region_test.c | 8 +- tools/testing/selftests/kvm/steal_time.c | 20 +- .../selftests/kvm/system_counter_offset_test.c | 13 +- tools/testing/selftests/kvm/x86_64/amx_test.c | 22 +-- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 14 +- .../selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 8 +- tools/testing/selftests/kvm/x86_64/debug_regs.c | 30 +-- .../selftests/kvm/x86_64/emulator_error_test.c | 20 +- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 28 +-- .../selftests/kvm/x86_64/fix_hypercall_test.c | 6 +- tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 14 +- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 8 +- .../testing/selftests/kvm/x86_64/hyperv_features.c | 28 +-- .../testing/selftests/kvm/x86_64/hyperv_svm_test.c | 8 +- .../testing/selftests/kvm/x86_64/kvm_clock_test.c | 6 +- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 10 +- tools/testing/selftests/kvm/x86_64/mmu_role_test.c | 10 +- .../selftests/kvm/x86_64/platform_info_test.c | 14 +- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 8 +- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 4 +- .../testing/selftests/kvm/x86_64/set_sregs_test.c | 16 +- 
tools/testing/selftests/kvm/x86_64/smm_test.c | 18 +- tools/testing/selftests/kvm/x86_64/state_test.c | 18 +- .../selftests/kvm/x86_64/svm_int_ctl_test.c | 8 +- .../kvm/x86_64/svm_nested_soft_inject_test.c | 10 +- .../testing/selftests/kvm/x86_64/svm_vmcall_test.c | 6 +- .../testing/selftests/kvm/x86_64/sync_regs_test.c | 36 ++-- .../selftests/kvm/x86_64/triple_fault_event_test.c | 16 +- tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 14 +- .../selftests/kvm/x86_64/tsc_scaling_sync.c | 6 +- .../selftests/kvm/x86_64/userspace_io_test.c | 8 +- .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 22 +-- .../selftests/kvm/x86_64/vmx_apic_access_test.c | 6 +- .../kvm/x86_64/vmx_close_while_nested_test.c | 6 +- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 6 +- .../vmx_exception_with_invalid_guest_state.c | 10 +- .../kvm/x86_64/vmx_invalid_nested_guest_state.c | 12 +- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 11 +- .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 20 +- .../kvm/x86_64/vmx_preemption_timer_test.c | 18 +- .../kvm/x86_64/vmx_set_nested_state_test.c | 12 +- .../selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 6 +- .../testing/selftests/kvm/x86_64/xapic_ipi_test.c | 10 +- .../selftests/kvm/x86_64/xapic_state_test.c | 42 ++-- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 38 ++-- .../testing/selftests/kvm/x86_64/xen_vmcall_test.c | 6 +- tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 6 +- 87 files changed, 791 insertions(+), 909 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index a873d9adc558..ca4c08b4e353 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -218,14 +218,14 @@ static void *test_vcpu_run(void *arg) struct kvm_vm *vm = vcpu->vm; struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpu_idx]; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); /* Currently, any exit from guest is an indication of completion */ pthread_mutex_lock(&vcpu_done_map_lock); set_bit(vcpu_idx, vcpu_done_map); pthread_mutex_unlock(&vcpu_done_map_lock); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: case UCALL_DONE: break; @@ -345,9 +345,9 @@ static void test_run(struct kvm_vm *vm) static void test_init_timer_irq(struct kvm_vm *vm) { /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ - vcpu_device_attr_get(vm, vcpus[0]->id, KVM_ARM_VCPU_TIMER_CTRL, + vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); - vcpu_device_attr_get(vm, vcpus[0]->id, KVM_ARM_VCPU_TIMER_CTRL, + vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); sync_global_to_guest(vm, ptimer_irq); @@ -370,7 +370,7 @@ static struct kvm_vm *test_vm_create(void) vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); for (i = 0; i < nr_vcpus; i++) - vcpu_init_descriptor_tables(vm, vcpus[i]->id); + vcpu_init_descriptor_tables(vcpus[i]); ucall_init(vm, NULL); test_init_timer_irq(vm); diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 2fe13e117dba..c27352b90ccf 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -242,7 +242,7 @@ static int debug_version(struct kvm_vcpu *vcpu) { uint64_t id_aa64dfr0; - vcpu_get_reg(vcpu->vm, 
vcpu->id, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &id_aa64dfr0); return id_aa64dfr0 & 0xf; } @@ -257,7 +257,7 @@ int main(int argc, char *argv[]) ucall_init(vm, NULL); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); if (debug_version(vcpu) < 6) { print_skip("Armv8 debug architecture not supported."); @@ -277,9 +277,9 @@ int main(int argc, char *argv[]) ESR_EC_SVC64, guest_svc_handler); for (stage = 0; stage < 11; stage++) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == stage, "Stage %d: Unexpected sync ucall, got %lx", diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index f0aec117faae..f0f83ffda344 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -377,7 +377,7 @@ static void prepare_vcpu_init(struct vcpu_config *c, struct kvm_vcpu_init *init) init->features[s->feature / 32] |= 1 << (s->feature % 32); } -static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid, struct vcpu_config *c) +static void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_config *c) { struct reg_sublist *s; int feature; @@ -385,7 +385,7 @@ static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid, struct vcpu_config for_each_sublist(c, s) { if (s->finalize) { feature = s->feature; - vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + vcpu_ioctl(vcpu, KVM_ARM_VCPU_FINALIZE, &feature); } } } @@ -420,10 +420,10 @@ static void run_test(struct vcpu_config *c) vm = vm_create_barebones(); prepare_vcpu_init(c, &init); vcpu = __vm_vcpu_add(vm, 0); - aarch64_vcpu_setup(vm, vcpu->id, &init); - finalize_vcpu(vm, vcpu->id, c); + aarch64_vcpu_setup(vcpu, &init); + finalize_vcpu(vcpu, c); - reg_list = vcpu_get_reg_list(vm, vcpu->id); + reg_list = vcpu_get_reg_list(vcpu); if (fixup_core_regs) core_reg_fixup(); @@ -459,7 +459,7 @@ static void run_test(struct vcpu_config *c) bool reject_reg = false; int ret; - ret = __vcpu_get_reg(vm, vcpu->id, reg_list->reg[i], &addr); + ret = __vcpu_get_reg(vcpu, reg_list->reg[i], &addr); if (ret) { printf("%s: Failed to get ", config_name(c)); print_reg(c, reg.id); @@ -471,7 +471,7 @@ static void run_test(struct vcpu_config *c) for_each_sublist(c, s) { if (s->rejects_set && find_reg(s->rejects_set, s->rejects_set_n, reg.id)) { reject_reg = true; - ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_ONE_REG, &reg); + ret = __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); if (ret != -1 || errno != EPERM) { printf("%s: Failed to reject (ret=%d, errno=%d) ", config_name(c), ret, errno); print_reg(c, reg.id); @@ -483,7 +483,7 @@ static void run_test(struct vcpu_config *c) } if (!reject_reg) { - ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_ONE_REG, &reg); + ret = __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); if (ret) { printf("%s: Failed to set ", config_name(c)); print_reg(c, reg.id); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index fefa39dc9bc8..5fce4969cbb9 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -158,7 +158,7 @@ static void steal_time_init(struct kvm_vcpu *vcpu) gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE); vm_userspace_mem_region_add(vcpu->vm,
VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); - vcpu_device_attr_set(vcpu->vm, vcpu->id, KVM_ARM_VCPU_PVTIME_CTRL, + vcpu_device_attr_set(vcpu, KVM_ARM_VCPU_PVTIME_CTRL, KVM_ARM_VCPU_PVTIME_IPA, &st_ipa); } @@ -172,18 +172,18 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; /* First 'read' should be an upper limit of the features supported */ - vcpu_get_reg(vcpu->vm, vcpu->id, reg_info->reg, &val); + vcpu_get_reg(vcpu, reg_info->reg, &val); TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx\n", reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); /* Test a 'write' by disabling all the features of the register map */ - ret = __vcpu_set_reg(vcpu->vm, vcpu->id, reg_info->reg, 0); + ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); TEST_ASSERT(ret == 0, "Failed to clear all the features of reg: 0x%lx; ret: %d\n", reg_info->reg, errno); - vcpu_get_reg(vcpu->vm, vcpu->id, reg_info->reg, &val); + vcpu_get_reg(vcpu, reg_info->reg, &val); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg); @@ -192,7 +192,7 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) * Avoid this check if all the bits are occupied. */ if (reg_info->max_feat_bit < 63) { - ret = __vcpu_set_reg(vcpu->vm, vcpu->id, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); + ret = __vcpu_set_reg(vcpu, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); TEST_ASSERT(ret != 0 && errno == EINVAL, "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx\n", errno, reg_info->reg); @@ -213,7 +213,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) * Before starting the VM, the test clears all the bits. * Check if that's still the case. */ - vcpu_get_reg(vcpu->vm, vcpu->id, reg_info->reg, &val); + vcpu_get_reg(vcpu, reg_info->reg, &val); TEST_ASSERT(val == 0, "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg); @@ -223,7 +223,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) * the registers and should return EBUSY. Set the registers and check for * the expected errno. 
*/ - ret = __vcpu_set_reg(vcpu->vm, vcpu->id, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); + ret = __vcpu_set_reg(vcpu, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); TEST_ASSERT(ret != 0 && errno == EBUSY, "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx\n", errno, reg_info->reg); @@ -281,9 +281,9 @@ static void test_run(void) test_fw_regs_before_vm_start(vcpu); while (!guest_done) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: test_guest_stage(&vm, &vcpu); break; diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index f4f73934351f..3e1bebe63adf 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -67,7 +67,7 @@ static void vcpu_power_off(struct kvm_vcpu *vcpu) .mp_state = KVM_MP_STATE_STOPPED, }; - vcpu_mp_state_set(vcpu->vm, vcpu->id, &mp_state); + vcpu_mp_state_set(vcpu, &mp_state); } static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source, @@ -92,8 +92,8 @@ static void enter_guest(struct kvm_vcpu *vcpu) { struct ucall uc; - vcpu_run(vcpu->vm, vcpu->id); - if (get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) + vcpu_run(vcpu); + if (get_ucall(vcpu, &uc) == UCALL_ABORT) TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); } @@ -102,8 +102,8 @@ static void assert_vcpu_reset(struct kvm_vcpu *vcpu) { uint64_t obs_pc, obs_x0; - vcpu_get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.pc), &obs_pc); - vcpu_get_reg(vcpu->vm, vcpu->id, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &obs_pc); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.regs[0]), &obs_x0); TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, "unexpected target cpu pc: %lx (expected: %lx)", @@ -143,11 +143,11 @@ static void host_test_cpu_on(void) */ vcpu_power_off(target); - vcpu_get_reg(vm, target->id, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); - vcpu_args_set(vm, source->id, 1, target_mpidr & MPIDR_HWID_BITMASK); + vcpu_get_reg(target, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); + vcpu_args_set(source, 1, target_mpidr & MPIDR_HWID_BITMASK); enter_guest(source); - if (get_ucall(vm, source->id, &uc) != UCALL_DONE) + if (get_ucall(source, &uc) != UCALL_DONE) TEST_FAIL("Unhandled ucall: %lu", uc.cmd); assert_vcpu_reset(target); diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index e4e66632f05c..dd5a1c4b49e0 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -28,12 +28,12 @@ static int add_init_2vcpus(struct kvm_vcpu_init *init0, vm = vm_create_barebones(); vcpu0 = __vm_vcpu_add(vm, 0); - ret = __vcpu_ioctl(vm, vcpu0->id, KVM_ARM_VCPU_INIT, init0); + ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); if (ret) goto free_exit; vcpu1 = __vm_vcpu_add(vm, 1); - ret = __vcpu_ioctl(vm, vcpu1->id, KVM_ARM_VCPU_INIT, init1); + ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); free_exit: kvm_vm_free(vm); @@ -56,11 +56,11 @@ static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init0, vcpu0 = __vm_vcpu_add(vm, 0); vcpu1 = __vm_vcpu_add(vm, 1); - ret = __vcpu_ioctl(vm, vcpu0->id, KVM_ARM_VCPU_INIT, init0); + ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); if (ret) goto free_exit; - ret = 
__vcpu_ioctl(vm, vcpu1->id, KVM_ARM_VCPU_INIT, init1); + ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); free_exit: kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 40504f05641d..6b9c9a391a01 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -70,7 +70,7 @@ static int run_vcpu(struct kvm_vcpu *vcpu) { ucall_init(vcpu->vm, NULL); - return __vcpu_run(vcpu->vm, vcpu->id) ? -errno : 0; + return __vcpu_run(vcpu) ? -errno : 0; } static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 111170201e9b..90dbba61d72a 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -759,12 +759,12 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) ucall_init(vm, NULL); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); /* Setup the guest args page (so it gets the args). */ args_gva = vm_vaddr_alloc_page(vm); memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); - vcpu_args_set(vm, vcpu->id, 1, args_gva); + vcpu_args_set(vcpu, 1, args_gva); gic_fd = vgic_v3_setup(vm, 1, nr_irqs, GICD_BASE_GPA, GICR_BASE_GPA); @@ -777,9 +777,9 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) guest_irq_handlers[args.eoi_split][args.level_sensitive]); while (1) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: kvm_inject_get_call(vm, &uc, &inject_args); run_guest_cmd(vcpu, gic_fd, &inject_args, &args); diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 86a90222f913..1c771378f7f4 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -194,7 +194,7 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall) { struct ucall uc; - uint64_t actual_ucall = get_ucall(vcpu->vm, vcpu->id, &uc); + uint64_t actual_ucall = get_ucall(vcpu, &uc); TEST_ASSERT(expected_ucall == actual_ucall, "Guest exited unexpectedly (expected ucall %" PRIu64 @@ -226,7 +226,7 @@ static void vcpu_thread_main(struct perf_test_vcpu_args *vcpu_args) while (spin_wait_for_next_iteration(&current_iteration)) { switch (READ_ONCE(iteration_work)) { case ITERATION_ACCESS_MEMORY: - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); assert_ucall(vcpu, UCALL_SYNC); break; case ITERATION_MARK_IDLE: diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index c46110721088..779ae54f89c4 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -45,7 +45,6 @@ static char *guest_data_prototype; static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; - struct kvm_vm *vm = perf_test_args.vm; int vcpu_idx = vcpu_args->vcpu_idx; struct kvm_run *run = vcpu->run; struct timespec start; @@ -55,9 +54,9 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) clock_gettime(CLOCK_MONOTONIC, &start); /* Let the guest access its memory */ - ret =
_vcpu_run(vm, vcpu->id); + ret = _vcpu_run(vcpu); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - if (get_ucall(vm, vcpu->id, NULL) != UCALL_SYNC) { + if (get_ucall(vcpu, NULL) != UCALL_SYNC) { TEST_ASSERT(false, "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 24f6d836499b..2027208e7d10 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -69,7 +69,6 @@ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; - struct kvm_vm *vm = perf_test_args.vm; int vcpu_idx = vcpu_args->vcpu_idx; uint64_t pages_count = 0; struct kvm_run *run; @@ -85,18 +84,18 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) int current_iteration = READ_ONCE(iteration); clock_gettime(CLOCK_MONOTONIC, &start); - ret = _vcpu_run(vm, vcpu->id); + ret = _vcpu_run(vcpu); ts_diff = timespec_elapsed(start); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - TEST_ASSERT(get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC, + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); pr_debug("Got sync event from vCPU %d\n", vcpu_idx); vcpu_last_completed_iteration[vcpu_idx] = current_iteration; pr_debug("vCPU %d updated last completed iteration to %d\n", - vcpu->id, vcpu_last_completed_iteration[vcpu_idx]); + vcpu_idx, vcpu_last_completed_iteration[vcpu_idx]); if (current_iteration) { pages_count += vcpu_args->pages; diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 5db56140a995..906e893375df 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -255,7 +255,7 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR), "vcpu run failed: errno=%d", err); - TEST_ASSERT(get_ucall(vcpu->vm, vcpu->id, NULL) == UCALL_SYNC, + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); @@ -346,7 +346,7 @@ static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, } /* Only have one vcpu */ - count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu->vm, vcpu->id), + count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu), slot, bitmap, num_pages, &fetch_index); cleared = kvm_vm_reset_dirty_ring(vcpu->vm); @@ -369,7 +369,7 @@ static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) struct kvm_run *run = vcpu->run; /* A ucall-sync or ring-full event is allowed */ - if (get_ucall(vcpu->vm, vcpu->id, NULL) == UCALL_SYNC) { + if (get_ucall(vcpu, NULL) == UCALL_SYNC) { /* We should allow this to continue */ ; } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL || @@ -521,7 +521,7 @@ static void *vcpu_worker(void *data) sigmask->len = 8; pthread_sigmask(0, NULL, sigset); sigdelset(sigset, SIG_IPI); - vcpu_ioctl(vm, vcpu->id, KVM_SET_SIGNAL_MASK, sigmask); + vcpu_ioctl(vcpu, KVM_SET_SIGNAL_MASK, sigmask); sigemptyset(sigset); sigaddset(sigset, SIG_IPI); @@ -533,7 +533,7 @@ static void *vcpu_worker(void *data) generate_random_array(guest_array, TEST_PAGES_PER_LOOP); pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the 
random pages */ - ret = __vcpu_run(vm, vcpu->id); + ret = __vcpu_run(vcpu); if (ret == -1 && errno == EINTR) { int sig = -1; sigwait(sigset, &sig); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 59bb43345a3e..2401577e3652 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -39,7 +39,7 @@ static void *run_vcpu(void *arg) struct kvm_vcpu *vcpu = arg; struct kvm_run *run = vcpu->run; - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(false, "%s: exited with reason %d: %s\n", __func__, run->exit_reason, diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index ba3e9066d990..a8124f9dd68a 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -47,7 +47,7 @@ #define MPIDR_HWID_BITMASK (0xff00fffffful) -void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init *init); +void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu_init *init, void *guest_code); @@ -101,7 +101,7 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, bool *ps4k, bool *ps16k, bool *ps64k); void vm_init_descriptor_tables(struct kvm_vm *vm); -void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); typedef void(*handler_fn)(struct ex_regs *); void vm_install_exception_handler(struct kvm_vm *vm, diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index f409bae336d5..640634bdba9a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -93,7 +93,7 @@ struct kvm_vm { continue; \ else -struct kvm_vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpu_id); struct userspace_mem_region * memslot2region(struct kvm_vm *vm, uint32_t memslot); @@ -196,12 +196,12 @@ int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg); #define vm_ioctl(vm, cmd, arg) _vm_ioctl(vm, cmd, #cmd, arg) -int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, +int __vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, void *arg); -void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, +void _vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, const char *name, void *arg); -#define vcpu_ioctl(vm, vcpuid, cmd, arg) \ - _vcpu_ioctl(vm, vcpuid, cmd, #cmd, arg) +#define vcpu_ioctl(vcpu, cmd, arg) \ + _vcpu_ioctl(vcpu, cmd, #cmd, arg) /* * Looks up and returns the value corresponding to the capability @@ -288,7 +288,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); -struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm 
*vm, int nr_pages); vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); @@ -300,143 +300,132 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); -struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); -int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_run *vcpu_state(struct kvm_vcpu *vcpu); +void vcpu_run(struct kvm_vcpu *vcpu); +int _vcpu_run(struct kvm_vcpu *vcpu); -static inline int __vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +static inline int __vcpu_run(struct kvm_vcpu *vcpu) { - return __vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + return __vcpu_ioctl(vcpu, KVM_RUN, NULL); } -void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); -struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_run_complete_io(struct kvm_vcpu *vcpu); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); -static inline void vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, - uint32_t cap, uint64_t arg0) +static inline void vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, + uint64_t arg0) { struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; - vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_cap); + vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); } -static inline void vcpu_guest_debug_set(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, struct kvm_guest_debug *debug) { - vcpu_ioctl(vm, vcpuid, KVM_SET_GUEST_DEBUG, debug); + vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); } -static inline void vcpu_mp_state_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_ioctl(vm, vcpuid, KVM_GET_MP_STATE, mp_state); + vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); } -static inline void vcpu_mp_state_set(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_ioctl(vm, vcpuid, KVM_SET_MP_STATE, mp_state); + vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); } -static inline void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_regs *regs) +static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_ioctl(vm, vcpuid, KVM_GET_REGS, regs); + vcpu_ioctl(vcpu, KVM_GET_REGS, regs); } -static inline void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_regs *regs) +static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_ioctl(vm, vcpuid, KVM_SET_REGS, regs); + vcpu_ioctl(vcpu, KVM_SET_REGS, regs); } -static inline void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs) +static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - vcpu_ioctl(vm, vcpuid, KVM_GET_SREGS, sregs); + vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); } -static inline void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs) +static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - vcpu_ioctl(vm, vcpuid, KVM_SET_SREGS, sregs); + vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); } -static inline int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs) +static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return __vcpu_ioctl(vm, vcpuid, KVM_SET_SREGS, sregs); + 
return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); } -static inline void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_fpu *fpu) +static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); + vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); } -static inline void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_fpu *fpu) +static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); + vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); } -static inline int __vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, - uint64_t reg_id, void *addr) +static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) { - struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)addr }; + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - return __vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg); + return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, &reg); } -static inline int __vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, - uint64_t reg_id, uint64_t val) +static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) { - struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)&val }; + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - return __vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg); + return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); } -static inline void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, - uint64_t reg_id, void *addr) +static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) { - struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)addr }; + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; - vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg); + vcpu_ioctl(vcpu, KVM_GET_ONE_REG, &reg); } -static inline void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, - uint64_t reg_id, uint64_t val) +static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) { - struct kvm_one_reg reg = { .id = reg_id, .addr = (uint64_t)&val }; + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; - vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg); + vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); } #ifdef __KVM_HAVE_VCPU_EVENTS -static inline void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_events_get(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_ioctl(vm, vcpuid, KVM_GET_VCPU_EVENTS, events); + vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); } -static inline void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_events_set(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_ioctl(vm, vcpuid, KVM_SET_VCPU_EVENTS, events); + vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); } #endif #ifdef __x86_64__ -static inline void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - vcpu_ioctl(vm, vcpuid, KVM_GET_NESTED_STATE, state); + vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); } -static inline int __vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, +static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - return __vcpu_ioctl(vm, vcpuid, KVM_SET_NESTED_STATE, state); + return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); } -static inline void vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu,
struct kvm_nested_state *state) { - vcpu_ioctl(vm, vcpuid, KVM_SET_NESTED_STATE, state); + vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); } #endif -static inline int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid) +static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) { - int fd = __vcpu_ioctl(vm, vcpuid, KVM_GET_STATS_FD, NULL); + int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_GET_STATS_FD, fd)); return fd; @@ -471,25 +460,42 @@ static inline void kvm_device_attr_set(int dev_fd, uint32_t group, TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); } -int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr); +static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + return __kvm_has_device_attr(vcpu->fd, group, attr); +} -static inline void vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, - uint32_t group, uint64_t attr) +static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) { - int ret = __vcpu_has_device_attr(vm, vcpuid, group, attr); + kvm_has_device_attr(vcpu->fd, group, attr); +} - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_HAS_DEVICE_ATTR, ret)); +static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_set(vcpu->fd, group, attr, val); } -int __vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val); -void vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val); -int __vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val); -void vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val); int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); int __kvm_create_device(struct kvm_vm *vm, uint64_t type); @@ -501,14 +507,13 @@ static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) return fd; } -void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); /* * VM VCPU Args Set * * Input Args: * vm - Virtual Machine - * vcpuid - VCPU ID * num - number of arguments * ... - arguments, each of type uint64_t * @@ -516,12 +521,12 @@ void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); * * Return: None * - * Sets the first @num function input registers of the VCPU with @vcpuid, - * per the C calling convention of the architecture, to the values given - * as variable args. Each of the variable args is expected to be of type - * uint64_t. The maximum @num can be is specific to the architecture. + * Sets the first @num input parameters for the function at @vcpu's entry point, + * per the C calling convention of the architecture, to the values given as + * variable args. Each of the variable args is expected to be of type uint64_t. 
+ * The maximum @num can be is specific to the architecture. */ -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); @@ -626,32 +631,15 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, memcpy(&(g), _p, sizeof(g)); \ }) -void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); - -/* - * VM VCPU Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * vcpuid - VCPU ID - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the current state of the VCPU specified by @vcpuid, within the VM - * given by @vm, to the FILE stream given by @stream. - */ +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); -void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent); -static inline void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { - vcpu_arch_dump(stream, vm, vcpuid, indent); + vcpu_arch_dump(stream, vcpu, indent); } /* @@ -659,7 +647,7 @@ static inline void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, * * Input Args: * vm - Virtual Machine - * vcpuid - The id of the VCPU to add to the VM. + * vcpu_id - The id of the VCPU to add to the VM. * guest_code - The vCPU's entry point */ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 9eecc9d40b79..98562f685151 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -26,7 +26,7 @@ struct ucall { void ucall_init(struct kvm_vm *vm, void *arg); void ucall_uninit(struct kvm_vm *vm); void ucall(uint64_t cmd, int nargs, ...); -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index cc5d14a45702..3c9260f8e116 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -241,7 +241,7 @@ struct hv_enlightened_vmcs { extern struct hv_enlightened_vmcs *current_evmcs; extern struct hv_vp_assist_page *current_vp_assist; -int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id); +int vcpu_enable_evmcs(struct kvm_vcpu *vcpu); static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) { diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a19a52c50608..32964d7b2218 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -422,9 +422,8 @@ static inline unsigned int x86_model(unsigned int eax) return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); } -struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_x86_state 
*state); +struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu); +void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state); void kvm_x86_state_cleanup(struct kvm_x86_state *state); const struct kvm_msr_list *kvm_get_msr_index_list(void); @@ -432,73 +431,71 @@ const struct kvm_msr_list *kvm_get_feature_msr_index_list(void); bool kvm_msr_is_in_save_restore_list(uint32_t msr_index); uint64_t kvm_get_feature_msr(uint64_t msr_index); -static inline void vcpu_msrs_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_msrs_get(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs) { - int r = __vcpu_ioctl(vm, vcpuid, KVM_GET_MSRS, msrs); + int r = __vcpu_ioctl(vcpu, KVM_GET_MSRS, msrs); TEST_ASSERT(r == msrs->nmsrs, "KVM_GET_MSRS failed, r: %i (failed on MSR %x)", r, r < 0 || r >= msrs->nmsrs ? -1 : msrs->entries[r].index); } -static inline void vcpu_msrs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_msrs *msrs) +static inline void vcpu_msrs_set(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs) { - int r = __vcpu_ioctl(vm, vcpuid, KVM_SET_MSRS, msrs); + int r = __vcpu_ioctl(vcpu, KVM_SET_MSRS, msrs); TEST_ASSERT(r == msrs->nmsrs, "KVM_GET_MSRS failed, r: %i (failed on MSR %x)", r, r < 0 || r >= msrs->nmsrs ? -1 : msrs->entries[r].index); } -static inline void vcpu_debugregs_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_debugregs_get(struct kvm_vcpu *vcpu, struct kvm_debugregs *debugregs) { - vcpu_ioctl(vm, vcpuid, KVM_GET_DEBUGREGS, debugregs); + vcpu_ioctl(vcpu, KVM_GET_DEBUGREGS, debugregs); } -static inline void vcpu_debugregs_set(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_debugregs_set(struct kvm_vcpu *vcpu, struct kvm_debugregs *debugregs) { - vcpu_ioctl(vm, vcpuid, KVM_SET_DEBUGREGS, debugregs); + vcpu_ioctl(vcpu, KVM_SET_DEBUGREGS, debugregs); } -static inline void vcpu_xsave_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_xsave_get(struct kvm_vcpu *vcpu, struct kvm_xsave *xsave) { - vcpu_ioctl(vm, vcpuid, KVM_GET_XSAVE, xsave); + vcpu_ioctl(vcpu, KVM_GET_XSAVE, xsave); } -static inline void vcpu_xsave2_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_xsave2_get(struct kvm_vcpu *vcpu, struct kvm_xsave *xsave) { - vcpu_ioctl(vm, vcpuid, KVM_GET_XSAVE2, xsave); + vcpu_ioctl(vcpu, KVM_GET_XSAVE2, xsave); } -static inline void vcpu_xsave_set(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_xsave_set(struct kvm_vcpu *vcpu, struct kvm_xsave *xsave) { - vcpu_ioctl(vm, vcpuid, KVM_SET_XSAVE, xsave); + vcpu_ioctl(vcpu, KVM_SET_XSAVE, xsave); } -static inline void vcpu_xcrs_get(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_xcrs_get(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) { - vcpu_ioctl(vm, vcpuid, KVM_GET_XCRS, xcrs); + vcpu_ioctl(vcpu, KVM_GET_XCRS, xcrs); } -static inline void vcpu_xcrs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_xcrs *xcrs) +static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) { - vcpu_ioctl(vm, vcpuid, KVM_SET_XCRS, xcrs); + vcpu_ioctl(vcpu, KVM_SET_XCRS, xcrs); } struct kvm_cpuid2 *kvm_get_supported_cpuid(void); -struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vcpu *vcpu); -static inline int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, +static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) { - return __vcpu_ioctl(vm, vcpuid, KVM_SET_CPUID2, cpuid); + return __vcpu_ioctl(vcpu, KVM_SET_CPUID2, cpuid); } -static 
inline void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, +static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) { - vcpu_ioctl(vm, vcpuid, KVM_SET_CPUID2, cpuid); + vcpu_ioctl(vcpu, KVM_SET_CPUID2, cpuid); } struct kvm_cpuid_entry2 * @@ -510,14 +507,13 @@ kvm_get_supported_cpuid_entry(uint32_t function) return kvm_get_supported_cpuid_index(function, 0); } -uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); -int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value); +uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index); +int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value); -static inline void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, - uint64_t msr_index, uint64_t msr_value) +static inline void vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, + uint64_t msr_value) { - int r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value); + int r = _vcpu_set_msr(vcpu, msr_index, msr_value); TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); } @@ -541,13 +537,14 @@ struct ex_regs { }; void vm_init_descriptor_tables(struct kvm_vm *vm); -void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); -uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr); -void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, - uint64_t pte); +uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr); +void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr, uint64_t pte); /* * get_cpuid() - find matching CPUID entry and return pointer to it. 
@@ -567,8 +564,8 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); -void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); -struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); +struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); void vm_xsave_req_perm(int bit); enum pg_level { diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index dfc3cf531ced..7f2ddc1535d7 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -174,7 +174,7 @@ static void vm_stats_test(struct kvm_vm *vm) static void vcpu_stats_test(struct kvm_vcpu *vcpu) { - int stats_fd = vcpu_get_stats_fd(vcpu->vm, vcpu->id); + int stats_fd = vcpu_get_stats_fd(vcpu); stats_test(stats_fd); close(stats_fd); diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index b577b5999c95..8706ae358444 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -184,7 +184,6 @@ static void guest_code(bool do_write) static void *vcpu_worker(void *data) { - struct kvm_vm *vm = test_args.vm; struct kvm_vcpu *vcpu = data; bool do_write = !(vcpu->id % 2); struct timespec start; @@ -192,7 +191,7 @@ static void *vcpu_worker(void *data) enum test_stage stage; int ret; - vcpu_args_set(vm, vcpu->id, 1, do_write); + vcpu_args_set(vcpu, 1, do_write); while (!READ_ONCE(host_quit)) { ret = sem_wait(&test_stage_updated); @@ -202,11 +201,11 @@ static void *vcpu_worker(void *data) return NULL; clock_gettime(CLOCK_MONOTONIC_RAW, &start); - ret = _vcpu_run(vm, vcpu->id); + ret = _vcpu_run(vcpu); ts_diff = timespec_elapsed(start); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - TEST_ASSERT(get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC, + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(vcpu->run->exit_reason)); diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index d158d5aa26e6..6bd27782f00c 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -212,9 +212,10 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } -void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init *init) +void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { struct kvm_vcpu_init default_init = { .target = -1, }; + struct kvm_vm *vm = vcpu->vm; uint64_t sctlr_el1, tcr_el1; if (!init) @@ -226,16 +227,16 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init init->target = preferred.target; } - vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, init); + vcpu_ioctl(vcpu, KVM_ARM_VCPU_INIT, init); /* * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 * registers, which the variable argument list macros do. 
*/ - vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); - vcpu_get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1); - vcpu_get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1); /* Configure base granule size */ switch (vm->mode) { @@ -296,19 +297,19 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; - vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); - vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); - vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); - vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), vm->pgd); - vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpuid); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), vm->pgd); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpu->id); } -void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { uint64_t pstate, pc; - vcpu_get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); - vcpu_get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pstate), &pstate); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", indent, "", pstate, pc); @@ -324,10 +325,10 @@ struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); - aarch64_vcpu_setup(vm, vcpu_id, init); + aarch64_vcpu_setup(vcpu, init); - vcpu_set_reg(vm, vcpu_id, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); - vcpu_set_reg(vm, vcpu_id, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); return vcpu; } @@ -338,7 +339,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, return aarch64_vcpu_add(vm, vcpu_id, NULL, guest_code); } -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { va_list ap; int i; @@ -349,8 +350,8 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
va_start(ap, num); for (i = 0; i < num; i++) { - vcpu_set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]), - va_arg(ap, uint64_t)); + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.regs[i]), + va_arg(ap, uint64_t)); } va_end(ap); @@ -363,11 +364,11 @@ void kvm_exit_unexpected_exception(int vector, uint64_t ec, bool valid_ec) ; } -void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) { struct ucall uc; - if (get_ucall(vm, vcpuid, &uc) != UCALL_UNHANDLED) + if (get_ucall(vcpu, &uc) != UCALL_UNHANDLED) return; if (uc.args[2]) /* valid_ec */ { @@ -385,11 +386,11 @@ struct handlers { handler_fn exception_handlers[VECTOR_NUM][ESR_EC_NUM]; }; -void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) { extern char vectors; - vcpu_set_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); } void route_exception(struct ex_regs *regs, int vector) diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 868ebab5369e..0b949ee06b5e 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -88,9 +88,9 @@ void ucall(uint64_t cmd, int nargs, ...) *ucall_exit_mmio_addr = (vm_vaddr_t)&uc; } -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { - struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_run *run = vcpu->run; struct ucall ucall = {}; if (uc) @@ -103,9 +103,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, "Unexpected ucall exit mmio address access"); memcpy(&gva, run->mmio.data, sizeof(gva)); - memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall)); + memcpy(&ucall, addr_gva2hva(vcpu->vm, gva), sizeof(ucall)); - vcpu_run_complete_io(vm, vcpu_id); + vcpu_run_complete_io(vcpu); if (uc) memcpy(uc, &ucall, sizeof(ucall)); } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 351c167e7b99..eb04b9c0a13c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1395,88 +1395,49 @@ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) return (void *) ((uintptr_t) region->host_alias + offset); } -/* - * VM Create IRQ Chip - * - * Input Args: - * vm - Virtual Machine - * - * Output Args: None - * - * Return: None - * - * Creates an interrupt controller chip for the VM specified by vm. - */ +/* Create an interrupt controller chip for the specified VM. */ void vm_create_irqchip(struct kvm_vm *vm) { vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); vm->has_irqchip = true; } - -/* - * VM VCPU State - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: - * Pointer to structure that describes the state of the VCPU. - * - * Locates and returns a pointer to a structure that describes the - * state of the VCPU with the given vcpuid. 
- */ -struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_run *vcpu_state(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); - return vcpu->run; } -/* - * VM VCPU Run - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: None - * - * Switch to executing the code for the VCPU given by vcpuid, within the VM - * given by vm. - */ -void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) -{ - int ret = _vcpu_run(vm, vcpuid); - TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret)); -} - -int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +int _vcpu_run(struct kvm_vcpu *vcpu) { int rc; do { - rc = __vcpu_run(vm, vcpuid); + rc = __vcpu_run(vcpu); } while (rc == -1 && errno == EINTR); - assert_on_unhandled_exception(vm, vcpuid); + assert_on_unhandled_exception(vcpu); return rc; } -void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) +/* + * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR. + * Assert if KVM returns an error (other than -EINTR). + */ +void vcpu_run(struct kvm_vcpu *vcpu) +{ + int ret = _vcpu_run(vcpu); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret)); +} + +void vcpu_run_complete_io(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); int ret; vcpu->run->immediate_exit = 1; - ret = __vcpu_run(vm, vcpuid); + ret = __vcpu_run(vcpu); vcpu->run->immediate_exit = 0; TEST_ASSERT(ret == -1 && errno == EINTR, @@ -1485,73 +1446,57 @@ void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) } /* - * VM VCPU Get Reg List - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: - * None - * - * Return: - * A pointer to an allocated struct kvm_reg_list - * * Get the list of guest registers which are supported for - * KVM_GET_ONE_REG/KVM_SET_ONE_REG calls + * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. Returns a kvm_reg_list pointer; + * it is the caller's responsibility to free the list.
*/ -struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) { struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; int ret; - ret = __vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, ®_list_n); + ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, ®_list_n); TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); + reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); reg_list->n = reg_list_n.n; - vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, reg_list); + vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list); return reg_list; } -int __vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, - unsigned long cmd, void *arg) +int __vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, void *arg) { - struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); - return ioctl(vcpu->fd, cmd, arg); } -void _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long cmd, - const char *name, void *arg) +void _vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, const char *name, + void *arg) { - int ret = __vcpu_ioctl(vm, vcpuid, cmd, arg); + int ret = __vcpu_ioctl(vcpu, cmd, arg); TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); } -void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid) +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); - uint32_t size = vm->dirty_ring_size; + uint32_t page_size = vcpu->vm->page_size; + uint32_t size = vcpu->vm->dirty_ring_size; TEST_ASSERT(size > 0, "Should enable dirty ring first"); if (!vcpu->dirty_gfns) { void *addr; - addr = mmap(NULL, size, PROT_READ, - MAP_PRIVATE, vcpu->fd, - vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private"); - addr = mmap(NULL, size, PROT_READ | PROT_EXEC, - MAP_PRIVATE, vcpu->fd, - vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec"); - addr = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, vcpu->fd, - vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed"); vcpu->dirty_gfns = addr; @@ -1636,36 +1581,6 @@ int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val) return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr); } -int __vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_get(vcpu_get(vm, vcpuid)->fd, group, attr, val); -} - -void vcpu_device_attr_get(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_get(vcpu_get(vm, vcpuid)->fd, group, attr, val); -} - -int __vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val) -{ - return __kvm_device_attr_set(vcpu_get(vm, vcpuid)->fd, group, attr, val); -} - -void vcpu_device_attr_set(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr, void *val) -{ - kvm_device_attr_set(vcpu_get(vm, vcpuid)->fd, group, attr, val); -} - -int __vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group, - uint64_t attr) -{ - return __kvm_has_device_attr(vcpu_get(vm, vcpuid)->fd, group, attr); -} - 
/* * IRQ related functions. */ @@ -1781,8 +1696,9 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump(stream, vm, indent + 4); } fprintf(stream, "%*sVCPUs:\n", indent, ""); + list_for_each_entry(vcpu, &vm->vcpus, list) - vcpu_dump(stream, vm, vcpu->id, indent + 2); + vcpu_dump(stream, vcpu, indent + 2); } /* Known KVM exit reasons */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 91a89553afdd..a8c7b2785cc4 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -98,7 +98,7 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, vcpu_args->gpa = pta->gpa; } - vcpu_args_set(vm, vcpus[i]->id, 1, i); + vcpu_args_set(vcpus[i], 1, i); pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", i, vcpu_args->gpa, vcpu_args->gpa + diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index edbdc7bef05b..604478151212 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -178,8 +178,9 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } -void riscv_vcpu_mmu_setup(struct kvm_vm *vm, int vcpuid) +void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu) { + struct kvm_vm *vm = vcpu->vm; unsigned long satp; /* @@ -198,46 +199,46 @@ void riscv_vcpu_mmu_setup(struct kvm_vm *vm, int vcpuid) satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; satp |= SATP_MODE_48; - vcpu_set_reg(vm, vcpuid, RISCV_CSR_REG(satp), satp); + vcpu_set_reg(vcpu, RISCV_CSR_REG(satp), satp); } -void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { struct kvm_riscv_core core; - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(mode), &core.mode); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.pc), &core.regs.pc); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.ra), &core.regs.ra); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.sp), &core.regs.sp); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.gp), &core.regs.gp); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.tp), &core.regs.tp); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t0), &core.regs.t0); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t1), &core.regs.t1); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t2), &core.regs.t2); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s0), &core.regs.s0); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s1), &core.regs.s1); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a0), &core.regs.a0); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a1), &core.regs.a1); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a2), &core.regs.a2); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a3), &core.regs.a3); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a4), &core.regs.a4); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a5), &core.regs.a5); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a6), &core.regs.a6); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a7), &core.regs.a7); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s2), &core.regs.s2); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s3), &core.regs.s3); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s4), &core.regs.s4); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s5), &core.regs.s5); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s6), &core.regs.s6); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s7), 
&core.regs.s7); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s8), &core.regs.s8); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s9), &core.regs.s9); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s10), &core.regs.s10); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s11), &core.regs.s11); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t3), &core.regs.t3); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t4), &core.regs.t4); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t5), &core.regs.t5); - vcpu_get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t6), &core.regs.t6); + vcpu_get_reg(vcpu, RISCV_CORE_REG(mode), &core.mode); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &core.regs.pc); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.ra), &core.regs.ra); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.sp), &core.regs.sp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.gp), &core.regs.gp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.tp), &core.regs.tp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t0), &core.regs.t0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t1), &core.regs.t1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t2), &core.regs.t2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s0), &core.regs.s0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s1), &core.regs.s1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a0), &core.regs.a0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a1), &core.regs.a1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a2), &core.regs.a2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a3), &core.regs.a3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a4), &core.regs.a4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a5), &core.regs.a5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a6), &core.regs.a6); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a7), &core.regs.a7); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s2), &core.regs.s2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s3), &core.regs.s3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s4), &core.regs.s4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s5), &core.regs.s5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s6), &core.regs.s6); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s7), &core.regs.s7); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s8), &core.regs.s8); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s9), &core.regs.s9); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s10), &core.regs.s10); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s11), &core.regs.s11); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t3), &core.regs.t3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t4), &core.regs.t4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t5), &core.regs.t5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t6), &core.regs.t6); fprintf(stream, " MODE: 0x%lx\n", core.mode); @@ -288,7 +289,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_vcpu *vcpu; vcpu = __vm_vcpu_add(vm, vcpu_id); - riscv_vcpu_mmu_setup(vm, vcpu_id); + riscv_vcpu_mmu_setup(vcpu); /* * With SBI HSM support in KVM RISC-V, all secondary VCPUs are @@ -296,28 +297,25 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, * are powered-on using KVM_SET_MP_STATE ioctl(). 
*/ mps.mp_state = KVM_MP_STATE_RUNNABLE; - r = __vcpu_ioctl(vm, vcpu_id, KVM_SET_MP_STATE, &mps); + r = __vcpu_ioctl(vcpu, KVM_SET_MP_STATE, &mps); TEST_ASSERT(!r, "IOCTL KVM_SET_MP_STATE failed (error %d)", r); /* Setup global pointer of guest to be same as the host */ asm volatile ( "add %0, gp, zero" : "=r" (current_gp) : : "memory"); - vcpu_set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.gp), current_gp); + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.gp), current_gp); /* Setup stack pointer and program counter of guest */ - vcpu_set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.sp), - stack_vaddr + stack_size); - vcpu_set_reg(vm, vcpu_id, RISCV_CORE_REG(regs.pc), - (unsigned long)guest_code); + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.sp), stack_vaddr + stack_size); + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code); /* Setup default exception vector of guest */ - vcpu_set_reg(vm, vcpu_id, RISCV_CSR_REG(stvec), - (unsigned long)guest_unexp_trap); + vcpu_set_reg(vcpu, RISCV_CSR_REG(stvec), (unsigned long)guest_unexp_trap); return vcpu; } -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { va_list ap; uint64_t id = RISCV_CORE_REG(regs.a0); @@ -355,12 +353,12 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) id = RISCV_CORE_REG(regs.a7); break; } - vcpu_set_reg(vm, vcpuid, id, va_arg(ap, uint64_t)); + vcpu_set_reg(vcpu, id, va_arg(ap, uint64_t)); } va_end(ap); } -void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) { } diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c index 48d91b77fa1d..087b9740bc8f 100644 --- a/tools/testing/selftests/kvm/lib/riscv/ucall.c +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -64,9 +64,9 @@ void ucall(uint64_t cmd, int nargs, ...) 
(vm_vaddr_t)&uc, 0, 0, 0, 0, 0); } -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { - struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_run *run = vcpu->run; struct ucall ucall = {}; if (uc) @@ -76,16 +76,17 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT) { switch (run->riscv_sbi.function_id) { case KVM_RISCV_SELFTESTS_SBI_UCALL: - memcpy(&ucall, addr_gva2hva(vm, - run->riscv_sbi.args[0]), sizeof(ucall)); + memcpy(&ucall, + addr_gva2hva(vcpu->vm, run->riscv_sbi.args[0]), + sizeof(ucall)); - vcpu_run_complete_io(vm, vcpu_id); + vcpu_run_complete_io(vcpu); if (uc) memcpy(uc, &ucall, sizeof(ucall)); break; case KVM_RISCV_SELFTESTS_SBI_UNEXP: - vcpu_dump(stderr, vm, vcpu_id, 2); + vcpu_dump(stderr, vcpu, 2); TEST_ASSERT(0, "Unexpected trap taken by guest"); break; default: diff --git a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c index 21c31fe10c1a..05283f8c9948 100644 --- a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c +++ b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c @@ -32,7 +32,7 @@ static uint64_t diag318_handler(void) uint64_t diag318_info; vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); run = vcpu->run; TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index f8170e97eeb7..89d7340d9cbd 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -173,23 +173,23 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, vcpu = __vm_vcpu_add(vm, vcpu_id); /* Setup guest registers */ - vcpu_regs_get(vm, vcpu_id, ®s); + vcpu_regs_get(vcpu, ®s); regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160; - vcpu_regs_set(vm, vcpu_id, ®s); + vcpu_regs_set(vcpu, ®s); - vcpu_sregs_get(vm, vcpu_id, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ - vcpu_sregs_set(vm, vcpu_id, &sregs); + vcpu_sregs_set(vcpu, &sregs); - run = vcpu_state(vm, vcpu_id); + run = vcpu->run; run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ run->psw_addr = (uintptr_t)guest_code; return vcpu; } -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { va_list ap; struct kvm_regs regs; @@ -200,23 +200,21 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
num); va_start(ap, num); - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); for (i = 0; i < num; i++) regs.gprs[i + 2] = va_arg(ap, uint64_t); - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vcpu, ®s); va_end(ap); } -void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { - struct kvm_vcpu *vcpu = vcpu_get(vm, vcpuid); - fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", indent, "", vcpu->run->psw_mask, vcpu->run->psw_addr); } -void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) { } diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index 665267c1135d..73dc4e21190f 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -33,9 +33,9 @@ void ucall(uint64_t cmd, int nargs, ...) asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory"); } -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { - struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_run *run = vcpu->run; struct ucall ucall = {}; if (uc) @@ -47,10 +47,10 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) (run->s390_sieic.ipb >> 16) == 0x501) { int reg = run->s390_sieic.ipa & 0xf; - memcpy(&ucall, addr_gva2hva(vm, run->s.regs.gprs[reg]), + memcpy(&ucall, addr_gva2hva(vcpu->vm, run->s.regs.gprs[reg]), sizeof(ucall)); - vcpu_run_complete_io(vm, vcpu_id); + vcpu_run_complete_io(vcpu); if (uc) memcpy(uc, &ucall, sizeof(ucall)); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f89d67101bf1..a54910adea98 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -212,8 +212,9 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); } -static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, - uint64_t vaddr) +static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, + struct kvm_vcpu *vcpu, + uint64_t vaddr) { uint16_t index[4]; uint64_t *pml4e, *pdpe, *pde; @@ -235,7 +236,7 @@ static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, * If IA32_EFER.NXE = 0 and the P flag of a paging-structure entry is 1, * the XD flag (bit 63) is reserved. 
*/ - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); if ((sregs.efer & EFER_NX) == 0) { rsvd_mask |= PTE_NX_MASK; } @@ -287,17 +288,18 @@ static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, return &pte[index[0]]; } -uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr) +uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr) { - uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); + uint64_t *pte = _vm_get_page_table_entry(vm, vcpu, vaddr); return *(uint64_t *)pte; } -void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, - uint64_t pte) +void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr, uint64_t pte) { - uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr); + uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpu, vaddr); *(uint64_t *)new_pte = pte; } @@ -546,12 +548,12 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, kvm_seg_fill_gdt_64bit(vm, segp); } -static void vcpu_setup(struct kvm_vm *vm, int vcpuid) +static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; /* Set mode specific system register values. */ - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.idt.limit = 0; @@ -575,7 +577,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid) } sregs.cr3 = vm->pgd; - vcpu_sregs_set(vm, vcpuid, &sregs); + vcpu_sregs_set(vcpu, &sregs); } #define CPUID_XFD_BIT (1 << 4) @@ -644,19 +646,19 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, DEFAULT_GUEST_STACK_VADDR_MIN); vcpu = __vm_vcpu_add(vm, vcpu_id); - vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); - vcpu_setup(vm, vcpu_id); + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_setup(vm, vcpu); /* Setup guest general purpose registers */ - vcpu_regs_get(vm, vcpu_id, ®s); + vcpu_regs_get(vcpu, ®s); regs.rflags = regs.rflags | 0x2; regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()); regs.rip = (unsigned long) guest_code; - vcpu_regs_set(vm, vcpu_id, ®s); + vcpu_regs_set(vcpu, ®s); /* Setup the MP state */ mp_state.mp_state = 0; - vcpu_mp_state_set(vm, vcpu_id, &mp_state); + vcpu_mp_state_set(vcpu, &mp_state); return vcpu; } @@ -742,20 +744,7 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) return buffer.entry.data; } -/* - * VM VCPU CPUID Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU id - * - * Output Args: None - * - * Return: KVM CPUID (KVM_GET_CPUID2) - * - * Set the VCPU's CPUID. 
- */ -struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid2 *cpuid; int max_ent; @@ -765,7 +754,7 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid) max_ent = cpuid->nent; for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) { - rc = __vcpu_ioctl(vm, vcpuid, KVM_GET_CPUID2, cpuid); + rc = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); if (!rc) break; @@ -812,7 +801,7 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) return entry; } -uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index) { struct { struct kvm_msrs header; @@ -822,13 +811,12 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - vcpu_msrs_get(vm, vcpuid, &buffer.header); + vcpu_msrs_get(vcpu, &buffer.header); return buffer.entry.data; } -int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value) +int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value) { struct { struct kvm_msrs header; @@ -840,10 +828,10 @@ int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, buffer.entry.index = msr_index; buffer.entry.data = msr_value; - return __vcpu_ioctl(vm, vcpuid, KVM_SET_MSRS, &buffer.header); + return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header); } -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { va_list ap; struct kvm_regs regs; @@ -853,7 +841,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) num); va_start(ap, num); - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); if (num >= 1) regs.rdi = va_arg(ap, uint64_t); @@ -873,23 +861,23 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
if (num >= 6) regs.r9 = va_arg(ap, uint64_t); - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vcpu, ®s); va_end(ap); } -void vcpu_arch_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { struct kvm_regs regs; struct kvm_sregs sregs; - fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid); + fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id); fprintf(stream, "%*sregs:\n", indent + 2, ""); - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); regs_dump(stream, ®s, indent + 4); fprintf(stream, "%*ssregs:\n", indent + 2, ""); - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs_dump(stream, &sregs, indent + 4); } @@ -959,21 +947,21 @@ bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) return false; } -static void vcpu_save_xsave_state(struct kvm_vm *vm, uint32_t vcpuid, +static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) { - int size = vm_check_cap(vm, KVM_CAP_XSAVE2); + int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2); if (size) { state->xsave = malloc(size); - vcpu_xsave2_get(vm, vcpuid, state->xsave); + vcpu_xsave2_get(vcpu, state->xsave); } else { state->xsave = malloc(sizeof(struct kvm_xsave)); - vcpu_xsave_get(vm, vcpuid, state->xsave); + vcpu_xsave_get(vcpu, state->xsave); } } -struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu) { const struct kvm_msr_list *msr_list = kvm_get_msr_index_list(); struct kvm_x86_state *state; @@ -994,24 +982,24 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) * kernel with KVM_RUN. Complete IO prior to migrating state * to a new VM. */ - vcpu_run_complete_io(vm, vcpuid); + vcpu_run_complete_io(vcpu); state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0])); - vcpu_events_get(vm, vcpuid, &state->events); - vcpu_mp_state_get(vm, vcpuid, &state->mp_state); - vcpu_regs_get(vm, vcpuid, &state->regs); - vcpu_save_xsave_state(vm, vcpuid, state); + vcpu_events_get(vcpu, &state->events); + vcpu_mp_state_get(vcpu, &state->mp_state); + vcpu_regs_get(vcpu, &state->regs); + vcpu_save_xsave_state(vcpu, state); if (kvm_check_cap(KVM_CAP_XCRS)) - vcpu_xcrs_get(vm, vcpuid, &state->xcrs); + vcpu_xcrs_get(vcpu, &state->xcrs); - vcpu_sregs_get(vm, vcpuid, &state->sregs); + vcpu_sregs_get(vcpu, &state->sregs); if (nested_size) { state->nested.size = sizeof(state->nested_); - vcpu_nested_state_get(vm, vcpuid, &state->nested); + vcpu_nested_state_get(vcpu, &state->nested); TEST_ASSERT(state->nested.size <= nested_size, "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", state->nested.size, nested_size); @@ -1022,29 +1010,29 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) state->msrs.nmsrs = msr_list->nmsrs; for (i = 0; i < msr_list->nmsrs; i++) state->msrs.entries[i].index = msr_list->indices[i]; - vcpu_msrs_get(vm, vcpuid, &state->msrs); + vcpu_msrs_get(vcpu, &state->msrs); - vcpu_debugregs_get(vm, vcpuid, &state->debugregs); + vcpu_debugregs_get(vcpu, &state->debugregs); return state; } -void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) +void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) { - vcpu_sregs_set(vm, vcpuid, &state->sregs); - vcpu_msrs_set(vm, vcpuid, &state->msrs); + vcpu_sregs_set(vcpu, &state->sregs); + vcpu_msrs_set(vcpu, &state->msrs); if (kvm_check_cap(KVM_CAP_XCRS)) 
- vcpu_xcrs_set(vm, vcpuid, &state->xcrs); + vcpu_xcrs_set(vcpu, &state->xcrs); - vcpu_xsave_set(vm, vcpuid, state->xsave); - vcpu_events_set(vm, vcpuid, &state->events); - vcpu_mp_state_set(vm, vcpuid, &state->mp_state); - vcpu_debugregs_set(vm, vcpuid, &state->debugregs); - vcpu_regs_set(vm, vcpuid, &state->regs); + vcpu_xsave_set(vcpu, state->xsave); + vcpu_events_set(vcpu, &state->events); + vcpu_mp_state_set(vcpu, &state->mp_state); + vcpu_debugregs_set(vcpu, &state->debugregs); + vcpu_regs_set(vcpu, &state->regs); if (state->nested.size) - vcpu_nested_state_set(vm, vcpuid, &state->nested); + vcpu_nested_state_set(vcpu, &state->nested); } void kvm_x86_state_cleanup(struct kvm_x86_state *state) @@ -1170,17 +1158,18 @@ void vm_init_descriptor_tables(struct kvm_vm *vm) DEFAULT_CODE_SELECTOR); } -void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) { + struct kvm_vm *vm = vcpu->vm; struct kvm_sregs sregs; - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.idt.base = vm->idt; sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; sregs.gdt.base = vm->gdt; sregs.gdt.limit = getpagesize() - 1; kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); - vcpu_sregs_set(vm, vcpuid, &sregs); + vcpu_sregs_set(vcpu, &sregs); *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; } @@ -1192,11 +1181,11 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, handlers[vector] = (vm_vaddr_t)handler; } -void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) { struct ucall uc; - if (get_ucall(vm, vcpuid, &uc) == UCALL_UNHANDLED) { + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) { uint64_t vector = uc.args[0]; TEST_FAIL("Unexpected vectored event in guest (vector:0x%lx)", @@ -1267,7 +1256,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) return cpuid; } -void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid) +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 *cpuid_full; struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; @@ -1299,16 +1288,16 @@ void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid) cpuid_full->nent = nent + cpuid_hv->nent; } - vcpu_set_cpuid(vm, vcpuid, cpuid_full); + vcpu_set_cpuid(vcpu, cpuid_full); } -struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 *cpuid; cpuid = allocate_kvm_cpuid2(); - vcpu_ioctl(vm, vcpuid, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); return cpuid; } diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index 2ea31a0ebe30..e5f0f9e0d3ee 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -35,9 +35,9 @@ void ucall(uint64_t cmd, int nargs, ...) 
: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory"); } -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { - struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_run *run = vcpu->run; struct ucall ucall = {}; if (uc) @@ -46,11 +46,11 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { struct kvm_regs regs; - vcpu_regs_get(vm, vcpu_id, ®s); - memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), + vcpu_regs_get(vcpu, ®s); + memcpy(&ucall, addr_gva2hva(vcpu->vm, (vm_vaddr_t)regs.rdi), sizeof(ucall)); - vcpu_run_complete_io(vm, vcpu_id); + vcpu_run_complete_io(vcpu); if (uc) memcpy(uc, &ucall, sizeof(ucall)); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index a12c0e1224af..2cbfe9962fb1 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -42,11 +42,11 @@ struct eptPageTablePointer { uint64_t address:40; uint64_t reserved_63_52:12; }; -int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) +int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; - vcpu_enable_cap(vm, vcpu_id, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, (unsigned long)&evmcs_ver); /* KVM should return supported EVMCS version range */ diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index d59918d5cbe2..8f34c5aca420 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -51,10 +51,10 @@ static void rendezvous_with_boss(void) } } -static void run_vcpu(struct kvm_vm *vm, uint32_t vcpu_id) +static void run_vcpu(struct kvm_vcpu *vcpu) { - vcpu_run(vm, vcpu_id); - ASSERT_EQ(get_ucall(vm, vcpu_id, NULL), UCALL_DONE); + vcpu_run(vcpu); + ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); } static void *vcpu_worker(void *data) @@ -65,25 +65,25 @@ static void *vcpu_worker(void *data) struct kvm_sregs sregs; struct kvm_regs regs; - vcpu_args_set(vm, vcpu->id, 3, info->start_gpa, info->end_gpa, + vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm_get_page_size(vm)); /* Snapshot regs before the first run. */ - vcpu_regs_get(vm, vcpu->id, ®s); + vcpu_regs_get(vcpu, ®s); rendezvous_with_boss(); - run_vcpu(vm, vcpu->id); + run_vcpu(vcpu); rendezvous_with_boss(); - vcpu_regs_set(vm, vcpu->id, ®s); - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_regs_set(vcpu, ®s); + vcpu_sregs_get(vcpu, &sregs); #ifdef __x86_64__ /* Toggle CR0.WP to trigger a MMU context reset. 
*/ sregs.cr0 ^= X86_CR0_WP; #endif - vcpu_sregs_set(vm, vcpu->id, &sregs); + vcpu_sregs_set(vcpu, &sregs); rendezvous_with_boss(); - run_vcpu(vm, vcpu->id); + run_vcpu(vcpu); rendezvous_with_boss(); return NULL; diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index a3efb3182119..1f9036cdcaa9 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -39,7 +39,6 @@ static bool run_vcpus = true; static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; - struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; int ret; @@ -47,10 +46,10 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) /* Let the guest access its memory until a stop signal is received */ while (READ_ONCE(run_vcpus)) { - ret = _vcpu_run(vm, vcpu->id); + ret = _vcpu_run(vcpu); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - if (get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC) + if (get_ucall(vcpu, NULL) == UCALL_SYNC) continue; TEST_ASSERT(false, diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 009eb19b28af..5f98489e4f4d 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -146,9 +146,9 @@ static void *vcpu_worker(void *__data) struct ucall uc; while (1) { - vcpu_run(data->vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(data->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == 0, "Unexpected sync ucall, got %lx", diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index fd754de0b74c..68c0c8bb206e 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -233,8 +233,8 @@ int main(int argc, char *argv[]) pthread_create(&migration_thread, NULL, migration_worker, 0); for (i = 0; !done; i++) { - vcpu_run(vm, vcpu->id); - TEST_ASSERT(get_ucall(vm, vcpu->id, NULL) == UCALL_SYNC, + vcpu_run(vcpu); + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, "Guest failed?"); /* diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 703bf137d6ac..22b0bb785591 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -152,7 +152,7 @@ static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo) if (!vcpu) vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo); else - vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_MEM_OP, ksmo); + vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo); } static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo) @@ -162,7 +162,7 @@ static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo) if (!vcpu) return __vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo); else - return __vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_MEM_OP, ksmo); + return __vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo); } #define MEMOP(err, info_p, mop_target_p, access_mode_p, buf_p, size_p, ...) 
\ @@ -250,8 +250,8 @@ enum stage { struct ucall uc; \ int __stage = (stage); \ \ - vcpu_run(__vcpu->vm, __vcpu->id); \ - get_ucall(__vcpu->vm, __vcpu->id, &uc); \ + vcpu_run(__vcpu); \ + get_ucall(__vcpu, &uc); \ ASSERT_EQ(uc.cmd, UCALL_SYNC); \ ASSERT_EQ(uc.args[1], __stage); \ }) \ diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index b2375d81571c..633ba9055231 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -61,7 +61,7 @@ static void test_one_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t value) { uint64_t eval_reg; - vcpu_get_reg(vcpu->vm, vcpu->id, id, &eval_reg); + vcpu_get_reg(vcpu, id, &eval_reg); TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); } @@ -72,7 +72,7 @@ static void assert_noirq(struct kvm_vcpu *vcpu) irq_state.len = sizeof(buf); irq_state.buf = (unsigned long)buf; - irqs = __vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_GET_IRQ_STATE, &irq_state); + irqs = __vcpu_ioctl(vcpu, KVM_S390_GET_IRQ_STATE, &irq_state); /* * irqs contains the number of retrieved interrupts. Any interrupt * (notably, the emergency call interrupt we have injected) should @@ -89,13 +89,13 @@ static void assert_clear(struct kvm_vcpu *vcpu) struct kvm_regs regs; struct kvm_fpu fpu; - vcpu_regs_get(vcpu->vm, vcpu->id, ®s); + vcpu_regs_get(vcpu, ®s); TEST_ASSERT(!memcmp(®s.gprs, regs_null, sizeof(regs.gprs)), "grs == 0"); - vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0"); - vcpu_fpu_get(vcpu->vm, vcpu->id, &fpu); + vcpu_fpu_get(vcpu, &fpu); TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); /* sync regs */ @@ -133,7 +133,7 @@ static void assert_initial(struct kvm_vcpu *vcpu) struct kvm_fpu fpu; /* KVM_GET_SREGS */ - vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)"); TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, "cr14 == 0xC2000000 (KVM_GET_SREGS)"); @@ -159,7 +159,7 @@ static void assert_initial(struct kvm_vcpu *vcpu) TEST_ASSERT(vcpu->run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); TEST_ASSERT(vcpu->run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); - vcpu_fpu_get(vcpu->vm, vcpu->id, &fpu); + vcpu_fpu_get(vcpu, &fpu); TEST_ASSERT(!fpu.fpc, "fpc == 0"); test_one_reg(vcpu, KVM_REG_S390_GBEA, 1); @@ -198,7 +198,7 @@ static void inject_irq(struct kvm_vcpu *vcpu) irq_state.buf = (unsigned long)buf; irq->type = KVM_S390_INT_EMERGENCY; irq->u.emerg.code = vcpu->id; - irqs = __vcpu_ioctl(vcpu->vm, vcpu->id, KVM_S390_SET_IRQ_STATE, &irq_state); + irqs = __vcpu_ioctl(vcpu, KVM_S390_SET_IRQ_STATE, &irq_state); TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno); } @@ -221,11 +221,11 @@ static void test_normal(void) ksft_print_msg("Testing normal reset\n"); vm = create_vm(&vcpu); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); inject_irq(vcpu); - vcpu_ioctl(vm, vcpu->id, KVM_S390_NORMAL_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_NORMAL_RESET, 0); /* must clears */ assert_normal(vcpu); @@ -244,11 +244,11 @@ static void test_initial(void) ksft_print_msg("Testing initial reset\n"); vm = create_vm(&vcpu); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); inject_irq(vcpu); - vcpu_ioctl(vm, vcpu->id, KVM_S390_INITIAL_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_INITIAL_RESET, 0); /* must clears */ assert_normal(vcpu); @@ -267,11 +267,11 @@ static void test_clear(void) 
ksft_print_msg("Testing clear reset\n"); vm = create_vm(&vcpu); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); inject_irq(vcpu); - vcpu_ioctl(vm, vcpu->id, KVM_S390_CLEAR_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_CLEAR_RESET, 0); /* must clears */ assert_normal(vcpu); diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 7c4143141ca7..4b2eb5edde5e 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -80,14 +80,14 @@ void test_read_invalid(struct kvm_vcpu *vcpu) /* Request reading invalid register set from VCPU. */ run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vcpu->vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu->vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); @@ -101,14 +101,14 @@ void test_set_invalid(struct kvm_vcpu *vcpu) /* Request setting invalid register set into VCPU. */ run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vcpu->vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu->vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); @@ -124,7 +124,7 @@ void test_req_and_verify_all_valid_regs(struct kvm_vcpu *vcpu) /* Request and verify all valid register sets. 
*/ run->kvm_valid_regs = TEST_SYNC_FIELDS; - rv = _vcpu_run(vcpu->vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, "Unexpected exit reason: %u (%s)\n", @@ -137,10 +137,10 @@ void test_req_and_verify_all_valid_regs(struct kvm_vcpu *vcpu) run->s390_sieic.icptcode, run->s390_sieic.ipa, run->s390_sieic.ipb); - vcpu_regs_get(vcpu->vm, vcpu->id, ®s); + vcpu_regs_get(vcpu, ®s); compare_regs(®s, &run->s.regs); - vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs); } @@ -163,7 +163,7 @@ void test_set_and_verify_various_reg_values(struct kvm_vcpu *vcpu) run->kvm_dirty_regs |= KVM_SYNC_DIAG318; } - rv = _vcpu_run(vcpu->vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, "Unexpected exit reason: %u (%s)\n", @@ -179,10 +179,10 @@ void test_set_and_verify_various_reg_values(struct kvm_vcpu *vcpu) "diag318 sync regs value incorrect 0x%llx.", run->s.regs.diag318); - vcpu_regs_get(vcpu->vm, vcpu->id, ®s); + vcpu_regs_get(vcpu, ®s); compare_regs(®s, &run->s.regs); - vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs); } @@ -198,7 +198,7 @@ void test_clear_kvm_dirty_regs_bits(struct kvm_vcpu *vcpu) run->kvm_dirty_regs = 0; run->s.regs.gprs[11] = 0xDEADBEEF; run->s.regs.diag318 = 0x4B1D; - rv = _vcpu_run(vcpu->vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, "Unexpected exit reason: %u (%s)\n", diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index a82a8d1dc6c8..015a13056503 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -187,8 +187,8 @@ static void guest_code(void) struct ucall uc; \ int __stage = (stage); \ \ - vcpu_run(__vcpu->vm, __vcpu->id); \ - get_ucall(__vcpu->vm, __vcpu->id, &uc); \ + vcpu_run(__vcpu); \ + get_ucall(__vcpu, &uc); \ if (uc.cmd == UCALL_ABORT) { \ TEST_FAIL("line %lu: %s, hints: %lu, %lu", uc.args[1], \ (const char *)uc.args[0], uc.args[2], uc.args[3]); \ diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index d832fc12984e..47b219dd60e4 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -63,10 +63,10 @@ static void *vcpu_worker(void *data) * has been deleted or while it is being moved . 
*/ while (1) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); if (run->exit_reason == KVM_EXIT_IO) { - cmd = get_ucall(vcpu->vm, vcpu->id, &uc); + cmd = get_ucall(vcpu, &uc); if (cmd != UCALL_SYNC) break; @@ -291,7 +291,7 @@ static void test_delete_memory_region(void) run->exit_reason == KVM_EXIT_INTERNAL_ERROR, "Unexpected exit reason = %d", run->exit_reason); - vcpu_regs_get(vm, vcpu->id, ®s); + vcpu_regs_get(vcpu, ®s); /* * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already, @@ -318,7 +318,7 @@ static void test_zero_memory_regions(void) vcpu = __vm_vcpu_add(vm, 0); vm_ioctl(vm, KVM_SET_NR_MMU_PAGES, (void *)64ul); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); run = vcpu->run; TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 7a6645464925..398819d4074f 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -73,11 +73,11 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); sync_global_to_guest(vcpu->vm, st_gva[i]); - ret = _vcpu_set_msr(vcpu->vm, vcpu->id, MSR_KVM_STEAL_TIME, + ret = _vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); - vcpu_set_msr(vcpu->vm, vcpu->id, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); + vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); } static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) @@ -163,7 +163,7 @@ static bool is_steal_time_supported(struct kvm_vcpu *vcpu) .attr = KVM_ARM_VCPU_PVTIME_IPA, }; - return !__vcpu_ioctl(vcpu->vm, vcpu->id, KVM_HAS_DEVICE_ATTR, &dev); + return !__vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); } static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) @@ -178,20 +178,20 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) .addr = (uint64_t)&st_ipa, }; - vcpu_ioctl(vm, vcpu->id, KVM_HAS_DEVICE_ATTR, &dev); + vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); /* ST_GPA_BASE is identity mapped */ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); sync_global_to_guest(vm, st_gva[i]); st_ipa = (ulong)st_gva[i] | 1; - ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_DEVICE_ATTR, &dev); + ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); st_ipa = (ulong)st_gva[i]; - vcpu_ioctl(vm, vcpu->id, KVM_SET_DEVICE_ATTR, &dev); + vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); - ret = __vcpu_ioctl(vm, vcpu->id, KVM_SET_DEVICE_ATTR, &dev); + ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); } @@ -227,9 +227,9 @@ static void run_vcpu(struct kvm_vcpu *vcpu) { struct ucall uc; - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: case UCALL_DONE: break; @@ -280,7 +280,7 @@ int main(int ac, char **av) for (i = 0; i < NR_VCPUS; ++i) { steal_time_init(vcpus[i], i); - vcpu_args_set(vm, vcpus[i]->id, 1, i); + vcpu_args_set(vcpus[i], 1, i); /* First VCPU run initializes steal-time */ run_vcpu(vcpus[i]); diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index 0690ce0ae4fa..7c8be0930737 100644 --- 
a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -28,8 +28,7 @@ static struct test_case test_cases[] = { static void check_preconditions(struct kvm_vcpu *vcpu) { - if (!__vcpu_has_device_attr(vcpu->vm, vcpu->id, KVM_VCPU_TSC_CTRL, - KVM_VCPU_TSC_OFFSET)) + if (!__vcpu_has_device_attr(vcpu, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET)) return; print_skip("KVM_VCPU_TSC_OFFSET not supported; skipping test"); @@ -38,8 +37,8 @@ static void check_preconditions(struct kvm_vcpu *vcpu) static void setup_system_counter(struct kvm_vcpu *vcpu, struct test_case *test) { - vcpu_device_attr_set(vcpu->vm, vcpu->id, KVM_VCPU_TSC_CTRL, - KVM_VCPU_TSC_OFFSET, &test->tsc_offset); + vcpu_device_attr_set(vcpu, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET, + &test->tsc_offset); } static uint64_t guest_read_system_counter(struct test_case *test) @@ -101,10 +100,10 @@ static void enter_guest(struct kvm_vcpu *vcpu) setup_system_counter(vcpu, test); start = host_read_guest_system_counter(test); - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); end = host_read_guest_system_counter(test); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: handle_sync(&uc, start, end); break; @@ -113,7 +112,7 @@ static void enter_guest(struct kvm_vcpu *vcpu) return; default: TEST_ASSERT(0, "unhandled ucall %ld\n", - get_ucall(vcpu->vm, vcpu->id, &uc)); + get_ucall(vcpu, &uc)); } } } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 7755fe8fcffb..b421c8369dba 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -351,11 +351,11 @@ int main(int argc, char *argv[]) } run = vcpu->run; - vcpu_regs_get(vm, vcpu->id, ®s1); + vcpu_regs_get(vcpu, ®s1); /* Register #NM handler */ vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); /* amx cfg for guest_code */ @@ -369,16 +369,16 @@ int main(int argc, char *argv[]) /* xsave data for guest_code */ xsavedata = vm_vaddr_alloc_pages(vm, 3); memset(addr_gva2hva(vm, xsavedata), 0, 3 * getpagesize()); - vcpu_args_set(vm, vcpu->id, 3, amx_cfg, tiledata, xsavedata); + vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xsavedata); for (stage = 1; ; stage++) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); @@ -403,7 +403,7 @@ int main(int argc, char *argv[]) * size subtract 8K amx size. */ amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; - state = vcpu_save_state(vm, vcpu->id); + state = vcpu_save_state(vcpu); void *amx_start = (void *)state->xsave + amx_offset; void *tiles_data = (void *)addr_gva2hva(vm, tiledata); /* Only check TMM0 register, 1 tile */ @@ -424,21 +424,21 @@ int main(int argc, char *argv[]) TEST_FAIL("Unknown ucall %lu", uc.cmd); } - state = vcpu_save_state(vm, vcpu->id); + state = vcpu_save_state(vcpu); memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, vcpu->id, ®s1); + vcpu_regs_get(vcpu, ®s1); kvm_vm_release(vm); /* Restore state in a new VM. 
*/ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); - vcpu_load_state(vm, vcpu->id, state); + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, vcpu->id, ®s2); + vcpu_regs_get(vcpu, ®s2); TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 76cdd0d10757..4aa784932597 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -120,9 +120,9 @@ static void run_vcpu(struct kvm_vcpu *vcpu, int stage) { struct ucall uc; - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && uc.args[1] == stage + 1, @@ -159,14 +159,14 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) u32 eax, ebx, x; /* Setting unmodified CPUID is allowed */ - rc = __vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); + rc = __vcpu_set_cpuid(vcpu, cpuid); TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); /* Changing CPU features is forbidden */ ent = get_cpuid(cpuid, 0x7, 0); ebx = ent->ebx; ent->ebx--; - rc = __vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); + rc = __vcpu_set_cpuid(vcpu, cpuid); TEST_ASSERT(rc, "Changing CPU features should fail"); ent->ebx = ebx; @@ -175,7 +175,7 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) eax = ent->eax; x = eax & 0xff; ent->eax = (eax & ~0xffu) | (x - 1); - rc = __vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); + rc = __vcpu_set_cpuid(vcpu, cpuid); TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); ent->eax = eax; } @@ -191,13 +191,13 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, guest_main); supp_cpuid = kvm_get_supported_cpuid(); - cpuid2 = vcpu_get_cpuid(vm, vcpu->id); + cpuid2 = vcpu_get_cpuid(vcpu); compare_cpuids(supp_cpuid, cpuid2); vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2); - vcpu_args_set(vm, vcpu->id, 1, cpuid_gva); + vcpu_args_set(vcpu, 1, cpuid_gva); for (stage = 0; stage < 3; stage++) run_vcpu(vcpu, stage); diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index d5615cd0b81b..1635aae970e9 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -82,19 +82,19 @@ int main(int argc, char *argv[]) run = vcpu->run; while (1) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: /* emulate hypervisor clearing CR4.OSXSAVE */ - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.cr4 &= ~X86_CR4_OSXSAVE; - vcpu_sregs_set(vm, vcpu->id, &sregs); + vcpu_sregs_set(vcpu, &sregs); break; case UCALL_ABORT: TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index 3cc25714d703..c16799b616e0 
100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -70,9 +70,9 @@ static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len) { struct kvm_regs regs; - vcpu_regs_get(vcpu->vm, vcpu->id, ®s); + vcpu_regs_get(vcpu, ®s); regs.rip += insn_len; - vcpu_regs_set(vcpu->vm, vcpu->id, ®s); + vcpu_regs_set(vcpu, ®s); } int main(void) @@ -106,8 +106,8 @@ int main(void) /* Test software BPs - int3 */ memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; - vcpu_guest_debug_set(vm, vcpu->id, &debug); - vcpu_run(vm, vcpu->id); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == BP_VECTOR && run->debug.arch.pc == CAST_TO_RIP(sw_bp), @@ -122,8 +122,8 @@ int main(void) debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp); debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1)); - vcpu_guest_debug_set(vm, vcpu->id, &debug); - vcpu_run(vm, vcpu->id); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); target_dr6 = 0xffff0ff0 | (1UL << i); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -145,8 +145,8 @@ int main(void) debug.arch.debugreg[i] = CAST_TO_RIP(guest_value); debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) | (0x000d0000UL << (4*i)); - vcpu_guest_debug_set(vm, vcpu->id, &debug); - vcpu_run(vm, vcpu->id); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); target_dr6 = 0xffff0ff0 | (1UL << i); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -172,8 +172,8 @@ int main(void) debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_BLOCKIRQ; debug.arch.debugreg[7] = 0x00000400; - vcpu_guest_debug_set(vm, vcpu->id, &debug); - vcpu_run(vm, vcpu->id); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && run->debug.arch.pc == target_rip && @@ -189,8 +189,8 @@ int main(void) memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[7] = 0x400 | DR7_GD; - vcpu_guest_debug_set(vm, vcpu->id, &debug); - vcpu_run(vm, vcpu->id); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); target_dr6 = 0xffff0ff0 | DR6_BD; TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -204,11 +204,11 @@ int main(void) /* Disable all debug controls, run to the end */ memset(&debug, 0, sizeof(debug)); - vcpu_guest_debug_set(vm, vcpu->id, &debug); + vcpu_guest_debug_set(vcpu, &debug); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO"); - cmd = get_ucall(vm, vcpu->id, &uc); + cmd = get_ucall(vcpu, &uc); TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE"); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index 08a95dab3a6b..fb2a2390b4af 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -83,9 +83,9 @@ static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu) * contained an flds instruction that is 2-bytes in * length (ie: no prefix, no SIB, no displacement). 
*/ - vcpu_regs_get(vcpu->vm, vcpu->id, &regs); + vcpu_regs_get(vcpu, &regs); regs.rip += 2; - vcpu_regs_set(vcpu->vm, vcpu->id, &regs); + vcpu_regs_set(vcpu, &regs); } } } @@ -101,7 +101,7 @@ static void check_for_guest_assert(struct kvm_vcpu *vcpu) struct ucall uc; if (vcpu->run->exit_reason == KVM_EXIT_IO && - get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) { + get_ucall(vcpu, &uc) == UCALL_ABORT) { do_guest_assert(&uc); } } @@ -118,7 +118,7 @@ static void process_ucall_done(struct kvm_vcpu *vcpu) run->exit_reason, exit_reason_str(run->exit_reason)); - TEST_ASSERT(get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_DONE, + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE, "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", uc.cmd, UCALL_DONE); } @@ -133,7 +133,7 @@ static uint64_t process_ucall(struct kvm_vcpu *vcpu) run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: break; case UCALL_ABORT: @@ -175,7 +175,7 @@ int main(int argc, char *argv[]) entry->eax = (entry->eax & 0xffffff00) | MAXPHYADDR; set_cpuid(cpuid, entry); - vcpu_set_cpuid(vm, vcpu->id, cpuid); + vcpu_set_cpuid(vcpu, cpuid); rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); @@ -190,12 +190,12 @@ int main(int argc, char *argv[]) virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); - pte = vm_get_page_table_entry(vm, vcpu->id, MEM_REGION_GVA); - vm_set_page_table_entry(vm, vcpu->id, MEM_REGION_GVA, pte | (1ull << 36)); + pte = vm_get_page_table_entry(vm, vcpu, MEM_REGION_GVA); + vm_set_page_table_entry(vm, vcpu, MEM_REGION_GVA, pte | (1ull << 36)); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); process_exit_on_emulation_error(vcpu); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(process_ucall(vcpu) == UCALL_DONE, "Expected UCALL_DONE"); diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index ba39042a5d96..6c4e728d2d85 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -161,12 +161,12 @@ void inject_nmi(struct kvm_vcpu *vcpu) { struct kvm_vcpu_events events; - vcpu_events_get(vcpu->vm, vcpu->id, &events); + vcpu_events_get(vcpu, &events); events.nmi.pending = 1; events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; - vcpu_events_set(vcpu->vm, vcpu->id, &events); + vcpu_events_set(vcpu, &events); } static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, @@ -175,21 +175,21 @@ static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, struct kvm_regs regs1, regs2; struct kvm_x86_state *state; - state = vcpu_save_state(vm, vcpu->id); + state = vcpu_save_state(vcpu); memset(&regs1, 0, sizeof(regs1)); - vcpu_regs_get(vm, vcpu->id, &regs1); + vcpu_regs_get(vcpu, &regs1); kvm_vm_release(vm); /* Restore state in a new VM.
*/ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_hv_cpuid(vm, vcpu->id); - vcpu_enable_evmcs(vm, vcpu->id); - vcpu_load_state(vm, vcpu->id, state); + vcpu_set_hv_cpuid(vcpu); + vcpu_enable_evmcs(vcpu); + vcpu_load_state(vcpu, state); kvm_x86_state_cleanup(state); memset(&regs2, 0, sizeof(regs2)); - vcpu_regs_get(vm, vcpu->id, &regs2); + vcpu_regs_get(vcpu, &regs2); TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); @@ -215,14 +215,14 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vcpu_set_hv_cpuid(vm, vcpu->id); - vcpu_enable_evmcs(vm, vcpu->id); + vcpu_set_hv_cpuid(vcpu); + vcpu_enable_evmcs(vcpu); vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); @@ -231,13 +231,13 @@ int main(int argc, char *argv[]) for (stage = 1;; stage++) { run = vcpu->run; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index 108c3f75361d..137759547720 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -95,7 +95,7 @@ static void guest_main(void) static void setup_ud_vector(struct kvm_vcpu *vcpu) { vm_init_descriptor_tables(vcpu->vm); - vcpu_init_descriptor_tables(vcpu->vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); } @@ -104,8 +104,8 @@ static void enter_guest(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vcpu->vm, vcpu->id); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]); break; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index 4f8f3999de2d..f7a9e29ff0c7 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -178,16 +178,16 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vcpu *vcpu) u64 tsc_freq, r1, r2, t1, t2; s64 delta_ns; - tsc_freq = vcpu_get_msr(vcpu->vm, vcpu->id, HV_X64_MSR_TSC_FREQUENCY); + tsc_freq = vcpu_get_msr(vcpu, HV_X64_MSR_TSC_FREQUENCY); TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); /* For increased accuracy, take mean rdtsc() before and afrer ioctl */ r1 = rdtsc(); - t1 = vcpu_get_msr(vcpu->vm, vcpu->id, HV_X64_MSR_TIME_REF_COUNT); + t1 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); r1 = (r1 + rdtsc()) / 2; nop_loop(); r2 = rdtsc(); - t2 = vcpu_get_msr(vcpu->vm, vcpu->id, HV_X64_MSR_TIME_REF_COUNT); + t2 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); r2 = (r2 + rdtsc()) / 2; TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); @@
-215,24 +215,24 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, guest_main); run = vcpu->run; - vcpu_set_hv_cpuid(vm, vcpu->id); + vcpu_set_hv_cpuid(vcpu); tsc_page_gva = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, "TSC page has to be page aligned\n"); - vcpu_args_set(vm, vcpu->id, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); + vcpu_args_set(vcpu, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); host_check_tsc_msr_rdtsc(vcpu); for (stage = 1;; stage++) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index d1a22ee98cf3..af13c48f0f30 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -119,7 +119,7 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) int ret; if (vcpu) - ret = __vcpu_ioctl(vm, vcpu->id, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + ret = __vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); else ret = __kvm_ioctl(vm_get_kvm_fd(vm), KVM_GET_SUPPORTED_HV_CPUID, &cpuid); @@ -147,7 +147,7 @@ int main(int argc, char *argv[]) /* Test vCPU ioctl version */ test_hv_cpuid_e2big(vm, vcpu); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, vcpu->id); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); test_hv_cpuid(hv_cpuid_entries, false); free(hv_cpuid_entries); @@ -156,8 +156,8 @@ int main(int argc, char *argv[]) print_skip("Enlightened VMCS is unsupported"); goto do_sys; } - vcpu_enable_evmcs(vm, vcpu->id); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, vcpu->id); + vcpu_enable_evmcs(vcpu); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); test_hv_cpuid(hv_cpuid_entries, true); free(hv_cpuid_entries); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index d0bd9d5e8a99..d5f37495ade8 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -161,7 +161,7 @@ static void hv_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, "failed to set HYPERV_CPUID_ENLIGHTMENT_INFO leaf"); TEST_ASSERT(set_cpuid(cpuid, dbg), "failed to set HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES leaf"); - vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); + vcpu_set_cpuid(vcpu, cpuid); } static void guest_test_msrs_access(void) @@ -191,15 +191,15 @@ static void guest_test_msrs_access(void) memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); msr = addr_gva2hva(vm, msr_gva); - vcpu_args_set(vm, vcpu->id, 1, msr_gva); - vcpu_enable_cap(vm, vcpu->id, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + vcpu_args_set(vcpu, 1, msr_gva); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - vcpu_set_hv_cpuid(vm, vcpu->id); + vcpu_set_hv_cpuid(vcpu); best = kvm_get_supported_hv_cpuid(); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); run = vcpu->run; @@ -333,7 +333,7 @@ static void guest_test_msrs_access(void) * 
Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 * capability enabled and guest visible CPUID bit unset. */ - vcpu_enable_cap(vm, vcpu->id, KVM_CAP_HYPERV_SYNIC2, 0); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); break; case 22: feat.eax |= HV_MSR_SYNIC_AVAILABLE; @@ -471,12 +471,12 @@ static void guest_test_msrs_access(void) else pr_debug("Stage %d: finish\n", stage); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == 0, "Unexpected stage: %ld (0 expected)\n", @@ -520,7 +520,7 @@ static void guest_test_hcalls_access(void) vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); /* Hypercall input/output */ @@ -531,10 +531,10 @@ static void guest_test_hcalls_access(void) hcall_params = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); - vcpu_args_set(vm, vcpu->id, 2, addr_gva2gpa(vm, hcall_page), hcall_params); - vcpu_enable_cap(vm, vcpu->id, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + vcpu_args_set(vcpu, 2, addr_gva2gpa(vm, hcall_page), hcall_params); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - vcpu_set_hv_cpuid(vm, vcpu->id); + vcpu_set_hv_cpuid(vcpu); best = kvm_get_supported_hv_cpuid(); @@ -641,12 +641,12 @@ static void guest_test_hcalls_access(void) else pr_debug("Stage %d: finish\n", stage); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == 0, "Unexpected stage: %ld (0 expected)\n", diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index b6a749f5c766..171009184c3b 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -133,19 +133,19 @@ int main(int argc, char *argv[]) } /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_set_hv_cpuid(vm, vcpu->id); + vcpu_set_hv_cpuid(vcpu); run = vcpu->run; vcpu_alloc_svm(vm, &nested_gva); - vcpu_args_set(vm, vcpu->id, 1, nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (stage = 1;; stage++) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 2c1f850c4053..6e3c4bd60b76 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -116,14 +116,14 @@ static void enter_guest(struct kvm_vcpu *vcpu) vm_ioctl(vm, KVM_GET_CLOCK, &start); - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); vm_ioctl(vm, KVM_GET_CLOCK, &end); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u 
(%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: handle_sync(&uc, &start, &end); break; @@ -193,7 +193,7 @@ int main(void) pvti_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000); pvti_gpa = addr_gva2gpa(vm, pvti_gva); - vcpu_args_set(vm, vcpu->id, 2, pvti_gpa, pvti_gva); + vcpu_args_set(vcpu, 2, pvti_gpa, pvti_gva); enter_guest(vcpu); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 734e71739d33..f497d6ecec25 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -177,12 +177,12 @@ static void enter_guest(struct kvm_vcpu *vcpu) struct ucall uc; while (true) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "unexpected exit reason: %u (%s)", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_PR_MSR: pr_msr(&uc); break; @@ -211,14 +211,14 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, guest_main); - vcpu_enable_cap(vm, vcpu->id, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); + vcpu_enable_cap(vcpu, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); best = kvm_get_supported_cpuid(); clear_kvm_cpuid_features(best); - vcpu_set_cpuid(vm, vcpu->id, best); + vcpu_set_cpuid(vcpu, best); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); enter_guest(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c index 829a5cf94f5c..1404dfda2e9f 100644 --- a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c @@ -35,7 +35,7 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) /* Map 1gb page without a backing memlot. */ __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, PG_LEVEL_1G); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); /* Guest access to the 1gb page should trigger MMIO. */ TEST_ASSERT(run->exit_reason == KVM_EXIT_MMIO, @@ -54,7 +54,7 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) * returns the struct that contains the entry being modified. Eww. */ *cpuid_reg = evil_cpuid_val; - vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); /* * Add a dummy memslot to coerce KVM into bumping the MMIO generation. @@ -67,12 +67,12 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) /* Set up a #PF handler to eat the RSVD #PF and signal all done! 
*/ vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, PF_VECTOR, guest_pf_handler); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); - cmd = get_ucall(vm, vcpu->id, NULL); + cmd = get_ucall(vcpu, NULL); TEST_ASSERT(cmd == UCALL_DONE, "Unexpected guest exit, exit_reason=%s, ucall.cmd = %lu\n", exit_reason_str(run->exit_reason), cmd); diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index eb5e1f972d76..3cb48e4b615b 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -40,12 +40,12 @@ static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu) struct ucall uc; vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, true); - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - get_ucall(vcpu->vm, vcpu->id, &uc); + get_ucall(vcpu, &uc); TEST_ASSERT(uc.cmd == UCALL_SYNC, "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd); TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == @@ -59,7 +59,7 @@ static void test_msr_platform_info_disabled(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, false); - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", run->exit_reason, @@ -84,12 +84,12 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - msr_platform_info = vcpu_get_msr(vm, vcpu->id, MSR_PLATFORM_INFO); - vcpu_set_msr(vm, vcpu->id, MSR_PLATFORM_INFO, - msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); + vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); test_msr_platform_info_enabled(vcpu); test_msr_platform_info_disabled(vcpu); - vcpu_set_msr(vm, vcpu->id, MSR_PLATFORM_INFO, msr_platform_info); + vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index ffa9e267188c..c86b84b68ee3 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -177,12 +177,12 @@ static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - get_ucall(vcpu->vm, vcpu->id, &uc); + get_ucall(vcpu, &uc); TEST_ASSERT(uc.cmd == UCALL_SYNC, "Received ucall other than UCALL_SYNC: %lu", uc.cmd); return uc.args[1]; @@ -371,7 +371,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) vcpu = vm_vcpu_add(vm, 0, guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); TEST_ASSERT(!sanity_check_pmu(vcpu), "Guest should not be able to use disabled PMU."); @@ -470,7 +470,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); vm_init_descriptor_tables(vm); - 
vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); if (!sanity_check_pmu(vcpu)) { print_skip("Guest PMU is not functional"); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index b11f12888fad..afc063178c6a 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -49,9 +49,9 @@ static void run_vcpu(struct kvm_vcpu *vcpu) for (stage = 0; stage < 2; stage++) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && uc.args[1] == stage + 1, diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 2e67df3a95ba..dd344439ad33 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -35,11 +35,11 @@ static void test_cr4_feature_bit(struct kvm_vcpu *vcpu, struct kvm_sregs *orig, memcpy(&sregs, orig, sizeof(sregs)); sregs.cr4 |= feature_bit; - rc = _vcpu_sregs_set(vcpu->vm, vcpu->id, &sregs); + rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(rc, "KVM allowed unsupported CR4 bit (0x%lx)", feature_bit); /* Sanity check that KVM didn't change anything. */ - vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); TEST_ASSERT(!memcmp(&sregs, orig, sizeof(sregs)), "KVM modified sregs"); } @@ -97,15 +97,15 @@ int main(int argc, char *argv[]) vm = vm_create_barebones(); vcpu = __vm_vcpu_add(vm, 0); - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.cr4 |= calc_cr4_feature_bits(vm); cr4 = sregs.cr4; - rc = _vcpu_sregs_set(vm, vcpu->id, &sregs); + rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", sregs.cr4, cr4); @@ -125,13 +125,13 @@ int main(int argc, char *argv[]) /* Create a "real" VM and verify APIC_BASE can be set. 
*/ vm = vm_create_with_one_vcpu(&vcpu, NULL); - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.apic_base = 1 << 10; - rc = _vcpu_sregs_set(vm, vcpu->id, &sregs); + rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)", sregs.apic_base); sregs.apic_base = 1 << 11; - rc = _vcpu_sregs_set(vm, vcpu->id, &sregs); + rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)", sregs.apic_base); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 36165b774a28..3cd1da388b52 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -118,12 +118,12 @@ void inject_smi(struct kvm_vcpu *vcpu) { struct kvm_vcpu_events events; - vcpu_events_get(vcpu->vm, vcpu->id, &events); + vcpu_events_get(vcpu, &events); events.smi.pending = 1; events.flags |= KVM_VCPUEVENT_VALID_SMM; - vcpu_events_set(vcpu->vm, vcpu->id, &events); + vcpu_events_set(vcpu, &events); } int main(int argc, char *argv[]) @@ -151,7 +151,7 @@ int main(int argc, char *argv[]) memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler, sizeof(smi_handler)); - vcpu_set_msr(vm, vcpu->id, MSR_IA32_SMBASE, SMRAM_GPA); + vcpu_set_msr(vcpu, MSR_IA32_SMBASE, SMRAM_GPA); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { if (nested_svm_supported()) @@ -163,17 +163,17 @@ int main(int argc, char *argv[]) if (!nested_gva) pr_info("will skip SMM test with VMX enabled\n"); - vcpu_args_set(vm, vcpu->id, 1, nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (stage = 1;; stage++) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); memset(&regs, 0, sizeof(regs)); - vcpu_regs_get(vm, vcpu->id, &regs); + vcpu_regs_get(vcpu, &regs); stage_reported = regs.rax & 0xff; @@ -201,12 +201,12 @@ int main(int argc, char *argv[]) if (stage == 10) inject_smi(vcpu); - state = vcpu_save_state(vm, vcpu->id); + state = vcpu_save_state(vcpu); kvm_vm_release(vm); vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); - vcpu_load_state(vm, vcpu->id, state); + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); } diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index b7869efad22a..0bcd78cf7c79 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -167,7 +167,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; - vcpu_regs_get(vm, vcpu->id, &regs1); + vcpu_regs_get(vcpu, &regs1); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { if (nested_svm_supported()) @@ -179,16 +179,16 @@ int main(int argc, char *argv[]) if (!nested_gva) pr_info("will skip nested state checks\n"); - vcpu_args_set(vm, vcpu->id, 1, nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (stage = 1;; stage++) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); @@ -206,21 +206,21
@@ int main(int argc, char *argv[]) uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); - state = vcpu_save_state(vm, vcpu->id); + state = vcpu_save_state(vcpu); memset(&regs1, 0, sizeof(regs1)); - vcpu_regs_get(vm, vcpu->id, &regs1); + vcpu_regs_get(vcpu, &regs1); kvm_vm_release(vm); /* Restore state in a new VM. */ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); - vcpu_load_state(vm, vcpu->id, state); + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); memset(&regs2, 0, sizeof(regs2)); - vcpu_regs_get(vm, vcpu->id, &regs2); + vcpu_regs_get(vcpu, &regs2); TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c index 8e90e463895a..9c68a47b69e1 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -95,23 +95,23 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vm, vcpu->id, 1, svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); run = vcpu->run; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); break; diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index a337ab2ec101..1c3f457aa3aa 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -145,7 +145,7 @@ static void run_test(bool is_nmi) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); @@ -163,23 +163,23 @@ static void run_test(bool is_nmi) } else { idt_alt_vm = 0; } - vcpu_args_set(vm, vcpu->id, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); + vcpu_args_set(vcpu, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); memset(&debug, 0, sizeof(debug)); - vcpu_guest_debug_set(vm, vcpu->id, &debug); + vcpu_guest_debug_set(vcpu, &debug); struct kvm_run *run = vcpu->run; struct ucall uc; alarm(2); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); alarm(0); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld, vals = 0x%lx 0x%lx 0x%lx", (const char *)uc.args[0], __FILE__, uc.args[1], uc.args[2], uc.args[3], uc.args[4]); diff --git
a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c index 15e389a7cd31..e6d7191866a5 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -44,19 +44,19 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vm, vcpu->id, 1, svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); for (;;) { volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index c971706b49f5..773db9d4f228 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -109,14 +109,14 @@ int main(int argc, char *argv[]) /* Request reading invalid register set from VCPU. */ run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); @@ -124,14 +124,14 @@ int main(int argc, char *argv[]) /* Request setting invalid register set into VCPU. */ run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); @@ -140,19 +140,19 @@ int main(int argc, char *argv[]) /* Request and verify all valid register sets. */ /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */ run->kvm_valid_regs = TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - vcpu_regs_get(vm, vcpu->id, &regs); + vcpu_regs_get(vcpu, &regs); compare_regs(&regs, &run->s.regs.regs); - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs.sregs); - vcpu_events_get(vm, vcpu->id, &events); + vcpu_events_get(vcpu, &events); compare_vcpu_events(&events, &run->s.regs.events); /* Set and verify various register values.
*/ @@ -162,7 +162,7 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -174,13 +174,13 @@ int main(int argc, char *argv[]) "apic_base sync regs value incorrect 0x%llx.", run->s.regs.sregs.apic_base); - vcpu_regs_get(vm, vcpu->id, &regs); + vcpu_regs_get(vcpu, &regs); compare_regs(&regs, &run->s.regs.regs); - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs.sregs); - vcpu_events_get(vm, vcpu->id, &events); + vcpu_events_get(vcpu, &events); compare_vcpu_events(&events, &run->s.regs.events); /* Clear kvm_dirty_regs bits, verify new s.regs values are @@ -189,7 +189,7 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = 0; run->s.regs.regs.rbx = 0xDEADBEEF; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -206,8 +206,8 @@ int main(int argc, char *argv[]) run->kvm_dirty_regs = 0; run->s.regs.regs.rbx = 0xAAAA; regs.rbx = 0xBAC0; - vcpu_regs_set(vm, vcpu->id, &regs); - rv = _vcpu_run(vm, vcpu->id); + vcpu_regs_set(vcpu, &regs); + rv = _vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -215,7 +215,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); - vcpu_regs_get(vm, vcpu->id, &regs); + vcpu_regs_get(vcpu, &regs); TEST_ASSERT(regs.rbx == 0xBAC0 + 1, "rbx guest value incorrect 0x%llx.", regs.rbx); @@ -227,7 +227,7 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = 0; run->kvm_dirty_regs = TEST_SYNC_FIELDS; run->s.regs.regs.rbx = 0xBBBB; - rv = _vcpu_run(vm, vcpu->id); + rv = _vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, @@ -235,7 +235,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); - vcpu_regs_get(vm, vcpu->id, &regs); + vcpu_regs_get(vcpu, &regs); TEST_ASSERT(regs.rbx == 0xBBBB + 1, "rbx guest value incorrect 0x%llx.", regs.rbx); diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 2b0f19ddbc8b..01d491f849c2 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -61,8 +61,8 @@ int main(void) run = vcpu->run; vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); - vcpu_run(vm, vcpu->id); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Expected KVM_EXIT_IO, got: %u (%s)\n", @@ -70,21 +70,21 @@ int main(void) TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, "Expected IN from port %d from L2, got port %d", ARBITRARY_IO_PORT, run->io.port); - vcpu_events_get(vm, vcpu->id, &events); + vcpu_events_get(vcpu, &events); events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; events.triple_fault.pending = true; - vcpu_events_set(vm, vcpu->id, &events); + vcpu_events_set(vcpu, &events); run->immediate_exit = true; - vcpu_run_complete_io(vm, vcpu->id); + vcpu_run_complete_io(vcpu); -
vcpu_events_get(vm, vcpu->id, &events); + vcpu_events_get(vcpu, &events); TEST_ASSERT(events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT, "Triple fault event invalid"); TEST_ASSERT(events.triple_fault.pending, "No triple fault pending"); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_DONE: break; case UCALL_ABORT: diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c index 3b7bf660eced..3165d3f7e065 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -14,7 +14,7 @@ #define GUEST_STEP (UNITY * 4) #define ROUND(x) ((x + UNITY / 2) & -UNITY) #define rounded_rdmsr(x) ROUND(rdmsr(x)) -#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, vcpu->id, x)) +#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vcpu, x)) static void guest_code(void) { @@ -68,9 +68,9 @@ static void run_vcpu(struct kvm_vcpu *vcpu, int stage) { struct ucall uc; - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", @@ -116,18 +116,18 @@ int main(void) * Host: writes to MSR_IA32_TSC set the host-side offset * and therefore do not change MSR_IA32_TSC_ADJUST. */ - vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC, HOST_ADJUST + val); + vcpu_set_msr(vcpu, MSR_IA32_TSC, HOST_ADJUST + val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); run_vcpu(vcpu, 3); /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */ - vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC_ADJUST, UNITY * 123456); + vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, UNITY * 123456); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); - ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_TSC_ADJUST), UNITY * 123456); + ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_TSC_ADJUST), UNITY * 123456); /* Restore previous value. 
*/ - vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC_ADJUST, val); + vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index 728b252597cc..e416af887ca0 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -58,7 +58,7 @@ static void *run_vcpu(void *_cpu_nr) if (!first_cpu_done) { first_cpu_done = true; - vcpu_set_msr(vm, vcpu->id, MSR_IA32_TSC, TEST_TSC_OFFSET); + vcpu_set_msr(vcpu, MSR_IA32_TSC, TEST_TSC_OFFSET); } pthread_spin_unlock(&create_lock); @@ -67,13 +67,13 @@ static void *run_vcpu(void *_cpu_nr) volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_DONE: goto out; diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c index 0ba774ed6476..7538d57a41d5 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c @@ -65,14 +65,14 @@ int main(int argc, char *argv[]) memset(&regs, 0, sizeof(regs)); while (1) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - if (get_ucall(vm, vcpu->id, &uc)) + if (get_ucall(vcpu, &uc)) break; TEST_ASSERT(run->io.port == 0x80, @@ -85,13 +85,13 @@ int main(int argc, char *argv[]) * scope from a testing perspective as it's not ABI in any way, * i.e. it really is abusing internal KVM knowledge.
*/ - vcpu_regs_get(vm, vcpu->id, &regs); + vcpu_regs_get(vcpu, &regs); if (regs.rcx == 2) regs.rcx = 1; if (regs.rcx == 3) regs.rcx = 8192; memset((void *)run + run->io.data_offset, 0xaa, 4096); - vcpu_regs_set(vm, vcpu->id, &regs); + vcpu_regs_set(vcpu, &regs); } switch (uc.cmd) { diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index a0d35e578b25..f84dc37426f5 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -399,7 +399,7 @@ static void check_for_guest_assert(struct kvm_vcpu *vcpu) struct ucall uc; if (vcpu->run->exit_reason == KVM_EXIT_IO && - get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) { + get_ucall(vcpu, &uc) == UCALL_ABORT) { TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); } @@ -483,7 +483,7 @@ static void process_ucall_done(struct kvm_vcpu *vcpu) run->exit_reason, exit_reason_str(run->exit_reason)); - TEST_ASSERT(get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_DONE, + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE, "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", uc.cmd, UCALL_DONE); } @@ -500,7 +500,7 @@ static uint64_t process_ucall(struct kvm_vcpu *vcpu) run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: break; case UCALL_ABORT: @@ -519,26 +519,26 @@ static uint64_t process_ucall(struct kvm_vcpu *vcpu) static void run_guest_then_process_rdmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); process_rdmsr(vcpu, msr_index); } static void run_guest_then_process_wrmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); process_wrmsr(vcpu, msr_index); } static uint64_t run_guest_then_process_ucall(struct kvm_vcpu *vcpu) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); return process_ucall(vcpu); } static void run_guest_then_process_ucall_done(struct kvm_vcpu *vcpu) { - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); process_ucall_done(vcpu); } @@ -560,7 +560,7 @@ static void test_msr_filter_allow(void) vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); @@ -577,7 +577,7 @@ static void test_msr_filter_allow(void) run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); vm_install_exception_handler(vm, UD_VECTOR, NULL); if (process_ucall(vcpu) != UCALL_DONE) { @@ -608,7 +608,7 @@ static int handle_ucall(struct kvm_vcpu *vcpu) { struct ucall uc; - switch (get_ucall(vcpu->vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("Guest assertion not met"); break; @@ -684,7 +684,7 @@ static void test_msr_filter_deny(void) vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_deny); while (1) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); switch (run->exit_reason) { case KVM_EXIT_X86_RDMSR: diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index 10f9c86029e6..ef7514376b1e 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -95,13
@@ int main(int argc, char *argv[]) vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); prepare_virtualize_apic_accesses(vmx, vm); - vcpu_args_set(vm, vcpu->id, 2, vmx_pages_gva, high_gpa); + vcpu_args_set(vcpu, 2, vmx_pages_gva, high_gpa); while (!done) { volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); if (apic_access_addr == high_gpa) { TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index da0363076fba..40c77bb706a1 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -57,13 +57,13 @@ int main(int argc, char *argv[]) /* Allocate VMX pages and shared descriptors (vmx_pages). */ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); for (;;) { volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) if (run->io.port == PORT_L0_EXIT) break; - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index fb8c7f7236f7..215ffa0589d4 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -82,7 +82,7 @@ int main(int argc, char *argv[]) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); run = vcpu->run; /* Add an extra memory slot for testing dirty logging */ @@ -115,13 +115,13 @@ int main(int argc, char *argv[]) while (!done) { memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c index 70b30583e50d..5bc2cee0d613 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -24,7 +24,7 @@ static void __run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, "Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n", @@ -60,9 +60,9 @@ static void 
set_or_clear_invalid_guest_state(struct kvm_vcpu *vcpu, bool set) static struct kvm_sregs sregs; if (!sregs.cr0) - vcpu_sregs_get(vcpu->vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.tr.unusable = !!set; - vcpu_sregs_set(vcpu->vm, vcpu->id, &sregs); + vcpu_sregs_set(vcpu, &sregs); } static void set_invalid_guest_state(struct kvm_vcpu *vcpu) @@ -91,7 +91,7 @@ static void sigalrm_handler(int sig) TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); - vcpu_events_get(vcpu->vm, vcpu->id, &events); + vcpu_events_get(vcpu, &events); /* * If an exception is pending, attempt KVM_RUN with invalid guest, @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) get_set_sigalrm_vcpu(vcpu); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c index ba534be498f9..683f4f0a1616 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c @@ -64,9 +64,9 @@ int main(int argc, char *argv[]) /* Allocate VMX pages and shared descriptors (vmx_pages). */ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); run = vcpu->run; @@ -88,13 +88,13 @@ int main(int argc, char *argv[]) * emulating invalid guest state for L2. */ memset(&sregs, 0, sizeof(sregs)); - vcpu_sregs_get(vm, vcpu->id, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.tr.unusable = 1; - vcpu_sregs_set(vm, vcpu->id, &sregs); + vcpu_sregs_set(vcpu, &sregs); - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_DONE: break; case UCALL_ABORT: diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index c9cb29f06244..647a4320d3bc 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -182,26 +182,25 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); - tsc_khz = __vcpu_ioctl(vm, vcpu->id, KVM_GET_TSC_KHZ, NULL); + tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); /* scale down L1's TSC frequency */ - vcpu_ioctl(vm, vcpu->id, KVM_SET_TSC_KHZ, - (void *) (tsc_khz / l1_scale_factor)); + vcpu_ioctl(vcpu, KVM_SET_TSC_KHZ, (void *) (tsc_khz / l1_scale_factor)); for (;;) { volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *) uc.args[0]); case UCALL_SYNC: diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index 63129ff5d003..a308442458b8 100644 --- 
a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -87,27 +87,27 @@ int main(int argc, char *argv[]) } /* testcase 1, set capabilities when we have PDCM bit */ - vcpu_set_cpuid(vm, vcpu->id, cpuid); - vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); + vcpu_set_cpuid(vcpu, cpuid); + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); /* check capabilities can be retrieved with KVM_GET_MSR */ - ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); + ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); /* check whatever we write with KVM_SET_MSR is _not_ modified */ - vcpu_run(vm, vcpu->id); - ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); + vcpu_run(vcpu); + ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES); /* testcase 2, check valid LBR formats are accepted */ - vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, 0); - ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), 0); + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); + ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), 0); - vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format); - ASSERT_EQ(vcpu_get_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format); + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format); + ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format); /* testcase 3, check invalid LBR format is rejected */ /* Note, on Arch LBR capable platforms, LBR_FMT in perf capability msr is 0x3f, * to avoid the failure, use a true invalid format 0x30 for the test. */ - ret = _vcpu_set_msr(vm, vcpu->id, MSR_IA32_PERF_CAPABILITIES, 0x30); + ret = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0x30); TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail."); printf("Completed perf capability tests.\n"); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index 168adc5b2272..b775a11ec08b 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -178,19 +178,19 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; - vcpu_regs_get(vm, vcpu->id, &regs1); + vcpu_regs_get(vcpu, &regs1); vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); for (stage = 1;; stage++) { - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Stage %d: unexpected exit reason: %u (%s),\n", stage, run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); @@ -232,22 +232,22 @@ int main(int argc, char *argv[]) stage, uc.args[4], uc.args[5]); } - state = vcpu_save_state(vm, vcpu->id); + state = vcpu_save_state(vcpu); memset(&regs1, 0, sizeof(regs1)); - vcpu_regs_get(vm, vcpu->id, &regs1); + vcpu_regs_get(vcpu, &regs1); kvm_vm_release(vm); /* Restore state in a new VM.
*/ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_cpuid(vm, vcpu->id, kvm_get_supported_cpuid()); - vcpu_load_state(vm, vcpu->id, state); + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); memset(&regs2, 0, sizeof(regs2)); - vcpu_regs_get(vm, vcpu->id, &regs2); + vcpu_regs_get(vcpu, &regs2); TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index de38f0e68153..ba783ceb007f 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -28,7 +28,7 @@ bool have_evmcs; void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - vcpu_nested_state_set(vcpu->vm, vcpu->id, state); + vcpu_nested_state_set(vcpu, state); } void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, @@ -37,7 +37,7 @@ void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, { int rv; - rv = __vcpu_nested_state_set(vcpu->vm, vcpu->id, state); + rv = __vcpu_nested_state_set(vcpu, state); TEST_ASSERT(rv == -1 && errno == expected_errno, "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", strerror(expected_errno), expected_errno, rv, strerror(errno), @@ -121,7 +121,7 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) test_nested_state(vcpu, state); /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid(vcpu->vm, vcpu->id, kvm_get_supported_cpuid()); + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); /* * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without @@ -137,7 +137,7 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) state->flags &= KVM_STATE_NESTED_EVMCS; if (have_evmcs) { test_nested_state_expect_einval(vcpu, state); - vcpu_enable_evmcs(vcpu->vm, vcpu->id); + vcpu_enable_evmcs(vcpu); } test_nested_state(vcpu, state); @@ -233,7 +233,7 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) state->hdr.vmx.vmcs12_pa = -1ull; state->flags = 0; test_nested_state(vcpu, state); - vcpu_nested_state_get(vcpu->vm, vcpu->id, state); + vcpu_nested_state_get(vcpu, state); TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); @@ -255,7 +255,7 @@ void disable_vmx(struct kvm_vcpu *vcpu) TEST_ASSERT(i != cpuid->nent, "CPUID function 1 not found"); cpuid->entries[i].ecx &= ~CPUID_VMX; - vcpu_set_cpuid(vcpu->vm, vcpu->id, cpuid); + vcpu_set_cpuid(vcpu, cpuid); cpuid->entries[i].ecx |= CPUID_VMX; } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index 29699d7c16c3..e32bfb102699 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -133,19 +133,19 @@ int main(int argc, char *argv[]) /* Allocate VMX pages and shared descriptors (vmx_pages).
*/ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, vcpu->id, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); for (;;) { volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 4484ee563b18..3d272d7f961e 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -206,14 +206,14 @@ static void *vcpu_thread(void *arg) vcpu->id, r); fprintf(stderr, "vCPU thread running vCPU %u\n", vcpu->id); - vcpu_run(vcpu->vm, vcpu->id); + vcpu_run(vcpu); exit_reason = vcpu->run->exit_reason; TEST_ASSERT(exit_reason == KVM_EXIT_IO, "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", vcpu->id, exit_reason, exit_reason_str(exit_reason)); - if (get_ucall(vcpu->vm, vcpu->id, &uc) == UCALL_ABORT) { + if (get_ucall(vcpu, &uc) == UCALL_ABORT) { TEST_ASSERT(false, "vCPU %u exited with error: %s.\n" "Sending vCPU sent %lu IPIs to halting vCPU\n" @@ -415,7 +415,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(¶ms[0].vcpu, halter_guest_code); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, params[0].vcpu->id); + vcpu_init_descriptor_tables(params[0].vcpu); vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); @@ -428,8 +428,8 @@ int main(int argc, char *argv[]) params[0].data = data; params[1].data = data; - vcpu_args_set(vm, params[0].vcpu->id, 1, test_data_page_vaddr); - vcpu_args_set(vm, params[1].vcpu->id, 1, test_data_page_vaddr); + vcpu_args_set(params[0].vcpu, 1, test_data_page_vaddr); + vcpu_args_set(params[1].vcpu, 1, test_data_page_vaddr); pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd); params[0].pipis_rcvd = pipis_rcvd; diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 56301ee1adee..5c5dc7bbb4e2 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -47,7 +47,7 @@ static void x2apic_guest_code(void) } while (1); } -static void ____test_icr(struct kvm_vm *vm, struct xapic_vcpu *x, uint64_t val) +static void ____test_icr(struct xapic_vcpu *x, uint64_t val) { struct kvm_vcpu *vcpu = x->vcpu; struct kvm_lapic_state xapic; @@ -59,16 +59,16 @@ static void ____test_icr(struct kvm_vm *vm, struct xapic_vcpu *x, uint64_t val) * all bits are valid and should not be modified by KVM (ignoring the * fact that vectors 0-15 are technically illegal). 
*/ - vcpu_ioctl(vm, vcpu->id, KVM_GET_LAPIC, &xapic); + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); *((u32 *)&xapic.regs[APIC_IRR]) = val; *((u32 *)&xapic.regs[APIC_IRR + 0x10]) = val >> 32; - vcpu_ioctl(vm, vcpu->id, KVM_SET_LAPIC, &xapic); + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic); - vcpu_run(vm, vcpu->id); - ASSERT_EQ(get_ucall(vm, vcpu->id, &uc), UCALL_SYNC); + vcpu_run(vcpu); + ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); ASSERT_EQ(uc.args[1], val); - vcpu_ioctl(vm, vcpu->id, KVM_GET_LAPIC, &xapic); + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; if (!x->is_x2apic) @@ -76,24 +76,24 @@ static void ____test_icr(struct kvm_vm *vm, struct xapic_vcpu *x, uint64_t val) ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } -static void __test_icr(struct kvm_vm *vm, struct xapic_vcpu *x, uint64_t val) +static void __test_icr(struct xapic_vcpu *x, uint64_t val) { - ____test_icr(vm, x, val | APIC_ICR_BUSY); - ____test_icr(vm, x, val & ~(u64)APIC_ICR_BUSY); + ____test_icr(x, val | APIC_ICR_BUSY); + ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); } -static void test_icr(struct kvm_vm *vm, struct xapic_vcpu *x) +static void test_icr(struct xapic_vcpu *x) { struct kvm_vcpu *vcpu = x->vcpu; uint64_t icr, i, j; icr = APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_FIXED; for (i = 0; i <= 0xff; i++) - __test_icr(vm, x, icr | i); + __test_icr(x, icr | i); icr = APIC_INT_ASSERT | APIC_DM_FIXED; for (i = 0; i <= 0xff; i++) - __test_icr(vm, x, icr | i); + __test_icr(x, icr | i); /* * Send all flavors of IPIs to non-existent vCPUs. TODO: use number of @@ -102,18 +102,18 @@ static void test_icr(struct kvm_vm *vm, struct xapic_vcpu *x) icr = APIC_INT_ASSERT | 0xff; for (i = vcpu->id + 1; i < 0xff; i++) { for (j = 0; j < 8; j++) - __test_icr(vm, x, i << (32 + 24) | APIC_INT_ASSERT | (j << 8)); + __test_icr(x, i << (32 + 24) | APIC_INT_ASSERT | (j << 8)); } /* And again with a shorthand destination for all types of IPIs. */ icr = APIC_DEST_ALLBUT | APIC_INT_ASSERT; for (i = 0; i < 8; i++) - __test_icr(vm, x, icr | (i << 8)); + __test_icr(x, icr | (i << 8)); /* And a few garbage value, just make sure it's an IRQ (blocked). 
*/ - __test_icr(vm, x, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK); - __test_icr(vm, x, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK); - __test_icr(vm, x, -1ull & ~APIC_DM_FIXED_MASK); + __test_icr(x, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK); + __test_icr(x, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK); + __test_icr(x, -1ull & ~APIC_DM_FIXED_MASK); } int main(int argc, char *argv[]) @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) int i; vm = vm_create_with_one_vcpu(&x.vcpu, x2apic_guest_code); - test_icr(vm, &x); + test_icr(&x); kvm_vm_free(vm); /* @@ -138,15 +138,15 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); x.is_x2apic = false; - cpuid = vcpu_get_cpuid(vm, x.vcpu->id); + cpuid = vcpu_get_cpuid(x.vcpu); for (i = 0; i < cpuid->nent; i++) { if (cpuid->entries[i].function == 1) break; } cpuid->entries[i].ecx &= ~BIT(21); - vcpu_set_cpuid(vm, x.vcpu->id, cpuid); + vcpu_set_cpuid(x.vcpu, cpuid); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - test_icr(vm, &x); + test_icr(&x); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 5c0abaf0eb60..4340c2f2300f 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -348,7 +348,7 @@ static void handle_alrm(int sig) { if (vinfo) printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending); - vcpu_dump(stdout, vcpu->vm, vcpu->id, 0); + vcpu_dump(stdout, vcpu, 0); TEST_FAIL("IRQ delivery timed out"); } @@ -423,13 +423,13 @@ int main(int argc, char *argv[]) .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, .u.gpa = VCPU_INFO_ADDR, }; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &vi); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi); struct kvm_xen_vcpu_attr pvclock = { .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, .u.gpa = PVTIME_ADDR, }; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &pvclock); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock); struct kvm_xen_hvm_attr vec = { .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR, @@ -438,7 +438,7 @@ int main(int argc, char *argv[]) vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, vcpu->id); + vcpu_init_descriptor_tables(vcpu); vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); if (do_runstate_tests) { @@ -446,7 +446,7 @@ int main(int argc, char *argv[]) .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, .u.gpa = RUNSTATE_ADDR, }; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &st); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); } int irq_fd[2] = { -1, -1 }; @@ -522,7 +522,7 @@ int main(int argc, char *argv[]) inj.u.evtchn.flags = 0; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); } vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); vinfo->evtchn_upcall_pending = 0; @@ -536,14 +536,14 @@ int main(int argc, char *argv[]) volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ @@ -572,7 +572,7 @@ int main(int argc, char *argv[]) printf("Testing runstate %s\n", runstate_names[uc.args[1]]); 
rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; rst.u.runstate.state = uc.args[1]; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &rst); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); break; case 4: @@ -587,7 +587,7 @@ int main(int argc, char *argv[]) 0x6b6b - rs->time[RUNSTATE_offline]; rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked - rst.u.runstate.time_offline; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &rst); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); break; case 5: @@ -599,7 +599,7 @@ int main(int argc, char *argv[]) rst.u.runstate.state_entry_time = 0x6b6b + 0x5a; rst.u.runstate.time_blocked = 0x6b6b; rst.u.runstate.time_offline = 0x5a; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &rst); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); break; case 6: @@ -700,7 +700,7 @@ int main(int argc, char *argv[]) case 14: memset(&tmr, 0, sizeof(tmr)); tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER, "Timer port not returned"); TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, @@ -720,7 +720,7 @@ int main(int argc, char *argv[]) printf("Testing restored oneshot timer\n"); tmr.u.timer.expires_ns = rs->state_entry_time + 100000000, - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); evtchn_irq_expected = true; alarm(1); break; @@ -747,7 +747,7 @@ int main(int argc, char *argv[]) printf("Testing SCHEDOP_poll wake on masked event\n"); tmr.u.timer.expires_ns = rs->state_entry_time + 100000000, - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); alarm(1); break; @@ -758,11 +758,11 @@ int main(int argc, char *argv[]) evtchn_irq_expected = true; tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); /* Read it back and check the pending time is reported correctly */ tmr.u.timer.expires_ns = 0; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000, "Timer not reported pending"); alarm(1); @@ -772,7 +772,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(!evtchn_irq_expected, "Expected event channel IRQ but it didn't happen"); /* Read timer and check it is no longer pending */ - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending"); shinfo->evtchn_pending[0] = 0; @@ -781,7 +781,7 @@ int main(int argc, char *argv[]) evtchn_irq_expected = true; tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_SET_ATTR, &tmr); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); alarm(1); break; @@ -851,7 +851,7 @@ int main(int argc, char *argv[]) struct kvm_xen_vcpu_attr rst = { .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA, }; - vcpu_ioctl(vm, vcpu->id, KVM_XEN_VCPU_GET_ATTR, &rst); + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst); if (verbose) { printf("Runstate: %s(%d), entry %" PRIu64 " ns\n", diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index 1411ead620fe..a91f11fb26f4 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ 
b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) } vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_set_hv_cpuid(vm, vcpu->id); + vcpu_set_hv_cpuid(vcpu); struct kvm_xen_hvm_config hvmc = { .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, @@ -107,7 +107,7 @@ int main(int argc, char *argv[]) volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, vcpu->id); + vcpu_run(vcpu); if (run->exit_reason == KVM_EXIT_XEN) { ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL); @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vm, vcpu->id, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index a89d49ae79a6..1e3506c3deed 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -38,11 +38,11 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - xss_val = vcpu_get_msr(vm, vcpu->id, MSR_IA32_XSS); + xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS); TEST_ASSERT(xss_val == 0, "MSR_IA32_XSS should be initialized to zero\n"); - vcpu_set_msr(vm, vcpu->id, MSR_IA32_XSS, xss_val); + vcpu_set_msr(vcpu, MSR_IA32_XSS, xss_val); /* * At present, KVM only supports a guest IA32_XSS value of 0. Verify @@ -52,7 +52,7 @@ int main(int argc, char *argv[]) */ xss_in_msr_list = kvm_msr_is_in_save_restore_list(MSR_IA32_XSS); for (i = 0; i < MSR_BITS; ++i) { - r = _vcpu_set_msr(vm, vcpu->id, MSR_IA32_XSS, 1ull << i); + r = _vcpu_set_msr(vcpu, MSR_IA32_XSS, 1ull << i); /* * Setting a list of MSRs returns the entry that "faulted", or -- cgit v1.2.3 From fce542992b5d8baaf6f7a4d61a3273de3a2ff10b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 16:46:46 -0800 Subject: KVM: selftests: Drop vcpu_get(), rename vcpu_find() => vcpu_exists() Drop vcpu_get() and rename vcpu_find() to vcpu_exists() to make it that much harder for a test to give meaning to a vCPU ID. I.e. force tests to capture a vCPU when the vCPU is created. 
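For example, a test that previously created a vCPU by ID and fetched it again later must now capture the pointer when the vCPU is created (illustrative sketch only, not taken from an in-tree test; guest_code stands in for whatever entry point the test uses):

  struct kvm_vcpu *vcpu;

  /* Capture the vCPU at creation time... */
  vcpu = vm_vcpu_add(vm, 0, guest_code);
  /* ...and use the pointer directly; vcpu_get(vm, 0) no longer exists. */
  vcpu_run(vcpu);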
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 2 -- tools/testing/selftests/kvm/lib/kvm_util.c | 34 +++++++++------------- 2 files changed, 13 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 640634bdba9a..2da9db060378 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -93,8 +93,6 @@ struct kvm_vm { continue; \ else -struct kvm_vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpu_id); - struct userspace_mem_region * memslot2region(struct kvm_vm *vm, uint32_t memslot); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index eb04b9c0a13c..60051258a3a3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -459,26 +459,6 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, return ®ion->region; } -static struct kvm_vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpu_id) -{ - struct kvm_vcpu *vcpu; - - list_for_each_entry(vcpu, &vm->vcpus, list) { - if (vcpu->id == vcpu_id) - return vcpu; - } - - return NULL; -} - -struct kvm_vcpu *vcpu_get(struct kvm_vm *vm, uint32_t vcpu_id) -{ - struct kvm_vcpu *vcpu = vcpu_find(vm, vcpu_id); - - TEST_ASSERT(vcpu, "vCPU %d does not exist", vcpu_id); - return vcpu; -} - /* * VM VCPU Remove * @@ -1049,6 +1029,18 @@ static int vcpu_mmap_sz(void) return ret; } +static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_vcpu *vcpu; + + list_for_each_entry(vcpu, &vm->vcpus, list) { + if (vcpu->id == vcpu_id) + return true; + } + + return false; +} + /* * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id. * No additional vCPU setup is done. Returns the vCPU. @@ -1058,7 +1050,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) struct kvm_vcpu *vcpu; /* Confirm a vcpu with the specified id doesn't already exist. */ - TEST_ASSERT(!vcpu_find(vm, vcpu_id), "vCPU%d already exists\n", vcpu_id); + TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists\n", vcpu_id); /* Allocate and initialize new vcpu structure. */ vcpu = calloc(1, sizeof(*vcpu)); -- cgit v1.2.3 From 96a96e1ad06f8fc380a69954f5511e950a5eeb23 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 16:48:13 -0800 Subject: KVM: selftests: Remove vcpu_state() helper Drop vcpu_state() now that all tests reference vcpu->run directly. 
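The wrapper added nothing over the direct reference, i.e. (illustrative):

  struct kvm_run *run = vcpu->run;	/* was: run = vcpu_state(vcpu) */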
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 - tools/testing/selftests/kvm/lib/kvm_util.c | 19 +------------------ 2 files changed, 1 insertion(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 2da9db060378..5741a999aca1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -298,7 +298,6 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); -struct kvm_run *vcpu_state(struct kvm_vcpu *vcpu); void vcpu_run(struct kvm_vcpu *vcpu); int _vcpu_run(struct kvm_vcpu *vcpu); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 60051258a3a3..ec4642f79fa3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1001,19 +1001,7 @@ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) __vm_mem_region_delete(vm, memslot2region(vm, slot), true); } -/* - * VCPU mmap Size - * - * Input Args: None - * - * Output Args: None - * - * Return: - * Size of VCPU state - * - * Returns the size of the structure pointed to by the return value - * of vcpu_state(). - */ +/* Returns the size of a vCPU's kvm_run structure. */ static int vcpu_mmap_sz(void) { int dev_fd, ret; @@ -1394,11 +1382,6 @@ void vm_create_irqchip(struct kvm_vm *vm) vm->has_irqchip = true; } -struct kvm_run *vcpu_state(struct kvm_vcpu *vcpu) -{ - return vcpu->run; -} - int _vcpu_run(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 68c1b3e910c0451ed9a51093c603cc4898d643ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 16 Feb 2022 16:51:20 -0800 Subject: KVM: selftests: Open code and drop 'struct kvm_vm' accessors Drop a variety of 'struct kvm_vm' accessors that wrap a single variable now that tests can simply reference the variable directly. 
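Tests now read the fields straight off the VM, e.g. (illustrative, mirroring the conversions below):

  guest_page_size = vm->page_size;		/* was: vm_get_page_size(vm) */
  max_gpa = vm->max_gfn << vm->page_shift;	/* was: vm_get_max_gfn(vm) << vm_get_page_shift(vm) */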
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 9 ++++---- .../testing/selftests/kvm/include/kvm_util_base.h | 6 ------ tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 25 ---------------------- tools/testing/selftests/kvm/lib/perf_test_util.c | 2 +- .../testing/selftests/kvm/max_guest_memory_test.c | 11 +++++----- .../kvm/memslot_modification_stress_test.c | 2 +- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 2 +- 9 files changed, 14 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 2027208e7d10..808a36dbf0c0 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -221,7 +221,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_set_wr_fract(vm, p->wr_fract); - guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); + guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_num_pages = vm_num_host_pages(mode, guest_num_pages); pages_per_slot = host_num_pages / p->slots; diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 906e893375df..ca584b9bf5c0 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -713,21 +713,20 @@ static void run_test(enum vm_guest_mode mode, void *arg) vm = create_vm(mode, &vcpu, 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code); - guest_page_size = vm_get_page_size(vm); + guest_page_size = vm->page_size; /* * A little more than 1G of guest page sized pages. Cover the * case where the size is not aligned to 64 pages. 
*/ - guest_num_pages = (1ul << (DIRTY_MEM_BITS - - vm_get_page_shift(vm))) + 3; + guest_num_pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3; guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_page_size = getpagesize(); host_num_pages = vm_num_host_pages(mode, guest_num_pages); if (!p->phys_offset) { - guest_test_phys_mem = (vm_get_max_gfn(vm) - - guest_num_pages) * guest_page_size; + guest_test_phys_mem = (vm->max_gfn - guest_num_pages) * + guest_page_size; guest_test_phys_mem = align_down(guest_test_phys_mem, host_page_size); } else { guest_test_phys_mem = p->phys_offset; diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 5741a999aca1..45f536f99399 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -592,13 +592,7 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); -unsigned int vm_get_page_size(struct kvm_vm *vm); -unsigned int vm_get_page_shift(struct kvm_vm *vm); unsigned long vm_compute_max_gfn(struct kvm_vm *vm); -uint64_t vm_get_max_gfn(struct kvm_vm *vm); -int vm_get_kvm_fd(struct kvm_vm *vm); -int vm_get_fd(struct kvm_vm *vm); - unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 8706ae358444..0f8792aa0366 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -260,7 +260,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Align down GPA of the testing memslot */ if (!p->phys_offset) - guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * + guest_test_phys_mem = (vm->max_gfn - guest_num_pages) * guest_page_size; else guest_test_phys_mem = p->phys_offset; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index ec4642f79fa3..548c3c366bf5 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1827,36 +1827,11 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); } -unsigned int vm_get_page_size(struct kvm_vm *vm) -{ - return vm->page_size; -} - -unsigned int vm_get_page_shift(struct kvm_vm *vm) -{ - return vm->page_shift; -} - unsigned long __attribute__((weak)) vm_compute_max_gfn(struct kvm_vm *vm) { return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; } -uint64_t vm_get_max_gfn(struct kvm_vm *vm) -{ - return vm->max_gfn; -} - -int vm_get_kvm_fd(struct kvm_vm *vm) -{ - return vm->kvm_fd; -} - -int vm_get_fd(struct kvm_vm *vm) -{ - return vm->fd; -} - static unsigned int vm_calc_num_pages(unsigned int num_pages, unsigned int page_shift, unsigned int new_page_shift, diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index a8c7b2785cc4..2649595194ad 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -159,7 +159,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, pta->vm = vm; /* Put the test 
region at the top guest physical memory. */ - region_end_gfn = vm_get_max_gfn(vm) + 1; + region_end_gfn = vm->max_gfn + 1; #ifdef __x86_64__ /* diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 8f34c5aca420..9a6e4f3ad6b5 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -65,8 +65,7 @@ static void *vcpu_worker(void *data) struct kvm_sregs sregs; struct kvm_regs regs; - vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, - vm_get_page_size(vm)); + vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); /* Snapshot regs before the first run. */ vcpu_regs_get(vcpu, ®s); @@ -104,7 +103,7 @@ static pthread_t *spawn_workers(struct kvm_vm *vm, struct kvm_vcpu **vcpus, TEST_ASSERT(info, "Failed to allocate vCPU gpa ranges"); nr_bytes = ((end_gpa - start_gpa) / nr_vcpus) & - ~((uint64_t)vm_get_page_size(vm) - 1); + ~((uint64_t)vm->page_size - 1); TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus); for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) { @@ -220,7 +219,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); - max_gpa = vm_get_max_gfn(vm) << vm_get_page_shift(vm); + max_gpa = vm->max_gfn << vm->page_shift; TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb "); fd = kvm_memfd_alloc(slot_size, hugepages); @@ -230,7 +229,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed"); /* Pre-fault the memory to avoid taking mmap_sem on guest page faults. */ - for (i = 0; i < slot_size; i += vm_get_page_size(vm)) + for (i = 0; i < slot_size; i += vm->page_size) ((uint8_t *)mem)[i] = 0xaa; gpa = 0; @@ -249,7 +248,7 @@ int main(int argc, char *argv[]) for (i = 0; i < slot_size; i += size_1gb) __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); #else - for (i = 0; i < slot_size; i += vm_get_page_size(vm)) + for (i = 0; i < slot_size; i += vm->page_size) virt_pg_map(vm, gpa + i, gpa + i); #endif } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 1f9036cdcaa9..6ee7e1dde404 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -75,7 +75,7 @@ static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, * Add the dummy memslot just below the perf_test_util memslot, which is * at the top of the guest physical address space. 
*/ - gpa = perf_test_args.gpa - pages * vm_get_page_size(vm); + gpa = perf_test_args.gpa - pages * vm->page_size; for (i = 0; i < nr_modifications; i++) { usleep(delay); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index af13c48f0f30..6df5a6356181 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -121,7 +121,7 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) if (vcpu) ret = __vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); else - ret = __kvm_ioctl(vm_get_kvm_fd(vm), KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + ret = __kvm_ioctl(vm->kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); TEST_ASSERT(ret == -1 && errno == E2BIG, "%s KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" -- cgit v1.2.3 From 3222d0264fb60a9aa359a58a059a7fb0d2b3c467 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 17:25:17 -0700 Subject: KVM: selftests: Drop @slot0_mem_pages from __vm_create_with_vcpus() All callers of __vm_create_with_vcpus() pass DEFAULT_GUEST_PHY_PAGES for @slot0_mem_pages; drop the param and just hardcode the "default" as the base number of pages for slot0. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 9 +++------ tools/testing/selftests/kvm/kvm_page_table_test.c | 5 ++--- tools/testing/selftests/kvm/lib/kvm_util.c | 21 +++++++--------------- tools/testing/selftests/kvm/lib/perf_test_util.c | 5 ++--- 4 files changed, 14 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 45f536f99399..f84e01612c52 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -561,18 +561,15 @@ static inline struct kvm_vm *vm_create(uint64_t nr_pages) return __vm_create(VM_MODE_DEFAULT, nr_pages); } -/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t slot0_mem_pages, uint64_t extra_mem_pages, - uint32_t num_percpu_pages, void *guest_code, - struct kvm_vcpu *vcpus[]); + uint64_t extra_mem_pages, uint32_t num_percpu_pages, + void *guest_code, struct kvm_vcpu *vcpus[]); static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, void *guest_code, struct kvm_vcpu *vcpus[]) { - return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, - DEFAULT_GUEST_PHY_PAGES, 0, 0, + return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, 0, 0, guest_code, vcpus); } diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 0f8792aa0366..a68c57572ab4 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -254,9 +254,8 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = __vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, - guest_num_pages, 0, guest_code, - test_args.vcpus); + vm = __vm_create_with_vcpus(mode, nr_vcpus, guest_num_pages, 0, + guest_code, test_args.vcpus); /* Align down GPA of the testing memslot */ if (!p->phys_offset) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c index 548c3c366bf5..342bcd576ad6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -280,7 +280,6 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) * Input Args: * mode - VM Mode (e.g. VM_MODE_P52V48_4K) * nr_vcpus - VCPU count - * slot0_mem_pages - Slot0 physical memory size * extra_mem_pages - Non-slot0 physical memory total size * num_percpu_pages - Per-cpu physical memory pages * guest_code - Guest entry point @@ -291,15 +290,13 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) * Return: * Pointer to opaque structure that describes the created VM. * - * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K), - * with customized slot0 memory size, at least 512 pages currently. + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). * extra_mem_pages is only used to calculate the maximum page table size, * no real memory allocation for non-slot0 memory in this function. */ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t slot0_mem_pages, uint64_t extra_mem_pages, - uint32_t num_percpu_pages, void *guest_code, - struct kvm_vcpu *vcpus[]) + uint64_t extra_mem_pages, uint32_t num_percpu_pages, + void *guest_code, struct kvm_vcpu *vcpus[]) { uint64_t vcpu_pages, extra_pg_pages, pages; struct kvm_vm *vm; @@ -307,10 +304,6 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); - /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ - if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) - slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; - /* The maximum page table size for a memory region will be when the * smallest pages are used. Considering each page contains x page * table descriptors, the total extra size for page tables (for extra @@ -318,8 +311,8 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus * than N/x*2. 
*/ vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; - extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - pages = slot0_mem_pages + vcpu_pages + extra_pg_pages; + extra_pg_pages = (DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; + pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), "nr_vcpus = %d too large for host, max-vcpus = %d", @@ -340,8 +333,8 @@ struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vcpu *vcpus[1]; struct kvm_vm *vm; - vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, DEFAULT_GUEST_PHY_PAGES, - extra_mem_pages, 0, guest_code, vcpus); + vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages, 0, + guest_code, vcpus); *vcpu = vcpus[0]; return vm; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 2649595194ad..20d1b2ad75d1 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -113,7 +113,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, { struct perf_test_args *pta = &perf_test_args; struct kvm_vm *vm; - uint64_t guest_num_pages, slot0_pages = DEFAULT_GUEST_PHY_PAGES; + uint64_t guest_num_pages, slot0_pages = 0; uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); uint64_t region_end_gfn; int i; @@ -152,8 +152,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, * The memory is also added to memslot 0, but that's a benign side * effect as KVM allows aliasing HVAs in meslots. */ - vm = __vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, - slot0_pages + guest_num_pages, 0, + vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, 0, perf_test_guest_code, vcpus); pta->vm = vm; -- cgit v1.2.3 From acaf50ad6dcb69a9857ef6c9a42100f6270f5f03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 17:39:47 -0700 Subject: KVM: selftests: Drop @num_percpu_pages from __vm_create_with_vcpus() Drop @num_percpu_pages from __vm_create_with_vcpus(), all callers pass '0' and there's unlikely to be a test that allocates just enough memory that it needs a per-CPU allocation, but not so much that it won't just do its own memory management. 
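Callers simply drop the now-meaningless trailing '0' (illustrative):

  vm = __vm_create_with_vcpus(mode, nr_vcpus, extra_mem_pages,
                              guest_code, vcpus);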
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 4 ++-- tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 7 +++---- tools/testing/selftests/kvm/lib/perf_test_util.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index f84e01612c52..6143d45a02a7 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -562,14 +562,14 @@ static inline struct kvm_vm *vm_create(uint64_t nr_pages) } struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, + uint64_t extra_mem_pages, void *guest_code, struct kvm_vcpu *vcpus[]); static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, void *guest_code, struct kvm_vcpu *vcpus[]) { - return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, 0, 0, + return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, 0, guest_code, vcpus); } diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index a68c57572ab4..f42c6ac6d71d 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -254,7 +254,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = __vm_create_with_vcpus(mode, nr_vcpus, guest_num_pages, 0, + vm = __vm_create_with_vcpus(mode, nr_vcpus, guest_num_pages, guest_code, test_args.vcpus); /* Align down GPA of the testing memslot */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 342bcd576ad6..afefc48757e6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -281,7 +281,6 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) * mode - VM Mode (e.g. VM_MODE_P52V48_4K) * nr_vcpus - VCPU count * extra_mem_pages - Non-slot0 physical memory total size - * num_percpu_pages - Per-cpu physical memory pages * guest_code - Guest entry point * vcpuids - VCPU IDs * @@ -295,7 +294,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) * no real memory allocation for non-slot0 memory in this function. */ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, + uint64_t extra_mem_pages, void *guest_code, struct kvm_vcpu *vcpus[]) { uint64_t vcpu_pages, extra_pg_pages, pages; @@ -310,7 +309,7 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller * than N/x*2. 
*/ - vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; + vcpu_pages = nr_vcpus * DEFAULT_STACK_PGS; extra_pg_pages = (DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; @@ -333,7 +332,7 @@ struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vcpu *vcpus[1]; struct kvm_vm *vm; - vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages, 0, + vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages, guest_code, vcpus); *vcpu = vcpus[0]; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 20d1b2ad75d1..9618b37c66f7 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -152,7 +152,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, * The memory is also added to memslot 0, but that's a benign side * effect as KVM allows aliasing HVAs in meslots. */ - vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, 0, + vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, perf_test_guest_code, vcpus); pta->vm = vm; -- cgit v1.2.3 From 6e1d13bf3815d6058620dd72135be292d9f44bc9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 3 May 2022 09:52:48 -0700 Subject: KVM: selftests: Move per-VM/per-vCPU nr pages calculation to __vm_create() Handle all memslot0 size adjustments in __vm_create(). Currently, the adjustments reside in __vm_create_with_vcpus(), which means tests that call vm_create() or __vm_create() directly are left to their own devices. Some tests just pass DEFAULT_GUEST_PHY_PAGES and don't bother with any adjustments, while others mimic the per-vCPU calculations. For vm_create(), and thus __vm_create(), take the number of vCPUs that will be runnable to calculate that number of per-vCPU pages needed for memslot0. To give readers a hint that neither vm_create() nor __vm_create() create vCPUs, name the parameter @nr_runnable_vcpus instead of @nr_vcpus. That also gives readers a hint as to why tests that create larger numbers of vCPUs but never actually run those vCPUs can skip straight to the vm_create_barebones() variant. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 +- tools/testing/selftests/kvm/aarch64/vgic_init.c | 4 +- tools/testing/selftests/kvm/dirty_log_test.c | 3 +- .../testing/selftests/kvm/hardware_disable_test.c | 2 +- .../testing/selftests/kvm/include/kvm_util_base.h | 9 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 57 ++++++++++++++-------- tools/testing/selftests/kvm/s390x/resets.c | 2 +- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 2 +- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 5 +- .../selftests/kvm/x86_64/tsc_scaling_sync.c | 2 +- 10 files changed, 53 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 3e1bebe63adf..7928c62635fd 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -76,7 +76,7 @@ static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source, struct kvm_vcpu_init init; struct kvm_vm *vm; - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(2); ucall_init(vm, NULL); vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 6b9c9a391a01..02402802b163 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -419,7 +419,7 @@ static void test_v3_typer_accesses(void) uint64_t addr; int ret, i; - v.vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + v.vm = vm_create(NR_VCPUS); (void)vm_vcpu_add(v.vm, 0, guest_code); v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); @@ -479,7 +479,7 @@ static struct vm_gic vm_gic_v3_create_with_vcpuids(int nr_vcpus, struct vm_gic v; int i; - v.vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + v.vm = vm_create(nr_vcpus); for (i = 0; i < nr_vcpus; i++) vm_vcpu_add(v.vm, vcpuids[i], guest_code); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index ca584b9bf5c0..8542f713a101 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -669,11 +669,10 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu, uint64_t extra_mem_pages, void *guest_code) { struct kvm_vm *vm; - uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = __vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages); + vm = __vm_create(mode, 1, extra_mem_pages); log_mode_create_vm_done(vm); *vcpu = vm_vcpu_add(vm, 0, guest_code); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 2401577e3652..f5d59b9934f1 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -98,7 +98,7 @@ static void run_test(uint32_t run) for (i = 0; i < VCPU_NUM; i++) CPU_SET(i, &cpu_set); - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(VCPU_NUM); pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 6143d45a02a7..db9c00a7af4e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ 
-547,18 +547,21 @@ vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); /* * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also * loads the test binary into guest memory and creates an IRQ chip (x86 only). + * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to + * calculate the amount of memory needed for per-vCPU data, e.g. stacks. */ struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages); -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages); +struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages); static inline struct kvm_vm *vm_create_barebones(void) { return ____vm_create(VM_MODE_DEFAULT, 0); } -static inline struct kvm_vm *vm_create(uint64_t nr_pages) +static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) { - return __vm_create(VM_MODE_DEFAULT, nr_pages); + return __vm_create(VM_MODE_DEFAULT, nr_runnable_vcpus, 0); } struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index afefc48757e6..620c35561122 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -258,11 +258,45 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages) return vm; } -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint64_t nr_pages) +static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, + uint32_t nr_runnable_vcpus, + uint64_t extra_mem_pages) { - struct kvm_vm *vm; + uint64_t nr_pages; + + TEST_ASSERT(nr_runnable_vcpus, + "Use vm_create_barebones() for VMs that _never_ have vCPUs\n"); + + TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), + "nr_vcpus = %d too large for host, max-vcpus = %d", + nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); + + nr_pages = DEFAULT_GUEST_PHY_PAGES; + nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS; + + /* + * Account for the number of pages needed for the page tables. The + * maximum page table size for a memory region will be when the + * smallest page size is used. Considering each page contains x page + * table descriptors, the total extra size for page tables (for extra + * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller + * than N/x*2. + */ + nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2; - nr_pages = vm_adjust_num_guest_pages(mode, nr_pages); + TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), + "Host doesn't support %d vCPUs, max-vcpus = %d", + nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); + + return vm_adjust_num_guest_pages(mode, nr_pages); +} + +struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages) +{ + uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus, + nr_extra_pages); + struct kvm_vm *vm; vm = ____vm_create(mode, nr_pages); @@ -297,27 +331,12 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus uint64_t extra_mem_pages, void *guest_code, struct kvm_vcpu *vcpus[]) { - uint64_t vcpu_pages, extra_pg_pages, pages; struct kvm_vm *vm; int i; TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); - /* The maximum page table size for a memory region will be when the - * smallest pages are used. 
Considering each page contains x page - * table descriptors, the total extra size for page tables (for extra - * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller - * than N/x*2. - */ - vcpu_pages = nr_vcpus * DEFAULT_STACK_PGS; - extra_pg_pages = (DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; - - TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), - "nr_vcpus = %d too large for host, max-vcpus = %d", - nr_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); - - vm = __vm_create(mode, pages); + vm = __vm_create(mode, nr_vcpus, extra_mem_pages); for (i = 0; i < nr_vcpus; ++i) vcpus[i] = vm_vcpu_add(vm, i, guest_code); diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index 633ba9055231..e1737411accc 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -206,7 +206,7 @@ static struct kvm_vm *create_vm(struct kvm_vcpu **vcpu) { struct kvm_vm *vm; - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(1); *vcpu = vm_vcpu_add(vm, ARBITRARY_NON_ZERO_VCPU_ID, guest_code_initial); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index c86b84b68ee3..07fa68a1fdc4 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -365,7 +365,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) if (!(r & KVM_PMU_CAP_DISABLE)) return; - vm = vm_create(DEFAULT_GUEST_PHY_PAGES); + vm = vm_create(1); vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index afc063178c6a..8bcaf4421dc5 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -78,13 +78,10 @@ static void run_vcpu(struct kvm_vcpu *vcpu) static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id, struct kvm_vcpu *vcpus[]) { - uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * nr_vcpus; - uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * nr_vcpus; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; struct kvm_vm *vm; uint32_t i; - vm = vm_create(pages); + vm = vm_create(nr_vcpus); vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id); diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index e416af887ca0..4a962952212e 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - vm = vm_create(DEFAULT_GUEST_PHY_PAGES + DEFAULT_STACK_PGS * NR_TEST_VCPUS); + vm = vm_create(NR_TEST_VCPUS); vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ); pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE); -- cgit v1.2.3 From 38081d28835c84e73ff414f23663d57946cc2234 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 3 May 2022 14:48:59 -0700 Subject: KVM: selftests: Trust that MAXPHYADDR > memslot0 in vmx_apic_access_test Use vm->max_gfn to compute the highest gpa in vmx_apic_access_test, and blindly trust that the highest gfn/gpa will be well above the memory carved out for memslot0. 
The existing check is beyond paranoid; KVM doesn't support CPUs with host.MAXPHYADDR < 32, and the selftests are all kinds of hosed if memslot0 overlaps the local xAPIC, which resides above "lower" (below 4gb) DRAM. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index ef7514376b1e..ccb05ef7234e 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -72,8 +72,6 @@ static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) int main(int argc, char *argv[]) { unsigned long apic_access_addr = ~0ul; - unsigned int paddr_width; - unsigned int vaddr_width; vm_vaddr_t vmx_pages_gva; unsigned long high_gpa; struct vmx_pages *vmx; @@ -86,12 +84,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - kvm_get_cpu_address_width(&paddr_width, &vaddr_width); - high_gpa = (1ul << paddr_width) - getpagesize(); - if ((unsigned long)DEFAULT_GUEST_PHY_PAGES * getpagesize() > high_gpa) { - print_skip("No unbacked physical page available"); - exit(KSFT_SKIP); - } + high_gpa = (vm->max_gfn - 1) << vm->page_shift; vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); prepare_virtualize_apic_accesses(vmx, vm); -- cgit v1.2.3 From 032604529827cf889e470dc9b3ad18b2a40311b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 3 May 2022 15:26:02 -0700 Subject: KVM: selftests: Drop DEFAULT_GUEST_PHY_PAGES, open code the magic number Remove DEFAULT_GUEST_PHY_PAGES and open code the magic number (with a comment) in vm_nr_pages_required(). Exposing DEFAULT_GUEST_PHY_PAGES to tests was a symptom of the VM creation APIs not cleanly supporting tests that create runnable vCPUs, but can't do so immediately. Now that tests don't have to manually compute the amount of memory needed for basic operation, make it harder for tests to do things that should be handled by the framework, i.e. force developers to improve the framework instead of hacking around flaws in individual tests. 
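The two patterns left standing are (illustrative):

  vm = vm_create(nr_vcpus);	/* vCPUs will run; framework sizes memslot0 */
  vm = vm_create_barebones();	/* no vCPU will ever run; no sizing needed */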
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 - tools/testing/selftests/kvm/lib/kvm_util.c | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index db9c00a7af4e..1c762988ab9c 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -100,7 +100,6 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot); #define KVM_UTIL_MIN_VADDR 0x2000 #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 -#define DEFAULT_GUEST_PHY_PAGES 512 #define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 #define DEFAULT_STACK_PGS 5 diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 620c35561122..86e7f5afce63 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -271,7 +271,13 @@ static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, "nr_vcpus = %d too large for host, max-vcpus = %d", nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); - nr_pages = DEFAULT_GUEST_PHY_PAGES; + /* + * Arbitrarily allocate 512 pages (2mb when page size is 4kb) for the + * test code and other per-VM assets that will be loaded into memslot0. + */ + nr_pages = 512; + + /* Account for the per-vCPU stacks on behalf of the test. */ nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS; /* -- cgit v1.2.3 From d8ba3f14a50e4bc9745bb8f66a599c6be8ad0cda Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 May 2022 15:09:52 -0700 Subject: KVM: selftests: Return an 'unsigned int' from kvm_check_cap() Return an 'unsigned int' instead of a signed 'int' from kvm_check_cap(), to make it more obvious that kvm_check_cap() can never return a negative value due to its assertion that the return is ">= 0". 
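E.g. the kvm_binary_stats_test conversion below can use a plain logical-not instead of the previous "<= 0" comparison:

  if (!kvm_check_cap(KVM_CAP_BINARY_STATS_FD)) {
          print_skip("Binary form statistics interface is not supported");
          exit(KSFT_SKIP);
  }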
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 1c762988ab9c..72cc0ecda067 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -167,7 +167,7 @@ extern const struct vm_guest_mode_params vm_guest_mode_params[]; int open_path_or_exit(const char *path, int flags); int open_kvm_dev_path_or_exit(void); -int kvm_check_cap(long cap); +unsigned int kvm_check_cap(long cap); #define __KVM_SYSCALL_ERROR(_name, _ret) \ "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 7f2ddc1535d7..982bf3f7d9c5 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -213,7 +213,7 @@ int main(int argc, char *argv[]) } /* Check the extension for binary stats */ - if (kvm_check_cap(KVM_CAP_BINARY_STATS_FD) <= 0) { + if (!kvm_check_cap(KVM_CAP_BINARY_STATS_FD)) { print_skip("Binary form statistics interface is not supported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 86e7f5afce63..4cf0e6bf33bb 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -69,7 +69,7 @@ int open_kvm_dev_path_or_exit(void) * Looks up and returns the value corresponding to the capability * (KVM_CAP_*) given by cap. */ -int kvm_check_cap(long cap) +unsigned int kvm_check_cap(long cap) { int ret; int kvm_fd; -- cgit v1.2.3 From 3ea9b809650b4eda5d4ae18ed7bb080e499af154 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 May 2022 15:13:03 -0700 Subject: KVM: selftests: Add kvm_has_cap() to provide syntactic sugar Add kvm_has_cap() to wrap kvm_check_cap() and return a bool for the use cases where the caller only wants check if a capability is supported, i.e. doesn't care about the value beyond whether or not it's non-zero. The "check" terminology is somewhat ambiguous as the non-boolean return suggests that '0' might mean "success", i.e. suggests that the ioctl uses the 0/-errno pattern. Provide a wrapper instead of trying to find a new name for the raw helper; the "check" terminology is derived from the name of the ioctl, so using e.g. "get" isn't a clear win. 
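The intended usage split is (illustrative):

  if (!kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG))	/* purely boolean query */
          return;

  max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);	/* the value itself matters */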
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 2 +- tools/testing/selftests/kvm/aarch64/vcpu_width_config.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 4 ++-- tools/testing/selftests/kvm/include/kvm_util_base.h | 5 +++++ tools/testing/selftests/kvm/kvm_binary_stats_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 4 ++-- tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c | 2 +- tools/testing/selftests/kvm/x86_64/debug_regs.c | 2 +- tools/testing/selftests/kvm/x86_64/emulator_error_test.c | 2 +- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 4 ++-- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 6 +++--- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 2 +- tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 2 +- tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 4 ++-- tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c | 2 +- tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c | 2 +- tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c | 2 +- tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c | 2 +- tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c | 2 +- 19 files changed, 29 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index f0f83ffda344..8bc10b9acbbc 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -395,7 +395,7 @@ static void check_supported(struct vcpu_config *c) struct reg_sublist *s; for_each_sublist(c, s) { - if (s->capability && !kvm_check_cap(s->capability)) { + if (s->capability && !kvm_has_cap(s->capability)) { fprintf(stderr, "%s: %s not available, skipping tests\n", config_name(c), s->name); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index dd5a1c4b49e0..fff02c442610 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -82,7 +82,7 @@ int main(void) struct kvm_vm *vm; int ret; - if (!kvm_check_cap(KVM_CAP_ARM_EL1_32BIT)) { + if (!kvm_has_cap(KVM_CAP_ARM_EL1_32BIT)) { print_skip("KVM_CAP_ARM_EL1_32BIT is not supported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 8542f713a101..9c883c94d478 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -210,7 +210,7 @@ static void sem_wait_until(sem_t *sem) static bool clear_log_supported(void) { - return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + return kvm_has_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); } static void clear_log_create_vm_done(struct kvm_vm *vm) @@ -264,7 +264,7 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) static bool dirty_ring_supported(void) { - return kvm_check_cap(KVM_CAP_DIRTY_LOG_RING); + return kvm_has_cap(KVM_CAP_DIRTY_LOG_RING); } static void dirty_ring_create_vm_done(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 72cc0ecda067..04ddab322b6b 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -169,6 +169,11 @@ int 
open_path_or_exit(const char *path, int flags); int open_kvm_dev_path_or_exit(void); unsigned int kvm_check_cap(long cap); +static inline bool kvm_has_cap(long cap) +{ + return kvm_check_cap(cap); +} + #define __KVM_SYSCALL_ERROR(_name, _ret) \ "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 982bf3f7d9c5..8754b78ae785 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -213,7 +213,7 @@ int main(int argc, char *argv[]) } /* Check the extension for binary stats */ - if (!kvm_check_cap(KVM_CAP_BINARY_STATS_FD)) { + if (!kvm_has_cap(KVM_CAP_BINARY_STATS_FD)) { print_skip("Binary form statistics interface is not supported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 4cf0e6bf33bb..f93b0d9334e5 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -80,7 +80,7 @@ unsigned int kvm_check_cap(long cap) close(kvm_fd); - return ret; + return (unsigned int)ret; } void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) @@ -93,7 +93,7 @@ static void vm_open(struct kvm_vm *vm) { vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR); - if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { + if (!kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)) { print_skip("immediate_exit not available"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c index 05283f8c9948..cdb7daeed5fd 100644 --- a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c +++ b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c @@ -61,7 +61,7 @@ uint64_t get_diag318_info(void) * If KVM does not support diag318, then return 0 to * ensure tests do not break. */ - if (!kvm_check_cap(KVM_CAP_S390_DIAG318)) { + if (!kvm_has_cap(KVM_CAP_S390_DIAG318)) { if (!printed_skip) { fprintf(stdout, "KVM_CAP_S390_DIAG318 not supported. " "Skipping diag318 test.\n"); diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index c16799b616e0..bba811edef96 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -95,7 +95,7 @@ int main(void) 1, /* cli */ }; - if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) { + if (!kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)) { print_skip("KVM_CAP_SET_GUEST_DEBUG not supported"); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index fb2a2390b4af..119bcb1158d5 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - if (!kvm_check_cap(KVM_CAP_SMALLER_MAXPHYADDR)) { + if (!kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)) { printf("module parameter 'allow_smaller_maxphyaddr' is not set. 
Skipping test.\n"); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 6c4e728d2d85..a6da1ccbee4e 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -209,8 +209,8 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); if (!nested_vmx_supported() || - !kvm_check_cap(KVM_CAP_NESTED_STATE) || - !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + !kvm_has_cap(KVM_CAP_NESTED_STATE) || + !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { print_skip("Enlightened VMCS is unsupported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 6df5a6356181..e2fac752d354 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - if (!kvm_check_cap(KVM_CAP_HYPERV_CPUID)) { + if (!kvm_has_cap(KVM_CAP_HYPERV_CPUID)) { print_skip("KVM_CAP_HYPERV_CPUID not supported"); exit(KSFT_SKIP); } @@ -152,7 +152,7 @@ int main(int argc, char *argv[]) free(hv_cpuid_entries); if (!nested_vmx_supported() || - !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { print_skip("Enlightened VMCS is unsupported"); goto do_sys; } @@ -163,7 +163,7 @@ int main(int argc, char *argv[]) do_sys: /* Test system ioctl version */ - if (!kvm_check_cap(KVM_CAP_SYS_HYPERV_CPUID)) { + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) { print_skip("KVM_CAP_SYS_HYPERV_CPUID not supported"); goto out; } diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index f497d6ecec25..24dad3a47206 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -204,7 +204,7 @@ int main(void) struct kvm_vcpu *vcpu; struct kvm_vm *vm; - if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) { + if (!kvm_has_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) { print_skip("KVM_CAP_ENFORCE_PV_FEATURE_CPUID not supported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 8bcaf4421dc5..abf740f08d68 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -123,7 +123,7 @@ static void check_set_bsp_busy(void) int main(int argc, char *argv[]) { - if (!kvm_check_cap(KVM_CAP_SET_BOOT_CPU_ID)) { + if (!kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)) { print_skip("set_boot_cpu_id not available"); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index ec418b823273..ffd8613987ae 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -400,8 +400,8 @@ int main(int argc, char *argv[]) { struct kvm_cpuid_entry2 *cpuid; - if (!kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM) && - !kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { + if (!kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM) && + !kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { print_skip("Capabilities not available"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c 
b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 01d491f849c2..078bd7a0bbb1 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -51,7 +51,7 @@ int main(void) exit(KSFT_SKIP); } - if (!kvm_check_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)) { + if (!kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)) { print_skip("KVM_CAP_X86_TRIPLE_FAULT_EVENT not supported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index 4a962952212e..fcc713ff75ff 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -93,7 +93,7 @@ static void *run_vcpu(void *_cpu_nr) int main(int argc, char *argv[]) { - if (!kvm_check_cap(KVM_CAP_VM_TSC_CONTROL)) { + if (!kvm_has_cap(KVM_CAP_VM_TSC_CONTROL)) { print_skip("KVM_CAP_VM_TSC_CONTROL not available"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index 647a4320d3bc..190af8124677 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -118,7 +118,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) static void tsc_scaling_check_supported(void) { - if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) { + if (!kvm_has_cap(KVM_CAP_TSC_CONTROL)) { print_skip("TSC scaling not supported by the HW"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index b775a11ec08b..7438258511da 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -169,7 +169,7 @@ int main(int argc, char *argv[]) */ nested_vmx_check_supported(); - if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) { + if (!kvm_has_cap(KVM_CAP_NESTED_STATE)) { print_skip("KVM_CAP_NESTED_STATE not supported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index ba783ceb007f..21f280a7c5e1 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -267,7 +267,7 @@ int main(int argc, char *argv[]) have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); - if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) { + if (!kvm_has_cap(KVM_CAP_NESTED_STATE)) { print_skip("KVM_CAP_NESTED_STATE not available"); exit(KSFT_SKIP); } -- cgit v1.2.3 From 7ed397d107d461a53e350e5025d68ec3c4a8934d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 May 2022 16:24:02 -0700 Subject: KVM: selftests: Add TEST_REQUIRE macros to reduce skipping copy+paste Add TEST_REQUIRE() and __TEST_REQUIRE() to replace the myriad open coded instances of selftests exiting with KSFT_SKIP after printing an informational message. In addition to reducing the amount of boilerplate code in selftests, the UPPERCASE macro names make it easier to visually identify a test's requirements. Convert usage that erroneously uses something other than print_skip() and/or "exits" with '0' or some other non-KSFT_SKIP value. Intentionally drop a kvm_vm_free() in aarch64/debug-exceptions.c as part of the conversion. 
All memory and file descriptors are freed on process exit, so the explicit free is superfluous. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 11 +++------- .../selftests/kvm/aarch64/debug-exceptions.c | 7 ++---- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 10 +++++---- tools/testing/selftests/kvm/aarch64/psci_test.c | 5 +---- .../selftests/kvm/aarch64/vcpu_width_config.c | 5 +---- tools/testing/selftests/kvm/aarch64/vgic_init.c | 10 +++------ tools/testing/selftests/kvm/aarch64/vgic_irq.c | 5 +---- .../selftests/kvm/access_tracking_perf_test.c | 11 +++------- tools/testing/selftests/kvm/include/test_util.h | 9 ++++++++ .../testing/selftests/kvm/kvm_binary_stats_test.c | 5 +---- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 6 ++---- tools/testing/selftests/kvm/lib/kvm_util.c | 10 ++------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 8 +++---- tools/testing/selftests/kvm/lib/x86_64/svm.c | 5 +---- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 5 +---- tools/testing/selftests/kvm/rseq_test.c | 13 +++++------ tools/testing/selftests/kvm/s390x/memop.c | 11 ++++------ tools/testing/selftests/kvm/s390x/sync_regs_test.c | 5 ++--- tools/testing/selftests/kvm/steal_time.c | 5 +---- .../selftests/kvm/system_counter_offset_test.c | 8 +++---- tools/testing/selftests/kvm/x86_64/amx_test.c | 23 +++++++------------- .../selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 5 +---- tools/testing/selftests/kvm/x86_64/debug_regs.c | 5 +---- .../selftests/kvm/x86_64/emulator_error_test.c | 5 +---- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 9 +++----- .../selftests/kvm/x86_64/fix_hypercall_test.c | 5 +---- .../selftests/kvm/x86_64/get_msr_index_features.c | 5 +---- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 5 +---- .../testing/selftests/kvm/x86_64/hyperv_svm_test.c | 6 ++---- .../testing/selftests/kvm/x86_64/kvm_clock_test.c | 6 +----- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 5 +---- .../selftests/kvm/x86_64/mmio_warning_test.c | 10 ++------- tools/testing/selftests/kvm/x86_64/mmu_role_test.c | 10 ++------- .../selftests/kvm/x86_64/platform_info_test.c | 7 +----- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 25 +++++----------------- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 5 +---- .../selftests/kvm/x86_64/sev_migrate_tests.c | 19 ++++++---------- .../testing/selftests/kvm/x86_64/sync_regs_test.c | 10 ++------- .../selftests/kvm/x86_64/triple_fault_event_test.c | 10 ++------- .../selftests/kvm/x86_64/tsc_scaling_sync.c | 5 +---- .../vmx_exception_with_invalid_guest_state.c | 6 ++---- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 10 +-------- .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 23 +++++++------------- .../kvm/x86_64/vmx_preemption_timer_test.c | 5 +---- .../kvm/x86_64/vmx_set_nested_state_test.c | 5 +---- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 5 +---- .../testing/selftests/kvm/x86_64/xen_vmcall_test.c | 8 +++---- tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 13 ++++------- 48 files changed, 119 insertions(+), 290 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index ca4c08b4e353..f68019be67c0 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -375,10 +375,7 @@ static struct kvm_vm *test_vm_create(void) ucall_init(vm, NULL); test_init_timer_irq(vm); gic_fd = vgic_v3_setup(vm, nr_vcpus, 
64, GICD_BASE_GPA, GICR_BASE_GPA); - if (gic_fd < 0) { - print_skip("Failed to create vgic-v3"); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); /* Make all the test's cmdline args visible to the guest */ sync_global_to_guest(vm, test_args); @@ -468,10 +465,8 @@ int main(int argc, char *argv[]) if (!parse_args(argc, argv)) exit(KSFT_SKIP); - if (test_args.migration_freq_ms && get_nprocs() < 2) { - print_skip("At least two physical CPUs needed for vCPU migration"); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(!test_args.migration_freq_ms || get_nprocs() >= 2, + "At least two physical CPUs needed for vCPU migration"); vm = test_vm_create(); test_run(vm); diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index c27352b90ccf..b8072b40ccc8 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -259,11 +259,8 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - if (debug_version(vcpu) < 6) { - print_skip("Armv8 debug architecture not supported."); - kvm_vm_free(vm); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(debug_version(vcpu) >= 6, + "Armv8 debug architecture not supported."); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_EC_BRK_INS, guest_sw_bp_handler); diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 8bc10b9acbbc..d287dd2cac0a 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -395,10 +395,12 @@ static void check_supported(struct vcpu_config *c) struct reg_sublist *s; for_each_sublist(c, s) { - if (s->capability && !kvm_has_cap(s->capability)) { - fprintf(stderr, "%s: %s not available, skipping tests\n", config_name(c), s->name); - exit(KSFT_SKIP); - } + if (!s->capability) + continue; + + __TEST_REQUIRE(kvm_has_cap(s->capability), + "%s: %s not available, skipping tests\n", + config_name(c), s->name); } } diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index 7928c62635fd..a889e1cf5e4d 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -192,10 +192,7 @@ static void host_test_system_suspend(void) int main(void) { - if (!kvm_check_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)) { - print_skip("KVM_CAP_ARM_SYSTEM_SUSPEND not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)); host_test_cpu_on(); host_test_system_suspend(); diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c index fff02c442610..80b74c6f152b 100644 --- a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -82,10 +82,7 @@ int main(void) struct kvm_vm *vm; int ret; - if (!kvm_has_cap(KVM_CAP_ARM_EL1_32BIT)) { - print_skip("KVM_CAP_ARM_EL1_32BIT is not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_EL1_32BIT)); /* Get the preferred target type and copy that to init1 for later use */ vm = vm_create_barebones(); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 02402802b163..e8cab9840aa3 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ 
b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -703,13 +703,9 @@ int main(int ac, char **av) } ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (!ret) { - pr_info("Running GIC_v2 tests.\n"); - run_tests(KVM_DEV_TYPE_ARM_VGIC_V2); - return 0; - } + __TEST_REQUIRE(!ret, "No GICv2 nor GICv3 support"); - print_skip("No GICv2 nor GICv3 support"); - exit(KSFT_SKIP); + pr_info("Running GIC_v2 tests.\n"); + run_tests(KVM_DEV_TYPE_ARM_VGIC_V2); return 0; } diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 90dbba61d72a..046ba4fde648 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -768,10 +768,7 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) gic_fd = vgic_v3_setup(vm, 1, nr_irqs, GICD_BASE_GPA, GICR_BASE_GPA); - if (gic_fd < 0) { - print_skip("Failed to create vgic-v3, skipping"); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3, skipping"); vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handlers[args.eoi_split][args.level_sensitive]); diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 1c771378f7f4..1c2749b1481a 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -104,10 +104,7 @@ static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva) return 0; pfn = entry & PAGEMAP_PFN_MASK; - if (!pfn) { - print_skip("Looking up PFNs requires CAP_SYS_ADMIN"); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(pfn, "Looking up PFNs requires CAP_SYS_ADMIN"); return pfn; } @@ -380,10 +377,8 @@ int main(int argc, char *argv[]) } page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); - if (page_idle_fd < 0) { - print_skip("CONFIG_IDLE_PAGE_TRACKING is not enabled"); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(page_idle_fd >= 0, + "CONFIG_IDLE_PAGE_TRACKING is not enabled"); close(page_idle_fd); for_each_guest_mode(run_test, &params); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 99e0dcdc923f..493b2a799a61 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -34,6 +34,15 @@ static inline int _no_printf(const char *format, ...) { return 0; } #endif void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2))); +#define __TEST_REQUIRE(f, fmt, ...)
\ +do { \ + if (!(f)) { \ + print_skip(fmt, ##__VA_ARGS__); \ + exit(KSFT_SKIP); \ + } \ +} while (0) + +#define TEST_REQUIRE(f) __TEST_REQUIRE(f, "Requirement not met: %s", #f) ssize_t test_write(int fd, const void *buf, size_t count); ssize_t test_read(int fd, void *buf, size_t count); diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 8754b78ae785..1baabf955d63 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -213,10 +213,7 @@ int main(int argc, char *argv[]) } /* Check the extension for binary stats */ - if (!kvm_has_cap(KVM_CAP_BINARY_STATS_FD)) { - print_skip("Binary form statistics interface is not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_BINARY_STATS_FD)); /* Create VMs and VCPUs */ vms = malloc(sizeof(vms[0]) * max_vm); diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 3ae0237e96b2..31b3cb24b9a7 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -64,11 +64,9 @@ int main(int argc, char *argv[]) rl.rlim_max = nr_fds_wanted; int r = setrlimit(RLIMIT_NOFILE, &rl); - if (r < 0) { - printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n", + __TEST_REQUIRE(r >= 0, + "RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n", old_rlim_max, nr_fds_wanted); - exit(KSFT_SKIP); - } } else { TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f93b0d9334e5..cca89d9a83ea 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -26,10 +26,7 @@ int open_path_or_exit(const char *path, int flags) int fd; fd = open(path, flags); - if (fd < 0) { - print_skip("%s not available (errno: %d)", path, errno); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(fd >= 0, "%s not available (errno: %d)", path, errno); return fd; } @@ -93,10 +90,7 @@ static void vm_open(struct kvm_vm *vm) { vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR); - if (!kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)) { - print_skip("immediate_exit not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)); vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index a54910adea98..4a7de11d6f37 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -609,14 +609,14 @@ void vm_xsave_req_perm(int bit) kvm_fd = open_kvm_dev_path_or_exit(); rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); close(kvm_fd); + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) exit(KSFT_SKIP); TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); - if (!(bitmask & (1ULL << bit))) - exit(KSFT_SKIP); - if (!is_xfd_supported()) - exit(KSFT_SKIP); + TEST_REQUIRE(bitmask & (1ULL << bit)); + + TEST_REQUIRE(is_xfd_supported()); rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 01a9d831da13..37e9c0a923e0 100644 --- 
a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -174,10 +174,7 @@ bool nested_svm_supported(void) void nested_svm_check_supported(void) { - if (!nested_svm_supported()) { - print_skip("nested SVM not enabled"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(nested_svm_supported()); } /* diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 2cbfe9962fb1..381432741df4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -391,10 +391,7 @@ bool nested_vmx_supported(void) void nested_vmx_check_supported(void) { - if (!nested_vmx_supported()) { - print_skip("nested VMX not enabled"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(nested_vmx_supported()); } static void nested_create_pte(struct kvm_vm *vm, diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 68c0c8bb206e..aba7be178dab 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -171,12 +171,11 @@ static void *migration_worker(void *ign) return NULL; } -static int calc_min_max_cpu(void) +static void calc_min_max_cpu(void) { int i, cnt, nproc; - if (CPU_COUNT(&possible_mask) < 2) - return -EINVAL; + TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2); /* * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that @@ -198,7 +197,8 @@ static int calc_min_max_cpu(void) cnt++; } - return (cnt < 2) ? -EINVAL : 0; + __TEST_REQUIRE(cnt >= 2, + "Only one usable CPU, task migration not possible"); } int main(int argc, char *argv[]) @@ -215,10 +215,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, strerror(errno)); - if (calc_min_max_cpu()) { - print_skip("Only one usable CPU, task migration not possible"); - exit(KSFT_SKIP); - } + calc_min_max_cpu(); sys_rseq(0); diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 22b0bb785591..a8f9b74b9144 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -756,20 +756,17 @@ struct testdef { int main(int argc, char *argv[]) { - int memop_cap, extension_cap, idx; + int extension_cap, idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_MEM_OP)); setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ ksft_print_header(); - memop_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP); - extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); - if (!memop_cap) { - ksft_exit_skip("CAP_S390_MEM_OP not supported.\n"); - } - ksft_set_plan(ARRAY_SIZE(testlist)); + extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { if (testlist[idx].extension >= extension_cap) { testlist[idx].test(); diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 4b2eb5edde5e..b69710822c47 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -229,14 +229,13 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; int idx; + TEST_REQUIRE(kvm_check_cap(KVM_CAP_SYNC_REGS)); + /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); ksft_print_header(); - if (!kvm_check_cap(KVM_CAP_SYNC_REGS)) - ksft_exit_skip("CAP_SYNC_REGS not supported"); - ksft_set_plan(ARRAY_SIZE(testlist)); /* Create VM */ diff --git 
a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 398819d4074f..d122f1e05cdd 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -271,10 +271,7 @@ int main(int ac, char **av) virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); ucall_init(vm, NULL); - if (!is_steal_time_supported(vcpus[0])) { - print_skip("steal-time not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(is_steal_time_supported(vcpus[0])); /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index 7c8be0930737..862a8e93e070 100644 --- a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -28,11 +28,9 @@ static struct test_case test_cases[] = { static void check_preconditions(struct kvm_vcpu *vcpu) { - if (!__vcpu_has_device_attr(vcpu, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET)) - return; - - print_skip("KVM_VCPU_TSC_OFFSET not supported; skipping test"); - exit(KSFT_SKIP); + __TEST_REQUIRE(!__vcpu_has_device_attr(vcpu, KVM_VCPU_TSC_CTRL, + KVM_VCPU_TSC_OFFSET), + "KVM_VCPU_TSC_OFFSET not supported; skipping test"); } static void setup_system_counter(struct kvm_vcpu *vcpu, struct test_case *test) diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index b421c8369dba..dab4ca16a2df 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -317,7 +317,6 @@ int main(int argc, char *argv[]) { struct kvm_cpuid_entry2 *entry; struct kvm_regs regs1, regs2; - bool amx_supported = false; struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; @@ -334,21 +333,15 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); entry = kvm_get_supported_cpuid_entry(1); - if (!(entry->ecx & X86_FEATURE_XSAVE)) { - print_skip("XSAVE feature not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(entry->ecx & X86_FEATURE_XSAVE); - if (kvm_get_cpuid_max_basic() >= 0xd) { - entry = kvm_get_supported_cpuid_index(0xd, 0); - amx_supported = entry && !!(entry->eax & XFEATURE_MASK_XTILE); - if (!amx_supported) { - print_skip("AMX is not supported by the vCPU (eax=0x%x)", entry->eax); - exit(KSFT_SKIP); - } - /* Get xsave/restore max size */ - xsave_restore_size = entry->ecx; - } + TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xd); + + entry = kvm_get_supported_cpuid_index(0xd, 0); + TEST_REQUIRE(entry->eax & XFEATURE_MASK_XTILE); + + /* Get xsave/restore max size */ + xsave_restore_size = entry->ecx; run = vcpu->run; vcpu_regs_get(vcpu, &regs1); diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 1635aae970e9..a80940ac420f 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -70,10 +70,7 @@ int main(int argc, char *argv[]) struct ucall uc; entry = kvm_get_supported_cpuid_entry(1); - if (!(entry->ecx & X86_FEATURE_XSAVE)) { - print_skip("XSAVE feature not supported"); - return 0; - } + TEST_REQUIRE(entry->ecx & X86_FEATURE_XSAVE); /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index bba811edef96..7ef99c3359a0 100644 ---
a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -95,10 +95,7 @@ int main(void) 1, /* cli */ }; - if (!kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)) { - print_skip("KVM_CAP_SET_GUEST_DEBUG not supported"); - return 0; - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index 119bcb1158d5..bfff2d271c48 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -162,10 +162,7 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - if (!kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)) { - printf("module parameter 'allow_smaller_maxphyaddr' is not set. Skipping test.\n"); - return 0; - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index a6da1ccbee4e..8dda527cc080 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -208,12 +208,9 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - if (!nested_vmx_supported() || - !kvm_has_cap(KVM_CAP_NESTED_STATE) || - !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { - print_skip("Enlightened VMCS is unsupported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(nested_vmx_supported()); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); vcpu_set_hv_cpuid(vcpu); vcpu_enable_evmcs(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index 137759547720..f6f251ce59e1 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -156,10 +156,7 @@ static void test_fix_hypercall_disabled(void) int main(void) { - if (!(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) { - print_skip("KVM_X86_QUIRK_HYPERCALL_INSN not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN); test_fix_hypercall(); test_fix_hypercall_disabled(); diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c index 1e366fdfe7be..d09b3cbcadc6 100644 --- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -25,10 +25,7 @@ int main(int argc, char *argv[]) * will cover the "regular" list of MSRs, the coverage here is purely * opportunistic and not interesting on its own. 
*/ - if (!kvm_check_cap(KVM_CAP_GET_MSR_FEATURES)) { - print_skip("KVM_CAP_GET_MSR_FEATURES not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GET_MSR_FEATURES)); (void)kvm_get_msr_index_list(); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index e2fac752d354..cbd4a7d36189 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -137,10 +137,7 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - if (!kvm_has_cap(KVM_CAP_HYPERV_CPUID)) { - print_skip("KVM_CAP_HYPERV_CPUID not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index 171009184c3b..c5cd9835dbd6 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -127,10 +127,8 @@ int main(int argc, char *argv[]) struct ucall uc; int stage; - if (!nested_svm_supported()) { - print_skip("Nested SVM not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(nested_svm_supported()); + /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_set_hv_cpuid(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 6e3c4bd60b76..138455575a11 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -181,11 +181,7 @@ int main(void) int flags; flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); - if (!(flags & KVM_CLOCK_REALTIME)) { - print_skip("KVM_CLOCK_REALTIME not supported; flags: %x", - flags); - exit(KSFT_SKIP); - } + TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); check_clocksource(); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 24dad3a47206..5901ccec7079 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -204,10 +204,7 @@ int main(void) struct kvm_vcpu *vcpu; struct kvm_vm *vm; - if (!kvm_has_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) { - print_skip("KVM_CAP_ENFORCE_PV_FEATURE_CPUID not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)); vm = vm_create_with_one_vcpu(&vcpu, guest_main); diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index 31ae837fedb1..0e4590afd0e1 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -93,15 +93,9 @@ int main(void) { int warnings_before, warnings_after; - if (!is_intel_cpu()) { - print_skip("Must be run on an Intel CPU"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(is_intel_cpu()); - if (vm_is_unrestricted_guest(NULL)) { - print_skip("Unrestricted guest must be disabled"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(!vm_is_unrestricted_guest(NULL)); warnings_before = get_warnings_count(); diff --git a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c index 1404dfda2e9f..383fff2c9587 100644 --- a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c @@ -117,16 
+117,10 @@ int main(int argc, char *argv[]) } } - if (!do_gbpages && !do_maxphyaddr) { - print_skip("No sub-tests selected"); - return 0; - } + __TEST_REQUIRE(do_gbpages || do_maxphyaddr, "No sub-tests selected"); entry = kvm_get_supported_cpuid_entry(0x80000001); - if (!(entry->edx & CPUID_GBPAGES)) { - print_skip("1gb hugepages not supported"); - return 0; - } + TEST_REQUIRE(entry->edx & CPUID_GBPAGES); if (do_gbpages) { pr_info("Test MMIO after toggling CPUID.GBPAGES\n\n"); diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 3cb48e4b615b..76417c7d687b 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -70,17 +70,12 @@ int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; - int rv; uint64_t msr_platform_info; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); - if (!rv) { - print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 07fa68a1fdc4..656fb2227a81 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -443,39 +443,24 @@ static bool use_amd_pmu(void) int main(int argc, char *argv[]) { - void (*guest_code)(void) = NULL; + void (*guest_code)(void); struct kvm_vcpu *vcpu; struct kvm_vm *vm; - int r; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER); - if (!r) { - print_skip("KVM_CAP_PMU_EVENT_FILTER not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER)); - if (use_intel_pmu()) - guest_code = intel_guest_code; - else if (use_amd_pmu()) - guest_code = amd_guest_code; - - if (!guest_code) { - print_skip("Don't know how to test this guest PMU"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); + guest_code = use_intel_pmu() ? 
intel_guest_code : amd_guest_code; vm = vm_create_with_one_vcpu(&vcpu, guest_code); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - if (!sanity_check_pmu(vcpu)) { - print_skip("Guest PMU is not functional"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(sanity_check_pmu(vcpu)); if (use_amd_pmu()) test_amd_deny_list(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index abf740f08d68..7ef713fdd0a5 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -123,10 +123,7 @@ static void check_set_bsp_busy(void) int main(int argc, char *argv[]) { - if (!kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)) { - print_skip("set_boot_cpu_id not available"); - return 0; - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)); run_vm_bsp(0); run_vm_bsp(1); diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index ffd8613987ae..76ba6fc80e37 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -400,22 +400,15 @@ int main(int argc, char *argv[]) { struct kvm_cpuid_entry2 *cpuid; - if (!kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM) && - !kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { - print_skip("Capabilities not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)); cpuid = kvm_get_supported_cpuid_entry(0x80000000); - if (cpuid->eax < 0x8000001f) { - print_skip("AMD memory encryption not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(cpuid->eax >= 0x8000001f); + cpuid = kvm_get_supported_cpuid_entry(0x8000001f); - if (!(cpuid->eax & X86_FEATURE_SEV)) { - print_skip("AMD SEV not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(cpuid->eax & X86_FEATURE_SEV); + have_sev_es = !!(cpuid->eax & X86_FEATURE_SEV_ES); if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index 773db9d4f228..9b6db0b0b13e 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -94,14 +94,8 @@ int main(int argc, char *argv[]) setbuf(stdout, NULL); cap = kvm_check_cap(KVM_CAP_SYNC_REGS); - if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) { - print_skip("KVM_CAP_SYNC_REGS not supported"); - exit(KSFT_SKIP); - } - if ((cap & INVALID_SYNC_FIELD) != 0) { - print_skip("The \"invalid\" field is not invalid"); - exit(KSFT_SKIP); - } + TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS); + TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 078bd7a0bbb1..5a202ecb8ea0 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -46,15 +46,9 @@ int main(void) vm_vaddr_t vmx_pages_gva; struct ucall uc; - if (!nested_vmx_supported()) { - print_skip("Nested VMX not supported"); - exit(KSFT_SKIP); - } + nested_vmx_check_supported(); - if (!kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)) { - print_skip("KVM_CAP_X86_TRIPLE_FAULT_EVENT not supported"); - exit(KSFT_SKIP); - } + 
TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1); diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index fcc713ff75ff..47139aab7408 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -93,10 +93,7 @@ static void *run_vcpu(void *_cpu_nr) int main(int argc, char *argv[]) { - if (!kvm_has_cap(KVM_CAP_VM_TSC_CONTROL)) { - print_skip("KVM_CAP_VM_TSC_CONTROL not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_TSC_CONTROL)); vm = vm_create(NR_TEST_VCPUS); vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c index 5bc2cee0d613..2641b286b4ed 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -111,10 +111,8 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; - if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) { - print_skip("Must be run with kvm_intel.unrestricted_guest=0"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(is_intel_cpu()); + TEST_REQUIRE(!vm_is_unrestricted_guest(NULL)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); get_set_sigalrm_vcpu(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index 190af8124677..ff4644038c55 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -116,14 +116,6 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } -static void tsc_scaling_check_supported(void) -{ - if (!kvm_has_cap(KVM_CAP_TSC_CONTROL)) { - print_skip("TSC scaling not supported by the HW"); - exit(KSFT_SKIP); - } -} - static void stable_tsc_check_supported(void) { FILE *fp; @@ -159,7 +151,7 @@ int main(int argc, char *argv[]) uint64_t l2_tsc_freq = 0; nested_vmx_check_supported(); - tsc_scaling_check_supported(); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); stable_tsc_check_supported(); /* diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index a308442458b8..eb592fae44ef 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -57,7 +57,6 @@ int main(int argc, char *argv[]) struct kvm_cpuid2 *cpuid; struct kvm_cpuid_entry2 *entry_1_0; struct kvm_cpuid_entry2 *entry_a_0; - bool pdcm_supported = false; struct kvm_vm *vm; struct kvm_vcpu *vcpu; int ret; @@ -71,20 +70,14 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); cpuid = kvm_get_supported_cpuid(); - if (kvm_get_cpuid_max_basic() >= 0xa) { - entry_1_0 = kvm_get_supported_cpuid_index(1, 0); - entry_a_0 = kvm_get_supported_cpuid_index(0xa, 0); - pdcm_supported = entry_1_0 && !!(entry_1_0->ecx & X86_FEATURE_PDCM); - eax.full = entry_a_0->eax; - } - if (!pdcm_supported) { - print_skip("MSR_IA32_PERF_CAPABILITIES is not supported by the vCPU"); - exit(KSFT_SKIP); - } - if (!eax.split.version_id) { - print_skip("PMU is not supported by the vCPU"); 
- exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xa); + + entry_1_0 = kvm_get_supported_cpuid_index(1, 0); + entry_a_0 = kvm_get_supported_cpuid_index(0xa, 0); + TEST_REQUIRE(entry_1_0->ecx & X86_FEATURE_PDCM); + + eax.full = entry_a_0->eax; + __TEST_REQUIRE(eax.split.version_id, "PMU is not supported by the vCPU"); /* testcase 1, set capabilities when we have PDCM bit */ vcpu_set_cpuid(vcpu, cpuid); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index 7438258511da..99e57b0cc2c9 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -169,10 +169,7 @@ int main(int argc, char *argv[]) */ nested_vmx_check_supported(); - if (!kvm_has_cap(KVM_CAP_NESTED_STATE)) { - print_skip("KVM_CAP_NESTED_STATE not supported"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 21f280a7c5e1..b564b86dfc1d 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -267,10 +267,7 @@ int main(int argc, char *argv[]) have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); - if (!kvm_has_cap(KVM_CAP_NESTED_STATE)) { - print_skip("KVM_CAP_NESTED_STATE not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); /* * AMD currently does not implement set_nested_state, so for now we diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 4340c2f2300f..bdcb28186ccc 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -362,10 +362,7 @@ int main(int argc, char *argv[]) !strncmp(argv[1], "--verbose", 10)); int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); - if (!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO) ) { - print_skip("KVM_XEN_HVM_CONFIG_SHARED_INFO not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO); bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index a91f11fb26f4..8b76cade9bcd 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -80,14 +80,12 @@ static void guest_code(void) int main(int argc, char *argv[]) { + unsigned int xen_caps; struct kvm_vcpu *vcpu; struct kvm_vm *vm; - if (!(kvm_check_cap(KVM_CAP_XEN_HVM) & - KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) ) { - print_skip("KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL not available"); - exit(KSFT_SKIP); - } + xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_set_hv_cpuid(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 1e3506c3deed..4e2e08059b95 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ 
-19,7 +19,6 @@ int main(int argc, char *argv[]) { struct kvm_cpuid_entry2 *entry; - bool xss_supported = false; bool xss_in_msr_list; struct kvm_vm *vm; struct kvm_vcpu *vcpu; @@ -29,14 +28,10 @@ int main(int argc, char *argv[]) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, NULL); - if (kvm_get_cpuid_max_basic() >= 0xd) { - entry = kvm_get_supported_cpuid_index(0xd, 1); - xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES); - } - if (!xss_supported) { - print_skip("IA32_XSS is not supported by the vCPU"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xd); + + entry = kvm_get_supported_cpuid_index(0xd, 1); + TEST_REQUIRE(entry->eax & X86_FEATURE_XSAVES); xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS); TEST_ASSERT(xss_val == 0, -- cgit v1.2.3 From 5321270b2362a85a74c3d52c00c3c6730a228f0c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jun 2022 17:03:19 -0700 Subject: KVM: selftests: Use TAP-friendly ksft_exit_skip() in __TEST_REQUIRE Use the TAP-friendly ksft_exit_skip() instead of KVM's custom print_skip() when skipping a test via __TEST_REQUIRE. KVM's "skipping test" has no known benefit, whereas some setups rely on TAP output. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/test_util.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 493b2a799a61..5c5a88180b6c 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -34,12 +34,10 @@ static inline int _no_printf(const char *format, ...) { return 0; } #endif void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2))); -#define __TEST_REQUIRE(f, fmt, ...) \ -do { \ - if (!(f)) { \ - print_skip(fmt, ##__VA_ARGS__); \ - exit(KSFT_SKIP); \ - } \ +#define __TEST_REQUIRE(f, fmt, ...) \ +do { \ + if (!(f)) \ + ksft_exit_skip("- " fmt "\n", ##__VA_ARGS__); \ } while (0) #define TEST_REQUIRE(f) __TEST_REQUIRE(f, "Requirement not met: %s", #f) -- cgit v1.2.3 From fcba483e82462830dd368951c0df03a95676f34d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 11:01:58 -0700 Subject: KVM: selftests: Sanity check input to ioctls() at build time Add a static assert to the KVM/VM/vCPU ioctl() helpers to verify that the size of the argument provided matches the expected size of the IOCTL. Because ioctl() ultimately takes a "void *", it's all too easy to pass in garbage and not detect the error until runtime. E.g. while working on a CPUID rework, selftests happily compiled when vcpu_set_cpuid() unintentionally passed the cpuid() function as the parameter to ioctl() (a local "cpuid" parameter was removed, but its use was not replaced with "vcpu->cpuid" as intended). Tweak a variety of benign issues that aren't compatible with the sanity check, e.g. passing a non-pointer for ioctls(). Note, static_assert() requires a string on older versions of GCC. Feed it an empty string to make the compiler happy. 
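As a sketch of what the assert buys, consider KVM_GET_REGS, which is defined via _IOR(KVMIO, 0x81, struct kvm_regs), i.e. _IOC_SIZE(KVM_GET_REGS) equals sizeof(struct kvm_regs). With the size check in place, a mismatched argument (a hypothetical misuse, not taken from the patch) no longer compiles:

	struct kvm_sregs sregs;
	struct kvm_regs regs;

	vcpu_ioctl(vcpu, KVM_GET_REGS, &sregs);	/* build error: wrong size */
	vcpu_ioctl(vcpu, KVM_GET_REGS, &regs);	/* OK: sizes match */

Because the check evaluates sizeof(*arg), the argument must be a pointer; that is why the conversion below changes integer arguments to NULL or to casts such as (void *)cap for KVM_CHECK_EXTENSION.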
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 59 ++++++++++++++++------ .../testing/selftests/kvm/lib/aarch64/processor.c | 2 +- tools/testing/selftests/kvm/lib/guest_modes.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 29 +---------- .../selftests/kvm/lib/x86_64/perf_test_util.c | 6 +-- tools/testing/selftests/kvm/s390x/resets.c | 6 +-- .../selftests/kvm/x86_64/mmio_warning_test.c | 2 +- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 2 +- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 6 +-- 9 files changed, 58 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 04ddab322b6b..cdaea2383543 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -180,29 +180,56 @@ static inline bool kvm_has_cap(long cap) #define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) #define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) -#define __kvm_ioctl(kvm_fd, cmd, arg) \ - ioctl(kvm_fd, cmd, arg) +#define kvm_do_ioctl(fd, cmd, arg) \ +({ \ + static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd), ""); \ + ioctl(fd, cmd, arg); \ +}) -static inline void _kvm_ioctl(int kvm_fd, unsigned long cmd, const char *name, - void *arg) -{ - int ret = __kvm_ioctl(kvm_fd, cmd, arg); +#define __kvm_ioctl(kvm_fd, cmd, arg) \ + kvm_do_ioctl(kvm_fd, cmd, arg) - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); -} + +#define _kvm_ioctl(kvm_fd, cmd, name, arg) \ +({ \ + int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ +}) #define kvm_ioctl(kvm_fd, cmd, arg) \ _kvm_ioctl(kvm_fd, cmd, #cmd, arg) -int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); -void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg); -#define vm_ioctl(vm, cmd, arg) _vm_ioctl(vm, cmd, #cmd, arg) +#define __vm_ioctl(vm, cmd, arg) \ +({ \ + static_assert(sizeof(*(vm)) == sizeof(struct kvm_vm), ""); \ + kvm_do_ioctl((vm)->fd, cmd, arg); \ +}) + +#define _vm_ioctl(vm, cmd, name, arg) \ +({ \ + int ret = __vm_ioctl(vm, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ +}) + +#define vm_ioctl(vm, cmd, arg) \ + _vm_ioctl(vm, cmd, #cmd, arg) + +#define __vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + static_assert(sizeof(*(vcpu)) == sizeof(struct kvm_vcpu), ""); \ + kvm_do_ioctl((vcpu)->fd, cmd, arg); \ +}) + +#define _vcpu_ioctl(vcpu, cmd, name, arg) \ +({ \ + int ret = __vcpu_ioctl(vcpu, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ +}) -int __vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, - void *arg); -void _vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, - const char *name, void *arg); -#define vcpu_ioctl(vcpu, cmd, arg) \ +#define vcpu_ioctl(vcpu, cmd, arg) \ _vcpu_ioctl(vcpu, cmd, #cmd, arg) /* diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 6bd27782f00c..6f5551368944 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -472,7 +472,7 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, }; kvm_fd = open_kvm_dev_path_or_exit(); - vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, ipa); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, (void *)(unsigned long)ipa); 
TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c index 0be56c63aed6..99a575bbbc52 100644 --- a/tools/testing/selftests/kvm/lib/guest_modes.c +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -65,7 +65,7 @@ void guest_modes_append_default(void) struct kvm_s390_vm_cpu_processor info; kvm_fd = open_kvm_dev_path_or_exit(); - vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, 0); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, NULL); kvm_device_attr_get(vm_fd, KVM_S390_VM_CPU_MODEL, KVM_S390_VM_CPU_PROCESSOR, &info); close(vm_fd); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index cca89d9a83ea..39f2f5f1338f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -72,7 +72,7 @@ unsigned int kvm_check_cap(long cap) int kvm_fd; kvm_fd = open_kvm_dev_path_or_exit(); - ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); + ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap); TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); close(kvm_fd); @@ -92,7 +92,7 @@ static void vm_open(struct kvm_vm *vm) TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)); - vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); + vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type); TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); } @@ -1450,19 +1450,6 @@ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) return reg_list; } -int __vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, void *arg) -{ - return ioctl(vcpu->fd, cmd, arg); -} - -void _vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long cmd, const char *name, - void *arg) -{ - int ret = __vcpu_ioctl(vcpu, cmd, arg); - - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); -} - void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) { uint32_t page_size = vcpu->vm->page_size; @@ -1492,18 +1479,6 @@ void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) return vcpu->dirty_gfns; } -int __vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) -{ - return ioctl(vm->fd, cmd, arg); -} - -void _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, const char *name, void *arg) -{ - int ret = __vm_ioctl(vm, cmd, arg); - - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); -} - /* * Device Ioctl */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c index 446820a549ba..bfe85c8c2f6e 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c @@ -103,9 +103,9 @@ void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc * Override the vCPU to run perf_test_l1_guest_code() which will * bounce it into L2 before calling perf_test_guest_code(). 
*/ - vcpu_regs_get(vm, vcpus[vcpu_id]->id, ®s); + vcpu_regs_get(vcpus[vcpu_id], ®s); regs.rip = (unsigned long) perf_test_l1_guest_code; - vcpu_regs_set(vm, vcpus[vcpu_id]->id, ®s); - vcpu_args_set(vm, vcpus[vcpu_id]->id, 2, vmx_gva, vcpu_id); + vcpu_regs_set(vcpus[vcpu_id], ®s); + vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); } } diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index e1737411accc..19486084eb30 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -225,7 +225,7 @@ static void test_normal(void) inject_irq(vcpu); - vcpu_ioctl(vcpu, KVM_S390_NORMAL_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_NORMAL_RESET, NULL); /* must clears */ assert_normal(vcpu); @@ -248,7 +248,7 @@ static void test_initial(void) inject_irq(vcpu); - vcpu_ioctl(vcpu, KVM_S390_INITIAL_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_INITIAL_RESET, NULL); /* must clears */ assert_normal(vcpu); @@ -271,7 +271,7 @@ static void test_clear(void) inject_irq(vcpu); - vcpu_ioctl(vcpu, KVM_S390_CLEAR_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_CLEAR_RESET, NULL); /* must clears */ assert_normal(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index 0e4590afd0e1..fb02581953a3 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -59,7 +59,7 @@ void test(void) kvm = open("/dev/kvm", O_RDWR); TEST_ASSERT(kvm != -1, "failed to open /dev/kvm"); - kvmvm = __kvm_ioctl(kvm, KVM_CREATE_VM, 0); + kvmvm = __kvm_ioctl(kvm, KVM_CREATE_VM, NULL); TEST_ASSERT(kvmvm > 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, kvmvm)); kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0); TEST_ASSERT(kvmcpu != -1, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, kvmcpu)); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 656fb2227a81..786b3a794f84 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -276,7 +276,7 @@ static void test_without_filter(struct kvm_vcpu *vcpu) static uint64_t test_with_filter(struct kvm_vcpu *vcpu, struct kvm_pmu_event_filter *f) { - vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, (void *)f); + vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); return run_vcpu_to_sync(vcpu); } diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index bdcb28186ccc..a4a78637c35a 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -472,7 +472,7 @@ int main(int argc, char *argv[]) irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id; irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; - vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes); + vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info); struct kvm_irqfd ifd = { }; @@ -716,7 +716,7 @@ int main(int argc, char *argv[]) if (verbose) printf("Testing restored oneshot timer\n"); - tmr.u.timer.expires_ns = rs->state_entry_time + 100000000, + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); evtchn_irq_expected = true; alarm(1); @@ -743,7 +743,7 @@ int main(int argc, char *argv[]) if (verbose) printf("Testing SCHEDOP_poll wake on masked event\n"); - tmr.u.timer.expires_ns = 
rs->state_entry_time + 100000000, + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); alarm(1); break; -- cgit v1.2.3 From 89eda98428ce10f8df110d60aa934aa5c5170686 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Fri, 10 Jun 2022 10:33:08 +0800 Subject: selftest/bpf/benchs: Add bpf_map benchmark Add benchmark for hash_map to reproduce the worst case that non-stop update when map's free is zero. Just like this: ./run_bench_bpf_hashmap_full_update.sh Setting up benchmark 'bpf-hashmap-ful-update'... Benchmark 'bpf-hashmap-ful-update' started. 1:hash_map_full_perf 555830 events per sec ... Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220610023308.93798-3-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 2 + .../bpf/benchs/bench_bpf_hashmap_full_update.c | 96 ++++++++++++++++++++++ .../benchs/run_bench_bpf_hashmap_full_update.sh | 11 +++ .../bpf/progs/bpf_hashmap_full_update_bench.c | 40 +++++++++ 5 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh create mode 100644 tools/testing/selftests/bpf/progs/bpf_hashmap_full_update_bench.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2d3c8c8f558a..8ad7a733a505 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -560,6 +560,7 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h +$(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -571,7 +572,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_ringbufs.o \ $(OUTPUT)/bench_bloom_filter_map.o \ $(OUTPUT)/bench_bpf_loop.o \ - $(OUTPUT)/bench_strncmp.o + $(OUTPUT)/bench_strncmp.o \ + $(OUTPUT)/bench_bpf_hashmap_full_update.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index f061cc20e776..d8aa62be996b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -396,6 +396,7 @@ extern const struct bench bench_hashmap_with_bloom; extern const struct bench bench_bpf_loop; extern const struct bench bench_strncmp_no_helper; extern const struct bench bench_strncmp_helper; +extern const struct bench bench_bpf_hashmap_full_update; static const struct bench *benchs[] = { &bench_count_global, @@ -430,6 +431,7 @@ static const struct bench *benchs[] = { &bench_bpf_loop, &bench_strncmp_no_helper, &bench_strncmp_helper, + &bench_bpf_hashmap_full_update, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c new file mode 100644 index 000000000000..cec51e0ff4b8 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 
Bytedance */ + +#include +#include "bench.h" +#include "bpf_hashmap_full_update_bench.skel.h" +#include "bpf_util.h" + +/* BPF triggering benchmarks */ +static struct ctx { + struct bpf_hashmap_full_update_bench *skel; +} ctx; + +#define MAX_LOOP_NUM 10000 + +static void validate(void) +{ + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } +} + +static void *producer(void *input) +{ + while (true) { + /* trigger the bpf program */ + syscall(__NR_getpgid); + } + + return NULL; +} + +static void *consumer(void *input) +{ + return NULL; +} + +static void measure(struct bench_res *res) +{ +} + +static void setup(void) +{ + struct bpf_link *link; + int map_fd, i, max_entries; + + setup_libbpf(); + + ctx.skel = bpf_hashmap_full_update_bench__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + ctx.skel->bss->nr_loops = MAX_LOOP_NUM; + + link = bpf_program__attach(ctx.skel->progs.benchmark); + if (!link) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } + + /* fill hash_map */ + map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench); + max_entries = bpf_map__max_entries(ctx.skel->maps.hash_map_bench); + for (i = 0; i < max_entries; i++) + bpf_map_update_elem(map_fd, &i, &i, BPF_ANY); +} + +void hashmap_report_final(struct bench_res res[], int res_cnt) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + int i; + + for (i = 0; i < nr_cpus; i++) { + u64 time = ctx.skel->bss->percpu_time[i]; + + if (!time) + continue; + + printf("%d:hash_map_full_perf %lld events per sec\n", + i, ctx.skel->bss->nr_loops * 1000000000ll / time); + } +} + +const struct bench bench_bpf_hashmap_full_update = { + .name = "bpf-hashmap-ful-update", + .validate = validate, + .setup = setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = NULL, + .report_final = hashmap_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh b/tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh new file mode 100755 index 000000000000..1e2de838f9fa --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./benchs/run_common.sh + +set -eufo pipefail + +nr_threads=`expr $(cat /proc/cpuinfo | grep "processor"| wc -l) - 1` +summary=$($RUN_BENCH -p $nr_threads bpf-hashmap-ful-update) +printf "$summary" +printf "\n" diff --git a/tools/testing/selftests/bpf/progs/bpf_hashmap_full_update_bench.c b/tools/testing/selftests/bpf/progs/bpf_hashmap_full_update_bench.c new file mode 100644 index 000000000000..56957557e3e1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_hashmap_full_update_bench.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Bytedance */ + +#include "vmlinux.h" +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +#define MAX_ENTRIES 1000 + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} hash_map_bench SEC(".maps"); + +u64 __attribute__((__aligned__(256))) percpu_time[256]; +u64 nr_loops; + +static int loop_update_callback(__u32 index, u32 *key) +{ + u64 init_val = 1; + + bpf_map_update_elem(&hash_map_bench, key, &init_val, BPF_ANY); + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int benchmark(void *ctx) +{ + u32 cpu = 
bpf_get_smp_processor_id(); + u32 key = cpu + MAX_ENTRIES; + u64 start_time = bpf_ktime_get_ns(); + + bpf_loop(nr_loops, loop_update_callback, &key, 0); + percpu_time[cpu & 255] = bpf_ktime_get_ns() - start_time; + return 0; +} -- cgit v1.2.3 From 41ecad2c3cce807abf32bff879cef5dfcacae363 Mon Sep 17 00:00:00 2001 From: David Fries Date: Mon, 13 Jun 2022 00:39:41 -0500 Subject: spi: spidev_test: Warn when the mode is not the requested mode Print a warning if the device mode doesn't match the requested mode. The user doesn't enter the mode in hex so it isn't obvious when setting the mode succeeds that the mode isn't the requested mode. The kernel logs a message, it will be more visible if the test also prints a warning. I was testing --quad, which is unsupported, but doesn't cause the mode request to fail. Signed-off-by: David Fries Link: https://lore.kernel.org/r/YqbNnSGtWHe/GG7w@spacedout.fries.net Signed-off-by: Mark Brown --- tools/spi/spidev_test.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/spi/spidev_test.c b/tools/spi/spidev_test.c index 83844f8b862a..b0ca44c70e83 100644 --- a/tools/spi/spidev_test.c +++ b/tools/spi/spidev_test.c @@ -417,6 +417,7 @@ int main(int argc, char *argv[]) { int ret = 0; int fd; + uint32_t request; parse_opts(argc, argv); @@ -430,13 +431,23 @@ int main(int argc, char *argv[]) /* * spi mode */ + /* WR is make a request to assign 'mode' */ + request = mode; ret = ioctl(fd, SPI_IOC_WR_MODE32, &mode); if (ret == -1) pabort("can't set spi mode"); + /* RD is read what mode the device actually is in */ ret = ioctl(fd, SPI_IOC_RD_MODE32, &mode); if (ret == -1) pabort("can't get spi mode"); + /* Drivers can reject some mode bits without returning an error. + * Read the current value to identify what mode it is in, and if it + * differs from the requested mode, warn the user. + */ + if (request != mode) + printf("WARNING device does not support requested mode 0x%x\n", + request); /* * bits per word -- cgit v1.2.3 From c49a44b39b313a4191ca2f06316c547ee673264e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 12 Jun 2022 22:43:14 -0700 Subject: libbpf: Fix an unsigned < 0 bug Andrii reported a bug with the following information: 2859 if (enum64_placeholder_id == 0) { 2860 enum64_placeholder_id = btf__add_int(btf, "enum64_placeholder", 1, 0); >>> CID 394804: Control flow issues (NO_EFFECT) >>> This less-than-zero comparison of an unsigned value is never true. "enum64_placeholder_id < 0U". 2861 if (enum64_placeholder_id < 0) 2862 return enum64_placeholder_id; 2863 ... Here enum64_placeholder_id declared as '__u32' so enum64_placeholder_id < 0 is always false. Declare enum64_placeholder_id as 'int' in order to capture the potential error properly. 
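The bug class is easy to reproduce outside libbpf; here is a freestanding illustration (names and the -22 value are made up, standing in for btf__add_int()'s negative errno return):

  /* With an unsigned variable the error check is dead code; the
   * negative return value wraps to a huge positive number. */
  #include <stdio.h>

  static int may_fail(void)
  {
          return -22;     /* pretend btf__add_int() failed, e.g. -EINVAL */
  }

  int main(void)
  {
          unsigned int id_unsigned = may_fail(); /* wraps to 4294967274 */
          int id_signed = may_fail();

          if (id_unsigned < 0)    /* always false; compilers warn here */
                  puts("never reached");
          if (id_signed < 0)      /* fires as intended */
                  printf("propagating error %d\n", id_signed);
          return 0;
  }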
Fixes: f2a625889bb8 ("libbpf: Add enum64 sanitization") Reported-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220613054314.1251905-1-yhs@fb.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0781fae58a06..d989b0a17a89 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2786,7 +2786,7 @@ static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); - __u32 enum64_placeholder_id = 0; + int enum64_placeholder_id = 0; struct btf_type *t; int i, j, vlen; -- cgit v1.2.3 From b3b7c6a6e80d8347a4a5a13a25fa314800f2cbbe Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 14 Jun 2022 10:10:41 +0200 Subject: KVM: selftests: kvm_binary_stats_test: Fix index expressions kvm_binary_stats_test accepts two arguments, the number of vms and number of vcpus. If these inputs are not equal then the test would likely crash for one reason or another due to using miscalculated indices for the vcpus array. Fix the index expressions by swapping the use of i and j. Signed-off-by: Andrew Jones Message-Id: <20220614081041.2571511-1-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_binary_stats_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 1baabf955d63..3c2d06b61442 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -225,14 +225,14 @@ int main(int argc, char *argv[]) for (i = 0; i < max_vm; ++i) { vms[i] = vm_create_barebones(); for (j = 0; j < max_vcpu; ++j) - vcpus[j * max_vcpu + i] = __vm_vcpu_add(vms[i], j); + vcpus[i * max_vcpu + j] = __vm_vcpu_add(vms[i], j); } /* Check stats read for every VM and VCPU */ for (i = 0; i < max_vm; ++i) { vm_stats_test(vms[i]); for (j = 0; j < max_vcpu; ++j) - vcpu_stats_test(vcpus[j * max_vcpu + i]); + vcpu_stats_test(vcpus[i * max_vcpu + j]); } for (i = 0; i < max_vm; ++i) -- cgit v1.2.3 From 4f48e2e737451365bc81c3b6283036a9079bcc24 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Jun 2022 16:19:39 +0000 Subject: KVM: selftests: Add a missing apostrophe in comment to show ownership Add an apostrophe in a comment about it being the caller's, not callers, responsibility to free an object. Reported-by: Andrew Jones Fixes: 768e9a61856b ("KVM: selftests: Purge vm+vcpu_id == vcpu silliness") Signed-off-by: Sean Christopherson Message-Id: <20220613161942.1586791-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 39f2f5f1338f..0c550fb0dab2 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1434,7 +1434,7 @@ void vcpu_run_complete_io(struct kvm_vcpu *vcpu) /* * Get the list of guest registers which are supported for * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. 
Returns a kvm_reg_list pointer, - * it is the callers responsibility to free the list. + * it is the caller's responsibility to free the list. */ struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From ad125f309850b9cf2ba4f39729c5cc827595fac4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Jun 2022 16:19:40 +0000 Subject: KVM: selftests: Call a dummy helper in VM/vCPU ioctls() to enforce type Replace the goofy static_assert on the size of the @vm/@vcpu parameters with a call to a dummy helper, i.e. let the compiler naturally complain about an incompatible type instead of homebrewing a poor replacement. Reported-by: Andrew Jones Fixes: fcba483e8246 ("KVM: selftests: Sanity check input to ioctls() at build time") Signed-off-by: Sean Christopherson Message-Id: <20220613161942.1586791-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 57 ++++++++++++---------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index cdaea2383543..7ebfc8c7de17 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -186,50 +186,55 @@ static inline bool kvm_has_cap(long cap) ioctl(fd, cmd, arg); \ }) -#define __kvm_ioctl(kvm_fd, cmd, arg) \ +#define __kvm_ioctl(kvm_fd, cmd, arg) \ kvm_do_ioctl(kvm_fd, cmd, arg) -#define _kvm_ioctl(kvm_fd, cmd, name, arg) \ -({ \ - int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ - \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ +#define _kvm_ioctl(kvm_fd, cmd, name, arg) \ +({ \ + int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ }) #define kvm_ioctl(kvm_fd, cmd, arg) \ _kvm_ioctl(kvm_fd, cmd, #cmd, arg) -#define __vm_ioctl(vm, cmd, arg) \ -({ \ - static_assert(sizeof(*(vm)) == sizeof(struct kvm_vm), ""); \ - kvm_do_ioctl((vm)->fd, cmd, arg); \ +static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } + +#define __vm_ioctl(vm, cmd, arg) \ +({ \ + static_assert_is_vm(vm); \ + kvm_do_ioctl((vm)->fd, cmd, arg); \ }) -#define _vm_ioctl(vm, cmd, name, arg) \ -({ \ - int ret = __vm_ioctl(vm, cmd, arg); \ - \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ +#define _vm_ioctl(vm, cmd, name, arg) \ +({ \ + int ret = __vm_ioctl(vm, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ }) -#define vm_ioctl(vm, cmd, arg) \ +#define vm_ioctl(vm, cmd, arg) \ _vm_ioctl(vm, cmd, #cmd, arg) -#define __vcpu_ioctl(vcpu, cmd, arg) \ -({ \ - static_assert(sizeof(*(vcpu)) == sizeof(struct kvm_vcpu), ""); \ - kvm_do_ioctl((vcpu)->fd, cmd, arg); \ + +static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } + +#define __vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + static_assert_is_vcpu(vcpu); \ + kvm_do_ioctl((vcpu)->fd, cmd, arg); \ }) -#define _vcpu_ioctl(vcpu, cmd, name, arg) \ -({ \ - int ret = __vcpu_ioctl(vcpu, cmd, arg); \ - \ - TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ +#define _vcpu_ioctl(vcpu, cmd, name, arg) \ +({ \ + int ret = __vcpu_ioctl(vcpu, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(name, ret)); \ }) -#define vcpu_ioctl(vcpu, cmd, arg) \ +#define vcpu_ioctl(vcpu, cmd, arg) \ _vcpu_ioctl(vcpu, cmd, #cmd, arg) /* -- cgit v1.2.3 From 96f113c40d2882ac9b5e4fcac9e48c32eb030aa3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Jun 2022 16:19:41 +0000 Subject: 
KVM: selftests: Drop a duplicate TEST_ASSERT() in vm_nr_pages_required() Remove a duplicate TEST_ASSERT() on the number of runnable vCPUs in vm_nr_pages_required() that snuck in during a rebase gone bad. Reported-by: Andrew Jones Fixes: 6e1d13bf3815 ("KVM: selftests: Move per-VM/per-vCPU nr pages calculation to __vm_create()") Signed-off-by: Sean Christopherson Message-Id: <20220613161942.1586791-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 0c550fb0dab2..bceb668f2627 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -284,10 +284,6 @@ static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, */ nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2; - TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), - "Host doesn't support %d vCPUs, max-vcpus = %d", - nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); - return vm_adjust_num_guest_pages(mode, nr_pages); } -- cgit v1.2.3 From 9393cb13fa5d4c1ef2f1c3086af1c2cc03389bad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Jun 2022 16:19:42 +0000 Subject: KVM: selftests: Use kvm_has_cap(), not kvm_check_cap(), where possible Replace calls to kvm_check_cap() that treat its return as a boolean with calls to kvm_has_cap(). Several instances of kvm_check_cap() were missed when kvm_has_cap() was introduced. Reported-by: Andrew Jones Fixes: 3ea9b809650b ("KVM: selftests: Add kvm_has_cap() to provide syntactic sugar") Signed-off-by: Sean Christopherson Message-Id: <20220613161942.1586791-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/aarch64/psci_test.c | 2 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 4 ++-- tools/testing/selftests/kvm/s390x/sync_regs_test.c | 2 +- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 2 +- tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 6 +++--- tools/testing/selftests/kvm/x86_64/smm_test.c | 2 +- tools/testing/selftests/kvm/x86_64/state_test.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index a889e1cf5e4d..b665b534cb78 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -192,7 +192,7 @@ static void host_test_system_suspend(void) int main(void) { - TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)); host_test_cpu_on(); host_test_system_suspend(); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 4a7de11d6f37..906132e70fa4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -991,7 +991,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu) vcpu_regs_get(vcpu, &state->regs); vcpu_save_xsave_state(vcpu, state); - if (kvm_check_cap(KVM_CAP_XCRS)) + if (kvm_has_cap(KVM_CAP_XCRS)) vcpu_xcrs_get(vcpu, &state->xcrs); vcpu_sregs_get(vcpu, &state->sregs); @@ -1022,7 +1022,7 @@ void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) vcpu_sregs_set(vcpu, &state->sregs); vcpu_msrs_set(vcpu, &state->msrs); - if 
(kvm_check_cap(KVM_CAP_XCRS)) + if (kvm_has_cap(KVM_CAP_XCRS)) vcpu_xcrs_set(vcpu, &state->xcrs); vcpu_xsave_set(vcpu, state->xsave); diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index b69710822c47..3fdb6e2598eb 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -229,7 +229,7 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; int idx; - TEST_REQUIRE(kvm_check_cap(KVM_CAP_SYNC_REGS)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 786b3a794f84..530a75fee92c 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -450,7 +450,7 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - TEST_REQUIRE(kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER)); TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code; diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 76ba6fc80e37..46018b247a04 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -411,16 +411,16 @@ int main(int argc, char *argv[]) have_sev_es = !!(cpuid->eax & X86_FEATURE_SEV_ES); - if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { + if (kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { test_sev_migrate_from(/* es= */ false); if (have_sev_es) test_sev_migrate_from(/* es= */ true); test_sev_migrate_locking(); test_sev_migrate_parameters(); - if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) + if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) test_sev_move_copy(); } - if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { + if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { test_sev_mirror(/* es= */ false); if (have_sev_es) test_sev_mirror(/* es= */ true); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 3cd1da388b52..921cbf117329 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -153,7 +153,7 @@ int main(int argc, char *argv[]) vcpu_set_msr(vcpu, MSR_IA32_SMBASE, SMRAM_GPA); - if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { if (nested_svm_supported()) vcpu_alloc_svm(vm, &nested_gva); else if (nested_vmx_supported()) diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 0bcd78cf7c79..e2f1f35e51ff 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -169,7 +169,7 @@ int main(int argc, char *argv[]) vcpu_regs_get(vcpu, ®s1); - if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { if (nested_svm_supported()) vcpu_alloc_svm(vm, &nested_gva); else if (nested_vmx_supported()) -- cgit v1.2.3 From 6b4384ff108874cf336fe2fb1633313c2c7620bf Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 10 Jun 2022 12:26:47 +0100 Subject: Revert "bpftool: Use libbpf 1.0 API mode instead of 
RLIMIT_MEMLOCK" This reverts commit a777e18f1bcd32528ff5dfd10a6629b655b05eb8. In commit a777e18f1bcd ("bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK"), we removed the rlimit bump in bpftool, because the kernel has switched to memcg-based memory accounting. Thanks to the LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK, we attempted to keep compatibility with other systems and ask libbpf to raise the limit for us if necessary. How do we know if memcg-based accounting is supported? There is a probe in libbpf to check this. But this probe currently relies on the availability of a given BPF helper, bpf_ktime_get_coarse_ns(), which landed in the same kernel version as the memory accounting change. This works in the generic case, but it may fail, for example, if the helper function has been backported to an older kernel. This has been observed for Google Cloud's Container-Optimized OS (COS), where the helper is available but rlimit is still in use. The probe succeeds, the rlimit is not raised, and probing features with bpftool, for example, fails. A patch was submitted [0] to update this probe in libbpf, based on what the cilium/ebpf Go library does [1]. It would lower the soft rlimit to 0, attempt to load a BPF object, and reset the rlimit. But it may induce some hard-to-debug flakiness if another process starts, or the current application is killed, while the rlimit is reduced, and the approach was discarded. As a workaround to ensure that the rlimit bump does not depend on the availability of a given helper, we restore the unconditional rlimit bump in bpftool for now. [0] https://lore.kernel.org/bpf/20220609143614.97837-1-quentin@isovalent.com/ [1] https://github.com/cilium/ebpf/blob/v0.9.0/rlimit/rlimit.go#L39 Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Cc: Yafang Shao Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20220610112648.29695-2-quentin@isovalent.com --- tools/bpf/bpftool/common.c | 8 ++++++++ tools/bpf/bpftool/feature.c | 2 ++ tools/bpf/bpftool/main.c | 6 +++--- tools/bpf/bpftool/main.h | 2 ++ tools/bpf/bpftool/map.c | 2 ++ tools/bpf/bpftool/pids.c | 1 + tools/bpf/bpftool/prog.c | 3 +++ tools/bpf/bpftool/struct_ops.c | 2 ++ 8 files changed, 23 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index a45b42ee8ab0..a0d4acd7c54a 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -72,6 +73,13 @@ static bool is_bpffs(char *path) return (unsigned long)st_fs.f_type == BPF_FS_MAGIC; } +void set_max_rlimit(void) +{ + struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; + + setrlimit(RLIMIT_MEMLOCK, &rinf); +} + static int mnt_fs(const char *target, const char *type, char *buff, size_t bufflen) { diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index cc9e4df8c58e..bac4ef428a02 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1167,6 +1167,8 @@ static int do_probe(int argc, char **argv) __u32 ifindex = 0; char *ifname; + set_max_rlimit(); + while (argc) { if (is_prefix(*argv, "kernel")) { if (target != COMPONENT_UNSPEC) { diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 9062ef2b8767..e81227761f5d 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -507,9 +507,9 @@ int main(int argc, char **argv) * It will still be rejected if users use LIBBPF_STRICT_ALL * mode for loading generated skeleton. 
*/ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL & ~LIBBPF_STRICT_MAP_DEFINITIONS); - } else { - libbpf_set_strict_mode(LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK); + ret = libbpf_set_strict_mode(LIBBPF_STRICT_ALL & ~LIBBPF_STRICT_MAP_DEFINITIONS); + if (ret) + p_err("failed to enable libbpf strict mode: %d", ret); } argc -= optind; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 6c311f47147e..589cb76b227a 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -96,6 +96,8 @@ int detect_common_prefix(const char *arg, ...); void fprint_hex(FILE *f, void *arg, unsigned int n, const char *sep); void usage(void) __noreturn; +void set_max_rlimit(void); + int mount_tracefs(const char *target); struct obj_ref { diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 800834be1bcb..38b6bc9c26c3 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1326,6 +1326,8 @@ static int do_create(int argc, char **argv) goto exit; } + set_max_rlimit(); + fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, &attr); if (fd < 0) { p_err("map create failed: %s", strerror(errno)); diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index e2d00d3cd868..bb6c969a114a 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -108,6 +108,7 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) p_err("failed to create hashmap for PID references"); return -1; } + set_max_rlimit(); skel = pid_iter_bpf__open(); if (!skel) { diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index e71f0b2da50b..f081de398b60 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1590,6 +1590,8 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) } } + set_max_rlimit(); + if (verifier_logs) /* log_level1 + log_level2 + stats, but not stable UAPI */ open_opts.kernel_log_level = 1 + 2 + 4; @@ -2287,6 +2289,7 @@ static int do_profile(int argc, char **argv) } } + set_max_rlimit(); err = profiler_bpf__load(profile_obj); if (err) { p_err("failed to load profile_obj"); diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index 2535f079ed67..e08a6ff2866c 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -501,6 +501,8 @@ static int do_register(int argc, char **argv) if (libbpf_get_error(obj)) return -1; + set_max_rlimit(); + if (bpf_object__load(obj)) { bpf_object__close(obj); return -1; -- cgit v1.2.3 From 93270357daa949e4bed375b40d0a100ce04f3399 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 10 Jun 2022 12:26:48 +0100 Subject: bpftool: Do not check return value from libbpf_set_strict_mode() The function always returns 0, so we don't need to check whether the return value is 0 or not. This change was first introduced in commit a777e18f1bcd ("bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK"), but later reverted to restore the unconditional rlimit bump in bpftool. Let's re-add it. 
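Pieced together from the two diffs, the resulting initialization pattern looks roughly like this (a sketch only; the legacy_libbpf flag and the function packaging are invented, while the two calls are the real ones from the patches):

  /* Bump RLIMIT_MEMLOCK unconditionally before loading any BPF
   * object, and ignore libbpf_set_strict_mode()'s return value,
   * which is always zero. */
  #include <stdbool.h>
  #include <sys/resource.h>
  #include <bpf/libbpf.h>
  #include <bpf/libbpf_legacy.h>

  static void set_max_rlimit(void)
  {
          struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };

          setrlimit(RLIMIT_MEMLOCK, &rinf);
  }

  static void init_bpf_env(bool legacy_libbpf)
  {
          if (!legacy_libbpf)
                  libbpf_set_strict_mode(LIBBPF_STRICT_ALL &
                                         ~LIBBPF_STRICT_MAP_DEFINITIONS);
          set_max_rlimit();       /* must precede BPF object loads */
  }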
Co-developed-by: Yafang Shao Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220610112648.29695-3-quentin@isovalent.com --- tools/bpf/bpftool/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index e81227761f5d..451cefc2d0da 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -507,9 +507,7 @@ int main(int argc, char **argv) * It will still be rejected if users use LIBBPF_STRICT_ALL * mode for loading generated skeleton. */ - ret = libbpf_set_strict_mode(LIBBPF_STRICT_ALL & ~LIBBPF_STRICT_MAP_DEFINITIONS); - if (ret) - p_err("failed to enable libbpf strict mode: %d", ret); + libbpf_set_strict_mode(LIBBPF_STRICT_ALL & ~LIBBPF_STRICT_MAP_DEFINITIONS); } argc -= optind; -- cgit v1.2.3 From 96752e1ec0e0d763ccb05dfd0b6efe43a1a74f1f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 13 Jun 2022 16:34:49 -0700 Subject: selftests/bpf: Fix test_varlen verification failure with latest llvm With latest llvm15, test_varlen failed with the following verifier log: 17: (85) call bpf_probe_read_kernel_str#115 ; R0_w=scalar(smin=-4095,smax=256) 18: (bf) r1 = r0 ; R0_w=scalar(id=1,smin=-4095,smax=256) R1_w=scalar(id=1,smin=-4095,smax=256) 19: (67) r1 <<= 32 ; R1_w=scalar(smax=1099511627776,umax=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min=0,s32_max=0,u32_max=) 20: (bf) r2 = r1 ; R1_w=scalar(id=2,smax=1099511627776,umax=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min=0,s32_max=0,u32) 21: (c7) r2 s>>= 32 ; R2=scalar(smin=-2147483648,smax=256) ; if (len >= 0) { 22: (c5) if r2 s< 0x0 goto pc+7 ; R2=scalar(umax=256,var_off=(0x0; 0x1ff)) ; payload4_len1 = len; 23: (18) r2 = 0xffffc90000167418 ; R2_w=map_value(off=1048,ks=4,vs=1572,imm=0) 25: (63) *(u32 *)(r2 +0) = r0 ; R0=scalar(id=1,smin=-4095,smax=256) R2_w=map_value(off=1048,ks=4,vs=1572,imm=0) 26: (77) r1 >>= 32 ; R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) ; payload += len; 27: (18) r6 = 0xffffc90000167424 ; R6_w=map_value(off=1060,ks=4,vs=1572,imm=0) 29: (0f) r6 += r1 ; R1_w=Pscalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R6_w=map_value(off=1060,ks=4,vs=1572,umax=4294967295,var_off=(0) ; len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in2[0]); 30: (bf) r1 = r6 ; R1_w=map_value(off=1060,ks=4,vs=1572,umax=4294967295,var_off=(0x0; 0xffffffff)) R6_w=map_value(off=1060,ks=4,vs=1572,um) 31: (b7) r2 = 256 ; R2_w=256 32: (18) r3 = 0xffffc90000164100 ; R3_w=map_value(off=256,ks=4,vs=1056,imm=0) 34: (85) call bpf_probe_read_kernel_str#115 R1 unbounded memory access, make sure to bounds check any such access processed 27 insns (limit 1000000) max_states_per_insn 0 total_states 2 peak_states 2 mark_read 1 -- END PROG LOAD LOG -- libbpf: failed to load program 'handler32_signed' The failure is due to 20: (bf) r2 = r1 ; R1_w=scalar(id=2,smax=1099511627776,umax=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min=0,s32_max=0,u32) 21: (c7) r2 s>>= 32 ; R2=scalar(smin=-2147483648,smax=256) 22: (c5) if r2 s< 0x0 goto pc+7 ; R2=scalar(umax=256,var_off=(0x0; 0x1ff)) 26: (77) r1 >>= 32 ; R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) 29: (0f) r6 += r1 ; R1_w=Pscalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R6_w=map_value(off=1060,ks=4,vs=1572,umax=4294967295,var_off=(0) where r1 has conservative value range compared to r2 and r1 is used later. In llvm, commit [1] triggered the above code generation and caused verification failure. 
It may take a while for llvm to address this issue. In the meantime, let us change the variable 'len' type to 'long' and adjust the condition properly. Tested with llvm14 and latest llvm15, both worked fine. [1] https://reviews.llvm.org/D126647 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220613233449.2860753-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_varlen.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_varlen.c b/tools/testing/selftests/bpf/progs/test_varlen.c index 913acdffd90f..3987ff174f1f 100644 --- a/tools/testing/selftests/bpf/progs/test_varlen.c +++ b/tools/testing/selftests/bpf/progs/test_varlen.c @@ -41,20 +41,20 @@ int handler64_unsigned(void *regs) { int pid = bpf_get_current_pid_tgid() >> 32; void *payload = payload1; - u64 len; + long len; /* ignore irrelevant invocations */ if (test_pid != pid || !capture) return 0; len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in1[0]); - if (len <= MAX_LEN) { + if (len >= 0) { payload += len; payload1_len1 = len; } len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in2[0]); - if (len <= MAX_LEN) { + if (len >= 0) { payload += len; payload1_len2 = len; } @@ -123,7 +123,7 @@ int handler32_signed(void *regs) { int pid = bpf_get_current_pid_tgid() >> 32; void *payload = payload4; - int len; + long len; /* ignore irrelevant invocations */ if (test_pid != pid || !capture) -- cgit v1.2.3 From 3831cd1f9ff627734096f22d8e37f72a5cabf92e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 13 Jun 2022 22:55:26 -0700 Subject: selftests/bpf: Avoid skipping certain subtests Commit 704c91e59fe0 ('selftests/bpf: Test "bpftool gen min_core_btf"') added a test test_core_btfgen to test core relocation with btf generated with 'bpftool gen min_core_btf'. Currently, among 76 subtests, 25 are skipped. ... #46/69 core_reloc_btfgen/enumval:OK #46/70 core_reloc_btfgen/enumval___diff:OK #46/71 core_reloc_btfgen/enumval___val3_missing:OK #46/72 core_reloc_btfgen/enumval___err_missing:SKIP #46/73 core_reloc_btfgen/enum64val:OK #46/74 core_reloc_btfgen/enum64val___diff:OK #46/75 core_reloc_btfgen/enum64val___val3_missing:OK #46/76 core_reloc_btfgen/enum64val___err_missing:SKIP ... #46 core_reloc_btfgen:SKIP Summary: 1/51 PASSED, 25 SKIPPED, 0 FAILED Alexei found that, in the above, core_reloc_btfgen/enum64val___err_missing should not be skipped. Currently, the core_reloc tests have some negative tests. In Commit 704c91e59fe0, for core_reloc_btfgen, all negative tests are skipped with the following condition if (!test_case->btf_src_file || test_case->fails) { test__skip(); continue; } This is too conservative. Negative tests that do not fail mkstemp() and run_btfgen() should not be skipped. A few negative tests do fail run_btfgen(), so this patch adds 'run_btfgen_fails' to mark these tests so that they can be skipped for btfgen tests. With this, we have ... #46/69 core_reloc_btfgen/enumval:OK #46/70 core_reloc_btfgen/enumval___diff:OK #46/71 core_reloc_btfgen/enumval___val3_missing:OK #46/72 core_reloc_btfgen/enumval___err_missing:OK #46/73 core_reloc_btfgen/enum64val:OK #46/74 core_reloc_btfgen/enum64val___diff:OK #46/75 core_reloc_btfgen/enum64val___val3_missing:OK #46/76 core_reloc_btfgen/enum64val___err_missing:OK ... Summary: 1/62 PASSED, 14 SKIPPED, 0 FAILED In total, 14 subtests are now skipped instead of 25.
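A runnable toy model of the new skip decision (the struct mirrors the fields touched by the patch; the sample test case content is invented):

  /* skip_for_btfgen() reproduces the fixed condition: only tests
   * whose minimized-BTF generation itself fails are skipped, not
   * every negative test. */
  #include <stdbool.h>
  #include <stdio.h>

  struct core_reloc_test_case {
          const char *name;
          const char *btf_src_file;
          bool fails;             /* negative test: load/run must fail */
          bool run_btfgen_fails;  /* btfgen cannot handle this input */
  };

  static bool skip_for_btfgen(const struct core_reloc_test_case *tc)
  {
          return !tc->btf_src_file || tc->run_btfgen_fails;
  }

  int main(void)
  {
          struct core_reloc_test_case tc = {
                  .name = "enum64val___err_missing",
                  .btf_src_file = "some_btf_src.o",      /* placeholder */
                  .fails = true,  /* negative, but btfgen copes fine */
          };

          printf("%s: %s\n", tc.name,
                 skip_for_btfgen(&tc) ? "SKIP" : "run");
          return 0;
  }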
Reported-by: Alexei Starovoitov Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220614055526.628299-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 47c1ef117275..2f92feb809be 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -84,6 +84,7 @@ static int duration = 0; #define NESTING_ERR_CASE(name) { \ NESTING_CASE_COMMON(name), \ .fails = true, \ + .run_btfgen_fails = true, \ } #define ARRAYS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \ @@ -258,12 +259,14 @@ static int duration = 0; BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \ "probed:", name), \ .fails = true, \ + .run_btfgen_fails = true, \ .raw_tp_name = "sys_enter", \ .prog_name = "test_core_bitfields", \ }, { \ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ "direct:", name), \ .fails = true, \ + .run_btfgen_fails = true, \ .prog_name = "test_core_bitfields_direct", \ } @@ -304,6 +307,7 @@ static int duration = 0; #define SIZE_ERR_CASE(name) { \ SIZE_CASE_COMMON(name), \ .fails = true, \ + .run_btfgen_fails = true, \ } #define TYPE_BASED_CASE_COMMON(name) \ @@ -396,6 +400,7 @@ struct core_reloc_test_case { const char *output; int output_len; bool fails; + bool run_btfgen_fails; bool needs_testmod; bool relaxed_core_relocs; const char *prog_name; @@ -952,7 +957,7 @@ static void run_core_reloc_tests(bool use_btfgen) /* generate a "minimal" BTF file and use it as source */ if (use_btfgen) { - if (!test_case->btf_src_file || test_case->fails) { + if (!test_case->btf_src_file || test_case->run_btfgen_fails) { test__skip(); continue; } -- cgit v1.2.3 From 1cb67e25f9a844425f85e592c7ffb8428800a796 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 14 Jun 2022 16:41:19 -0600 Subject: KVM: selftests: Remove the mismatched parameter comments There are some parameter being removed in function but the parameter comments still exist, so remove them. Signed-off-by: Shaoqin Huang Message-Id: <20220614224126.211054-1-shaoqin.huang@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bceb668f2627..f8c104dba258 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1163,8 +1163,6 @@ va_found: * vm - Virtual Machine * sz - Size in bytes * vaddr_min - Minimum starting virtual address - * data_memslot - Memory region slot for data pages - * pgd_memslot - Memory region slot for new virtual translation tables * * Output Args: None * @@ -1250,7 +1248,6 @@ vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm) * vaddr - Virtuall address to map * paddr - VM Physical Address * npages - The number of pages to map - * pgd_memslot - Memory region slot for new virtual translation tables * * Output Args: None * -- cgit v1.2.3 From 37f80a7c9987ff5f0a1e023dbbda2ad6b47431f7 Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Tue, 14 Jun 2022 18:26:35 +0200 Subject: KVM: s390: selftests: Fix memop extension capability check Fix the inverted logic of the memop extension capability check. 
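The inversion is easiest to see with concrete numbers; a standalone illustration (values invented; extension_cap = 0 models a host without the storage-key extension):

  /* The old check compared in the wrong direction, so a test that
   * requires extension level 1 would run on a host reporting 0. */
  #include <stdio.h>

  struct testdef {
          const char *name;
          int extension;  /* required extension level */
  };

  int main(void)
  {
          struct testdef t = { "copy with storage keys", 1 };
          int extension_cap = 0;  /* host capability */

          printf("old check: %s\n",       /* 1 >= 0: runs, wrongly */
                 t.extension >= extension_cap ? "run" : "skip");
          printf("new check: %s\n",       /* 0 >= 1: skips, correctly */
                 extension_cap >= t.extension ? "run" : "skip");
          return 0;
  }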
Fixes: 97da92c0ff92 ("KVM: s390: selftests: Use TAP interface in the memop test") Signed-off-by: Janis Schoetterl-Glausch Message-Id: <20220614162635.3445019-1-scgl@linux.ibm.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/s390x/memop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index a8f9b74b9144..9113696d5178 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -768,7 +768,7 @@ int main(int argc, char *argv[]) extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { - if (testlist[idx].extension >= extension_cap) { + if (extension_cap >= testlist[idx].extension) { testlist[idx].test(); ksft_test_result_pass("%s\n", testlist[idx].name); } else { -- cgit v1.2.3 From 6342140db6609a0c7d34f68c52b2947468e0e630 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 12 Jun 2022 23:07:23 -0700 Subject: selftests/timens: add a test for vfork+exit * check that a child process is in parent's time namespace after vfork. * check that a child process is in the target namespace after exec. Output on success: $ ./vfork_exec 1..1 ok 1 exec Signed-off-by: Andrei Vagin Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220613060723.197407-2-avagin@gmail.com --- tools/testing/selftests/timens/Makefile | 2 +- tools/testing/selftests/timens/vfork_exec.c | 90 +++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/timens/vfork_exec.c (limited to 'tools') diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 3a5936cc10ab..f0d51d4d2c87 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex +TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex vfork_exec TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread diff --git a/tools/testing/selftests/timens/vfork_exec.c b/tools/testing/selftests/timens/vfork_exec.c new file mode 100644 index 000000000000..e6ccd900f30a --- /dev/null +++ b/tools/testing/selftests/timens/vfork_exec.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "timens.h" + +#define OFFSET (36000) + +int main(int argc, char *argv[]) +{ + struct timespec now, tst; + int status, i; + pid_t pid; + + if (argc > 1) { + if (sscanf(argv[1], "%ld", &now.tv_sec) != 1) + return pr_perror("sscanf"); + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec); + } + return 0; + } + + nscheck(); + + ksft_set_plan(1); + + clock_gettime(CLOCK_MONOTONIC, &now); + + if (unshare_timens()) + return 1; + + if (_settime(CLOCK_MONOTONIC, OFFSET)) + return 1; + + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec, tst.tv_sec); + } + + pid = vfork(); + if (pid < 0) + return pr_perror("fork"); + + if (pid == 0) { + char now_str[64]; + char *cargv[] = {"exec", now_str, NULL}; + char *cenv[] = {NULL}; + + // Check that we 
are still in the source timens. + for (i = 0; i < 2; i++) { + _gettime(CLOCK_MONOTONIC, &tst, i); + if (abs(tst.tv_sec - now.tv_sec) > 5) + return pr_fail("%ld %ld\n", + now.tv_sec, tst.tv_sec); + } + + /* Check for proper vvar offsets after execve. */ + snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET); + execve("/proc/self/exe", cargv, cenv); + return pr_perror("execve"); + } + + if (waitpid(pid, &status, 0) != pid) + return pr_perror("waitpid"); + + if (status) + ksft_exit_fail(); + + ksft_test_result_pass("exec\n"); + ksft_exit_pass(); + return 0; +} -- cgit v1.2.3 From 3e6fe5ce4d4860c3a111c246fddc6f31492f4fb0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 15 Jun 2022 22:55:43 -0700 Subject: libbpf: Fix internal USDT address translation logic for shared libraries Perform the same virtual address to file offset translation that libbpf is doing for executable ELF binaries also for shared libraries. Currently libbpf is making a simplifying and sometimes wrong assumption that for shared libraries relative virtual addresses inside ELF are always equal to file offsets. Unfortunately, this is not always the case with LLVM's lld linker, which now by default generates quite more complicated ELF segments layout. E.g., for liburandom_read.so from selftests/bpf, here's an excerpt from readelf output listing ELF segments (a.k.a. program headers): Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align PHDR 0x000040 0x0000000000000040 0x0000000000000040 0x0001f8 0x0001f8 R 0x8 LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0005e4 0x0005e4 R 0x1000 LOAD 0x0005f0 0x00000000000015f0 0x00000000000015f0 0x000160 0x000160 R E 0x1000 LOAD 0x000750 0x0000000000002750 0x0000000000002750 0x000210 0x000210 RW 0x1000 LOAD 0x000960 0x0000000000003960 0x0000000000003960 0x000028 0x000029 RW 0x1000 Compare that to what is generated by GNU ld (or LLVM lld's with extra -znoseparate-code argument which disables this cleverness in the name of file size reduction): Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x000550 0x000550 R 0x1000 LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000131 0x000131 R E 0x1000 LOAD 0x002000 0x0000000000002000 0x0000000000002000 0x0000ac 0x0000ac R 0x1000 LOAD 0x002dc0 0x0000000000003dc0 0x0000000000003dc0 0x000262 0x000268 RW 0x1000 You can see from the first example above that for executable (Flg == "R E") PT_LOAD segment (LOAD #2), Offset doesn't match VirtAddr columns. And it does in the second case (GNU ld output). This is important because all the addresses, including USDT specs, operate in a virtual address space, while kernel is expecting file offsets when performing uprobe attach. So such mismatches have to be properly taken care of and compensated by libbpf, which is what this patch is fixing. Also patch clarifies few function and variable names, as well as updates comments to reflect this important distinction (virtaddr vs file offset) and to ephasize that shared libraries are not all that different from executables in this regard. This patch also changes selftests/bpf Makefile to force urand_read and liburand_read.so to be built with Clang and LLVM's lld (and explicitly request this ELF file size optimization through -znoseparate-code linker parameter) to validate libbpf logic and ensure regressions don't happen in the future. I've bundled these selftests changes together with libbpf changes to keep the above description tied with both libbpf and selftests changes. 
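The core translation can be sketched with plain libelf (a hedged sketch, not libbpf's actual code, which caches segments in its own structures; error handling is minimal and the caller is assumed to hold an Elf handle from elf_begin()):

  /* Translate an ELF virtual address to a file offset by locating
   * the PT_LOAD program header covering it, then rebasing onto that
   * segment's file offset. This now applies to both ET_EXEC and
   * ET_DYN binaries. */
  #include <gelf.h>

  static long virtaddr_to_file_off(Elf *elf, unsigned long virtaddr)
  {
          GElf_Phdr phdr;
          size_t n, i;

          if (elf_getphdrnum(elf, &n))
                  return -1;

          for (i = 0; i < n; i++) {
                  if (!gelf_getphdr(elf, i, &phdr) ||
                      phdr.p_type != PT_LOAD)
                          continue;
                  if (virtaddr >= phdr.p_vaddr &&
                      virtaddr < phdr.p_vaddr + phdr.p_memsz)
                          return virtaddr - phdr.p_vaddr + phdr.p_offset;
          }
          return -1;      /* not covered by any PT_LOAD segment */
  }

On the lld layout quoted above, an address in the executable segment (VirtAddr 0x15f0) maps back to file offset 0x5f0, which is the value the kernel's uprobe attach expects.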
Fixes: 74cc6311cec9 ("libbpf: Add USDT notes parsing and resolution logic") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220616055543.3285835-1-andrii@kernel.org --- tools/lib/bpf/usdt.c | 123 ++++++++++++++++++----------------- tools/testing/selftests/bpf/Makefile | 14 ++-- 2 files changed, 72 insertions(+), 65 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index f1c9339cfbbc..5159207cbfd9 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -441,7 +441,7 @@ static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, siz return 0; } -static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) +static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) { char path[PATH_MAX], line[PATH_MAX], mode[16]; size_t seg_start, seg_end, seg_off; @@ -531,35 +531,40 @@ err_out: return err; } -static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long addr, bool relative) +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long virtaddr) { struct elf_seg *seg; int i; - if (relative) { - /* for shared libraries, address is relative offset and thus - * should be fall within logical offset-based range of - * [offset_start, offset_end) - */ - for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { - if (seg->offset <= addr && addr < seg->offset + (seg->end - seg->start)) - return seg; - } - } else { - /* for binaries, address is absolute and thus should be within - * absolute address range of [seg_start, seg_end) - */ - for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { - if (seg->start <= addr && addr < seg->end) - return seg; - } + /* for ELF binaries (both executables and shared libraries), we are + * given virtual address (absolute for executables, relative for + * libraries) which should match address range of [seg_start, seg_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->start <= virtaddr && virtaddr < seg->end) + return seg; } + return NULL; +} +static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long offset) +{ + struct elf_seg *seg; + int i; + + /* for VMA segments from /proc//maps file, provided "address" is + * actually a file offset, so should be fall within logical + * offset-based range of [offset_start, offset_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->offset <= offset && offset < seg->offset + (seg->end - seg->start)) + return seg; + } return NULL; } -static int parse_usdt_note(Elf *elf, const char *path, long base_addr, - GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, struct usdt_note *usdt_note); static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); @@ -568,8 +573,8 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie, struct usdt_target **out_targets, size_t *out_target_cnt) { - size_t off, name_off, desc_off, seg_cnt = 0, lib_seg_cnt = 0, target_cnt = 0; - struct elf_seg *segs = NULL, *lib_segs = NULL; + size_t off, name_off, desc_off, seg_cnt = 0, vma_seg_cnt = 0, target_cnt = 0; + struct elf_seg *segs = NULL, *vma_segs = NULL; struct usdt_target *targets = NULL, *target; long 
base_addr = 0; Elf_Scn *notes_scn, *base_scn; @@ -613,8 +618,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * struct elf_seg *seg = NULL; void *tmp; - err = parse_usdt_note(elf, path, base_addr, &nhdr, - data->d_buf, name_off, desc_off, ¬e); + err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, ¬e); if (err) goto err_out; @@ -654,30 +658,29 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * usdt_rel_ip += base_addr - note.base_addr; } - if (ehdr.e_type == ET_EXEC) { - /* When attaching uprobes (which what USDTs basically - * are) kernel expects a relative IP to be specified, - * so if we are attaching to an executable ELF binary - * (i.e., not a shared library), we need to calculate - * proper relative IP based on ELF's load address - */ - seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip, false /* relative */); - if (!seg) { - err = -ESRCH; - pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", - usdt_provider, usdt_name, path, usdt_abs_ip); - goto err_out; - } - if (!seg->is_exec) { - err = -ESRCH; - pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", - path, seg->start, seg->end, usdt_provider, usdt_name, - usdt_abs_ip); - goto err_out; - } + /* When attaching uprobes (which is what USDTs basically are) + * kernel expects file offset to be specified, not a relative + * virtual address, so we need to translate virtual address to + * file offset, for both ET_EXEC and ET_DYN binaries. + */ + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_abs_ip); + goto err_out; + } + if (!seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + usdt_abs_ip); + goto err_out; + } + /* translate from virtual address to file offset */ + usdt_rel_ip = usdt_abs_ip - seg->start + seg->offset; - usdt_rel_ip = usdt_abs_ip - (seg->start - seg->offset); - } else if (!man->has_bpf_cookie) { /* ehdr.e_type == ET_DYN */ + if (ehdr.e_type == ET_DYN && !man->has_bpf_cookie) { /* If we don't have BPF cookie support but need to * attach to a shared library, we'll need to know and * record absolute addresses of attach points due to @@ -697,9 +700,9 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - /* lib_segs are lazily initialized only if necessary */ - if (lib_seg_cnt == 0) { - err = parse_lib_segs(pid, path, &lib_segs, &lib_seg_cnt); + /* vma_segs are lazily initialized only if necessary */ + if (vma_seg_cnt == 0) { + err = parse_vma_segs(pid, path, &vma_segs, &vma_seg_cnt); if (err) { pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", pid, path, err); @@ -707,7 +710,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * } } - seg = find_elf_seg(lib_segs, lib_seg_cnt, usdt_rel_ip, true /* relative */); + seg = find_vma_seg(vma_segs, vma_seg_cnt, usdt_rel_ip); if (!seg) { err = -ESRCH; pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", @@ -715,7 +718,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - usdt_abs_ip = seg->start + 
(usdt_rel_ip - seg->offset); + usdt_abs_ip = seg->start - seg->offset + usdt_rel_ip; } pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", @@ -723,7 +726,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); - /* Adjust semaphore address to be a relative offset */ + /* Adjust semaphore address to be a file offset */ if (note.sema_addr) { if (!man->has_sema_refcnt) { pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", @@ -732,7 +735,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - seg = find_elf_seg(segs, seg_cnt, note.sema_addr, false /* relative */); + seg = find_elf_seg(segs, seg_cnt, note.sema_addr); if (!seg) { err = -ESRCH; pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", @@ -747,7 +750,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - usdt_sema_off = note.sema_addr - (seg->start - seg->offset); + usdt_sema_off = note.sema_addr - seg->start + seg->offset; pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", @@ -770,7 +773,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * target->rel_ip = usdt_rel_ip; target->sema_off = usdt_sema_off; - /* notes->args references strings from Elf itself, so they can + /* notes.args references strings from Elf itself, so they can * be referenced safely until elf_end() call */ target->spec_str = note.args; @@ -788,7 +791,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * err_out: free(segs); - free(lib_segs); + free(vma_segs); if (err < 0) free(targets); return err; @@ -1089,8 +1092,8 @@ err_out: /* Parse out USDT ELF note from '.note.stapsdt' section. * Logic inspired by perf's code. */ -static int parse_usdt_note(Elf *elf, const char *path, long base_addr, - GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, struct usdt_note *note) { const char *provider, *name, *args; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8ad7a733a505..e08e8e34e793 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -172,13 +172,15 @@ $(OUTPUT)/%:%.c # do not fail. Static builds leave urandom_read relying on system-wide shared libraries. $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c $(call msg,LIB,,$@) - $(Q)$(CC) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $^ $(LDLIBS) -fPIC -shared -o $@ + $(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $^ $(LDLIBS) \ + -fuse-ld=lld -Wl,-znoseparate-code -fPIC -shared -o $@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so $(call msg,BINARY,,$@) - $(Q)$(CC) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \ - liburandom_read.so $(LDLIBS) \ - -Wl,-rpath=. 
-Wl,--build-id=sha1 -o $@ + $(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \ + liburandom_read.so $(LDLIBS) \ + -fuse-ld=lld -Wl,-znoseparate-code \ + -Wl,-rpath=. -Wl,--build-id=sha1 -o $@ $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) $(call msg,MOD,,$@) @@ -580,6 +582,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature bpftool \ - $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h *.subskel.h no_alu32 bpf_gcc bpf_testmod.ko) + $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h *.subskel.h \ + no_alu32 bpf_gcc bpf_testmod.ko \ + liburandom_read.so) .PHONY: docs docs-clean -- cgit v1.2.3 From c4cac71fc8a55cebd9abf30a3b287063be34b512 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Tue, 14 Jun 2022 23:10:47 +0000 Subject: libbpf: add support for sleepable uprobe programs Add section mappings for u(ret)probe.s programs. Acked-by: Andrii Nakryiko Signed-off-by: Delyan Kratunov Link: https://lore.kernel.org/r/aedbc3b74f3523f00010a7b0df8f3388cca59f16.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d989b0a17a89..49e359cd34df 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9177,8 +9177,10 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), @@ -11571,7 +11573,8 @@ static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf break; case 3: case 4: - opts.retprobe = strcmp(probe_type, "uretprobe") == 0; + opts.retprobe = strcmp(probe_type, "uretprobe") == 0 || + strcmp(probe_type, "uretprobe.s") == 0; if (opts.retprobe && offset != 0) { pr_warn("prog '%s': uretprobes do not support offset specification\n", prog->name); -- cgit v1.2.3 From cb3f4a4a462b46eb5487ad806d58e34824c49044 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Tue, 14 Jun 2022 23:10:44 +0000 Subject: selftests/bpf: add tests for sleepable (uk)probes Add tests that ensure sleepable uprobe programs work correctly. Add tests that ensure sleepable kprobe programs cannot attach. Also add tests that attach both sleepable and non-sleepable uprobe programs to the same location (i.e. same bpf_prog_array). 
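Distilled from the new tests, a minimal sleepable uprobe program looks
like this (a sketch: the trigger_func attach target and the user_ptr
variable are illustrative, not fixed API):

    #include <linux/ptrace.h>
    #include <bpf/bpf_helpers.h>

    void *user_ptr;

    /* The ".s" suffix in the section name marks the program sleepable,
     * which is what makes a potentially faulting (and thus sleeping)
     * helper like bpf_copy_from_user() legal here.
     */
    SEC("uprobe.s//proc/self/exe:trigger_func")
    int handle_sleepable_uprobe(struct pt_regs *ctx)
    {
        char buf[8];

        bpf_copy_from_user(buf, sizeof(buf), user_ptr);
        return 0;
    }

    char _license[] SEC("license") = "GPL";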
Acked-by: Andrii Nakryiko Signed-off-by: Delyan Kratunov Link: https://lore.kernel.org/r/c744e5bb7a5c0703f05444dc41f2522ba3579a48.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/attach_probe.c | 49 +++++++++++++++++- .../selftests/bpf/progs/test_attach_probe.c | 60 ++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 08c0601b3e84..0b899d2d8ea7 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -17,6 +17,14 @@ static void trigger_func2(void) asm volatile (""); } +/* attach point for byname sleepable uprobe */ +static void trigger_func3(void) +{ + asm volatile (""); +} + +static char test_data[] = "test_data"; + void test_attach_probe(void) { DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); @@ -49,9 +57,17 @@ void test_attach_probe(void) if (!ASSERT_GE(ref_ctr_offset, 0, "ref_ctr_offset")) return; - skel = test_attach_probe__open_and_load(); + skel = test_attach_probe__open(); if (!ASSERT_OK_PTR(skel, "skel_open")) return; + + /* sleepable kprobe test case needs flags set before loading */ + if (!ASSERT_OK(bpf_program__set_flags(skel->progs.handle_kprobe_sleepable, + BPF_F_SLEEPABLE), "kprobe_sleepable_flags")) + goto cleanup; + + if (!ASSERT_OK(test_attach_probe__load(skel), "skel_load")) + goto cleanup; if (!ASSERT_OK_PTR(skel->bss, "check_bss")) goto cleanup; @@ -151,6 +167,30 @@ void test_attach_probe(void) if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname2, "attach_uretprobe_byname2")) goto cleanup; + /* sleepable kprobes should not attach successfully */ + skel->links.handle_kprobe_sleepable = bpf_program__attach(skel->progs.handle_kprobe_sleepable); + if (!ASSERT_ERR_PTR(skel->links.handle_kprobe_sleepable, "attach_kprobe_sleepable")) + goto cleanup; + + /* test sleepable uprobe and uretprobe variants */ + skel->links.handle_uprobe_byname3_sleepable = bpf_program__attach(skel->progs.handle_uprobe_byname3_sleepable); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname3_sleepable, "attach_uprobe_byname3_sleepable")) + goto cleanup; + + skel->links.handle_uprobe_byname3 = bpf_program__attach(skel->progs.handle_uprobe_byname3); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname3, "attach_uprobe_byname3")) + goto cleanup; + + skel->links.handle_uretprobe_byname3_sleepable = bpf_program__attach(skel->progs.handle_uretprobe_byname3_sleepable); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname3_sleepable, "attach_uretprobe_byname3_sleepable")) + goto cleanup; + + skel->links.handle_uretprobe_byname3 = bpf_program__attach(skel->progs.handle_uretprobe_byname3); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname3, "attach_uretprobe_byname3")) + goto cleanup; + + skel->bss->user_ptr = test_data; + /* trigger & validate kprobe && kretprobe */ usleep(1); @@ -164,6 +204,9 @@ void test_attach_probe(void) /* trigger & validate uprobe attached by name */ trigger_func2(); + /* trigger & validate sleepable uprobe attached by name */ + trigger_func3(); + ASSERT_EQ(skel->bss->kprobe_res, 1, "check_kprobe_res"); ASSERT_EQ(skel->bss->kprobe2_res, 11, "check_kprobe_auto_res"); ASSERT_EQ(skel->bss->kretprobe_res, 2, "check_kretprobe_res"); @@ -174,6 +217,10 @@ void test_attach_probe(void) ASSERT_EQ(skel->bss->uretprobe_byname_res, 6, "check_uretprobe_byname_res"); 
ASSERT_EQ(skel->bss->uprobe_byname2_res, 7, "check_uprobe_byname2_res"); ASSERT_EQ(skel->bss->uretprobe_byname2_res, 8, "check_uretprobe_byname2_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_sleepable_res, 9, "check_uprobe_byname3_sleepable_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_res, 10, "check_uprobe_byname3_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 11, "check_uretprobe_byname3_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_res, 12, "check_uretprobe_byname3_res"); cleanup: test_attach_probe__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index ce9acf4db8d2..f1c88ad368ef 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "bpf_misc.h" int kprobe_res = 0; @@ -17,6 +18,11 @@ int uprobe_byname_res = 0; int uretprobe_byname_res = 0; int uprobe_byname2_res = 0; int uretprobe_byname2_res = 0; +int uprobe_byname3_sleepable_res = 0; +int uprobe_byname3_res = 0; +int uretprobe_byname3_sleepable_res = 0; +int uretprobe_byname3_res = 0; +void *user_ptr = 0; SEC("kprobe") int handle_kprobe(struct pt_regs *ctx) @@ -32,6 +38,17 @@ int BPF_KPROBE(handle_kprobe_auto) return 0; } +/** + * This program will be manually made sleepable on the userspace side + * and should thus be unattachable. + */ +SEC("kprobe/" SYS_PREFIX "sys_nanosleep") +int handle_kprobe_sleepable(struct pt_regs *ctx) +{ + kprobe_res = 2; + return 0; +} + SEC("kretprobe") int handle_kretprobe(struct pt_regs *ctx) { @@ -93,4 +110,47 @@ int handle_uretprobe_byname2(struct pt_regs *ctx) return 0; } +static __always_inline bool verify_sleepable_user_copy(void) +{ + char data[9]; + + bpf_copy_from_user(data, sizeof(data), user_ptr); + return bpf_strncmp(data, sizeof(data), "test_data") == 0; +} + +SEC("uprobe.s//proc/self/exe:trigger_func3") +int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) +{ + if (verify_sleepable_user_copy()) + uprobe_byname3_sleepable_res = 9; + return 0; +} + +/** + * same target as the uprobe.s above to force sleepable and non-sleepable + * programs in the same bpf_prog_array + */ +SEC("uprobe//proc/self/exe:trigger_func3") +int handle_uprobe_byname3(struct pt_regs *ctx) +{ + uprobe_byname3_res = 10; + return 0; +} + +SEC("uretprobe.s//proc/self/exe:trigger_func3") +int handle_uretprobe_byname3_sleepable(struct pt_regs *ctx) +{ + if (verify_sleepable_user_copy()) + uretprobe_byname3_sleepable_res = 11; + return 0; +} + +SEC("uretprobe//proc/self/exe:trigger_func3") +int handle_uretprobe_byname3(struct pt_regs *ctx) +{ + uretprobe_byname3_res = 12; + return 0; +} + + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 4f5ceb8851f0081af54313abbf56de1615911faf Mon Sep 17 00:00:00 2001 From: Yuanzheng Song Date: Sat, 28 May 2022 06:31:17 +0000 Subject: tools/vm/slabinfo: use alphabetic order when two values are equal When the number of partial slabs in each cache is the same (e.g., the value are 0), the results of the `slabinfo -X -N5` and `slabinfo -P -N5` are different. / # slabinfo -X -N5 ... 
Slabs sorted by number of partial slabs
---------------------------------------

Name               Objects Objsize    Space Slabs/Part/Cpu  O/S O %Fr %Ef Flg
inode_cache          15180     392  6217728      758/0/1    20 1   0  95 a
kernfs_node_cache    22494      88  2002944      488/0/1    46 0   0  98
shmem_inode_cache      663     464   319488       38/0/1    17 1   0  96
biovec-max              50    3072   163840        4/0/1    10 3   0  93 A
dentry               19050     136  2600960      633/0/2    30 0   0  99 a

/ # slabinfo -P -N5
Name               Objects Objsize    Space Slabs/Part/Cpu  O/S O %Fr %Ef Flg
bdev_cache              32     984    32.7K        1/0/1    16 2   0  96 Aa
ext4_inode_cache        42     752    32.7K        1/0/1    21 2   0  96 a
dentry               19050     136     2.6M      633/0/2    30 0   0  99 a
TCPv6                   17    1840    32.7K        0/0/1    17 3   0  95 A
RAWv6                   18     856    16.3K        0/0/1    18 2   0  94 A

This problem is caused by sort_slabs(). So let's use alphabetic order
when two values are equal in sort_slabs().

By the way, the output of `slabinfo -h` is not aligned because the
`-P|--partial Sort by number of partial slabs` line uses tabs instead
of spaces, so let's use spaces instead of tabs to fix that too.

Link: https://lkml.kernel.org/r/20220528063117.935158-1-songyuanzheng@huawei.com
Fixes: 1106b205a3fe ("tools/vm/slabinfo: add partial slab listing to -X")
Signed-off-by: Yuanzheng Song
Cc: "Tobin C. Harding"
Signed-off-by: Andrew Morton
---
 tools/vm/slabinfo.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 9b68658b6bb8..3ae985dc24b6 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -125,7 +125,7 @@ static void usage(void)
 	"-n|--numa              Show NUMA information\n"
 	"-N|--lines=K           Show the first K slabs\n"
 	"-o|--ops               Show kmem_cache_ops\n"
-	"-P|--partial		Sort by number of partial slabs\n"
+	"-P|--partial           Sort by number of partial slabs\n"
 	"-r|--report            Detailed report on single slabs\n"
 	"-s|--shrink            Shrink slabs\n"
 	"-S|--Size              Sort by size\n"
@@ -1045,15 +1045,27 @@ static void sort_slabs(void)
 		for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
 			int result;
 
-			if (sort_size)
-				result = slab_size(s1) < slab_size(s2);
-			else if (sort_active)
-				result = slab_activity(s1) < slab_activity(s2);
-			else if (sort_loss)
-				result = slab_waste(s1) < slab_waste(s2);
-			else if (sort_partial)
-				result = s1->partial < s2->partial;
-			else
+			if (sort_size) {
+				if (slab_size(s1) == slab_size(s2))
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = slab_size(s1) < slab_size(s2);
+			} else if (sort_active) {
+				if (slab_activity(s1) == slab_activity(s2))
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = slab_activity(s1) < slab_activity(s2);
+			} else if (sort_loss) {
+				if (slab_waste(s1) == slab_waste(s2))
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = slab_waste(s1) < slab_waste(s2);
+			} else if (sort_partial) {
+				if (s1->partial == s2->partial)
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = s1->partial < s2->partial;
+			} else
 				result = strcasecmp(s1->name, s2->name);
 
 			if (show_inverted)
-- cgit v1.2.3


From c200d90049dbe08fa8b016f74b713fddefca0479 Mon Sep 17 00:00:00 2001
From: Patrick Wang
Date: Sat, 11 Jun 2022 11:55:48 +0800
Subject: mm: kmemleak: remove kmemleak_not_leak_phys() and the min_count argument to kmemleak_alloc_phys()

Patch series "mm: kmemleak: store objects allocated with physical address
separately and check when scan", v4.

The kmemleak_*_phys() interface uses "min_low_pfn" and "max_low_pfn" to
check the address. But on some architectures, kmemleak_*_phys() is
called before those two variables are initialized.
The following steps will be taken: 1) Add OBJECT_PHYS flag and rbtree for the objects allocated with physical address 2) Store physical address in objects if allocated with OBJECT_PHYS 3) Check the boundary when scan instead of in kmemleak_*_phys() This patch set will solve: https://lore.kernel.org/r/20220527032504.30341-1-yee.lee@mediatek.com https://lore.kernel.org/r/9dd08bb5-f39e-53d8-f88d-bec598a08c93@gmail.com v3: https://lore.kernel.org/r/20220609124950.1694394-1-patrick.wang.shcn@gmail.com v2: https://lore.kernel.org/r/20220603035415.1243913-1-patrick.wang.shcn@gmail.com v1: https://lore.kernel.org/r/20220531150823.1004101-1-patrick.wang.shcn@gmail.com This patch (of 4): Remove the unused kmemleak_not_leak_phys() function. And remove the min_count argument to kmemleak_alloc_phys() function, assume it's 0. Link: https://lkml.kernel.org/r/20220611035551.1823303-1-patrick.wang.shcn@gmail.com Link: https://lkml.kernel.org/r/20220611035551.1823303-2-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Cc: Yee Lee Signed-off-by: Andrew Morton --- Documentation/dev-tools/kmemleak.rst | 1 - drivers/of/fdt.c | 2 +- include/linux/kmemleak.h | 8 ++------ mm/kmemleak.c | 20 +++----------------- mm/memblock.c | 14 +++++++------- tools/testing/memblock/linux/kmemleak.h | 2 +- 6 files changed, 14 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index 1c935f41cd3a..5483fd39ef29 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -174,7 +174,6 @@ mapping: - ``kmemleak_alloc_phys`` - ``kmemleak_free_part_phys`` -- ``kmemleak_not_leak_phys`` - ``kmemleak_ignore_phys`` Dealing with false positives/negatives diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index a8f5b6532165..2c677e84c3f5 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -529,7 +529,7 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); if (!nomap) - kmemleak_alloc_phys(base, size, 0, 0); + kmemleak_alloc_phys(base, size, 0); } else pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 34684b2026ab..6a3cd1bf4680 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -29,10 +29,9 @@ extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; -extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, +extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) __ref; extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref; -extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, @@ -107,15 +106,12 @@ static inline void kmemleak_no_scan(const void *ptr) { } static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, - int min_count, gfp_t gfp) + gfp_t gfp) { } static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) { } -static inline void kmemleak_not_leak_phys(phys_addr_t phys) -{ -} 
static inline void kmemleak_ignore_phys(phys_addr_t phys) { } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a182f5ddaf68..156eafafa182 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1125,15 +1125,13 @@ EXPORT_SYMBOL(kmemleak_no_scan); * address argument * @phys: physical address of the object * @size: size of the object - * @min_count: minimum number of references to this object. - * See kmemleak_alloc() * @gfp: kmalloc() flags used for kmemleak internal memory allocations */ -void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, - gfp_t gfp) +void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) { if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_alloc(__va(phys), size, min_count, gfp); + /* assume min_count 0 */ + kmemleak_alloc(__va(phys), size, 0, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); @@ -1151,18 +1149,6 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) } EXPORT_SYMBOL(kmemleak_free_part_phys); -/** - * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical - * address argument - * @phys: physical address of the object - */ -void __ref kmemleak_not_leak_phys(phys_addr_t phys) -{ - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_not_leak(__va(phys)); -} -EXPORT_SYMBOL(kmemleak_not_leak_phys); - /** * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical * address argument diff --git a/mm/memblock.c b/mm/memblock.c index e4f03a6e8e56..749abd2685c4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1345,8 +1345,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * from the regions with mirroring enabled and then retried from any * memory region. * - * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for - * allocated boot memory block, so that it is never reported as leaks. + * In addition, function using kmemleak_alloc_phys for allocated boot + * memory block, it is never reported as leaks. * * Return: * Physical address of allocated memory block on success, %0 on failure. @@ -1398,12 +1398,12 @@ done: */ if (end != MEMBLOCK_ALLOC_NOLEAKTRACE) /* - * The min_count is set to 0 so that memblock allocated - * blocks are never reported as leaks. This is because many - * of these blocks are only referred via the physical - * address which is not looked up by kmemleak. + * Memblock allocated blocks are never reported as + * leaks. This is because many of these blocks are + * only referred via the physical address which is + * not looked up by kmemleak. */ - kmemleak_alloc_phys(found, size, 0, 0); + kmemleak_alloc_phys(found, size, 0); return found; } diff --git a/tools/testing/memblock/linux/kmemleak.h b/tools/testing/memblock/linux/kmemleak.h index 462f8c5e8aa0..5fed13bb9ec4 100644 --- a/tools/testing/memblock/linux/kmemleak.h +++ b/tools/testing/memblock/linux/kmemleak.h @@ -7,7 +7,7 @@ static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) } static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, - int min_count, gfp_t gfp) + gfp_t gfp) { } -- cgit v1.2.3 From c5de43634c572b0cec0b32eecf24a17c649711c1 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Fri, 10 Jun 2022 15:12:44 +0800 Subject: userfaultfd/selftests: fix typo in comment Delete the redundant word 'in'. 
Link: https://lkml.kernel.org/r/20220610071244.59679-1-wangxiang@cdjrlc.com Signed-off-by: Xiang wangx Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 0bdfc1955229..4bc24581760d 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -860,7 +860,7 @@ static int stress(struct uffd_stats *uffd_stats) /* * Be strict and immediately zap area_src, the whole area has * been transferred already by the background treads. The - * area_src could then be faulted in in a racy way by still + * area_src could then be faulted in a racy way by still * running uffdio_threads reading zeropages after we zapped * area_src (but they're guaranteed to get -EEXIST from * UFFDIO_COPY without writing zero pages into area_dst -- cgit v1.2.3 From b623d434f00868c3ec76ec6e6bbd85e9e6c06457 Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Thu, 9 Jun 2022 16:32:17 -0400 Subject: selftests: make use of GUP_TEST_FILE macro Commit 17de1e559cf1 ("selftests: clarify common error when running gup_test") had most of its hunks dropped due to a conflict with another patch accepted into Linux around the same time that implemented the same behavior as a subset of other changes. However, the remaining hunk defines the GUP_TEST_FILE macro without making use of it. This patch makes use of the macro in the two relevant places. Furthermore, the above mentioned commit's log message erroneously describes the changes that were dropped from the patch. This patch corrects the record. Link: https://lkml.kernel.org/r/20220609203217.3206247-1-jsavitz@redhat.com Fixes: 17de1e559cf1 ("selftests: clarify common error when running gup_test") Signed-off-by: Joel Savitz Reviewed-by: Shuah Khan Acked-by: Nico Pache Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/gup_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 6bb36ca71cb5..a309876d832f 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -209,7 +209,7 @@ int main(int argc, char **argv) if (write) gup.gup_flags |= FOLL_WRITE; - gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + gup_fd = open(GUP_TEST_FILE, O_RDWR); if (gup_fd == -1) { switch (errno) { case EACCES: @@ -224,7 +224,7 @@ int main(int argc, char **argv) printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); break; default: - perror("failed to open /sys/kernel/debug/gup_test"); + perror("failed to open " GUP_TEST_FILE); break; } exit(KSFT_SKIP); -- cgit v1.2.3 From 0aed4724a8392f2567f83c9c4b9decf447d752a2 Mon Sep 17 00:00:00 2001 From: cxbing Date: Thu, 9 Jun 2022 07:44:59 -0700 Subject: delayacct: remove some unused variables Drop the unused variables *done* and *count*. 
Link: https://lkml.kernel.org/r/20220609144459.86379-1-zhangkkoo@126.com
Signed-off-by: cxbing
Signed-off-by: Andrew Morton
---
 tools/accounting/getdelays.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index e83e6e47a21e..938dec0dfaad 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -45,7 +45,6 @@
 		exit(code);			\
 	} while (0)
 
-int done;
 int rcvbufsz;
 char name[100];
 int dbg;
@@ -285,7 +284,6 @@ int main(int argc, char *argv[])
 	pid_t rtid = 0;
 
 	int fd = 0;
-	int count = 0;
 	int write_file = 0;
 	int maskset = 0;
 	char *logfile = NULL;
@@ -495,7 +493,6 @@ int main(int argc, char *argv[])
 				len2 = 0;
 				/* For nested attributes, na follows */
 				na = (struct nlattr *) NLA_DATA(na);
-				done = 0;
 				while (len2 < aggr_len) {
 					switch (na->nla_type) {
 					case TASKSTATS_TYPE_PID:
@@ -509,7 +506,6 @@ int main(int argc, char *argv[])
 						printf("TGID\t%d\n", rtid);
 						break;
 					case TASKSTATS_TYPE_STATS:
-						count++;
 						if (print_delays)
 							print_delayacct((struct taskstats *) NLA_DATA(na));
 						if (print_io_accounting)
-- cgit v1.2.3


From dd7c9be330d87732766a95cfd7a6de38bf7a39c3 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas
Date: Fri, 10 Jun 2022 09:57:21 +0200
Subject: selftests/filesystems: add a vfat RENAME_EXCHANGE test

Add a test for the renameat2 RENAME_EXCHANGE support in vfat, but split
it into a tool that just does the rename exchange and a script that is
run by the kselftests framework on `make TARGETS="filesystems/fat"
kselftest`. That way the script can be easily extended to test other
file operations.

The script creates a 1 MiB disk image that is then formatted with a
vfat filesystem and mounted using a loop device. That way all file
operations are done on an ephemeral filesystem.
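The heart of the new tool is a single renameat2() call with
RENAME_EXCHANGE, which atomically swaps the two paths (both must
already exist, and no intermediate state is ever visible). A minimal
sketch, assuming glibc 2.28+ for the renameat2() wrapper:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>

    /* Atomically exchange two paths; returns 0 on success. */
    static int exchange_paths(const char *a, const char *b)
    {
        if (renameat2(AT_FDCWD, a, AT_FDCWD, b, RENAME_EXCHANGE) != 0) {
            perror("renameat2");
            return -1;
        }
        return 0;
    }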
Link: https://lkml.kernel.org/r/20220610075721.1182745-5-javierm@redhat.com Signed-off-by: Javier Martinez Canillas Acked-by: Muhammad Usama Anjum Acked-by: OGAWA Hirofumi Cc: Alexander Larsson Cc: Christian Kellner Cc: Chung-Chiang Cheng Cc: Colin Walters Cc: Lennart Poettering Cc: Peter Jones Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/filesystems/fat/.gitignore | 2 + tools/testing/selftests/filesystems/fat/Makefile | 7 ++ tools/testing/selftests/filesystems/fat/config | 2 + .../selftests/filesystems/fat/rename_exchange.c | 37 ++++++++++ .../selftests/filesystems/fat/run_fat_tests.sh | 82 ++++++++++++++++++++++ 7 files changed, 132 insertions(+) create mode 100644 tools/testing/selftests/filesystems/fat/.gitignore create mode 100644 tools/testing/selftests/filesystems/fat/Makefile create mode 100644 tools/testing/selftests/filesystems/fat/config create mode 100644 tools/testing/selftests/filesystems/fat/rename_exchange.c create mode 100644 tools/testing/selftests/filesystems/fat/run_fat_tests.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 1fc9ead83d2a..addcc0cca321 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20891,6 +20891,7 @@ M: OGAWA Hirofumi S: Maintained F: Documentation/filesystems/vfat.rst F: fs/fat/ +F: tools/testing/selftests/filesystems/fat/ VFIO DRIVER M: Alex Williamson diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index de11992dc577..67668a9fa115 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,6 +17,7 @@ TARGETS += exec TARGETS += filesystems TARGETS += filesystems/binderfs TARGETS += filesystems/epoll +TARGETS += filesystems/fat TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/fat/.gitignore b/tools/testing/selftests/filesystems/fat/.gitignore new file mode 100644 index 000000000000..b89920ed841c --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +rename_exchange diff --git a/tools/testing/selftests/filesystems/fat/Makefile b/tools/testing/selftests/filesystems/fat/Makefile new file mode 100644 index 000000000000..902033f6ef09 --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := run_fat_tests.sh +TEST_GEN_PROGS_EXTENDED := rename_exchange +CFLAGS += -O2 -g -Wall $(KHDR_INCLUDES) + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/fat/config b/tools/testing/selftests/filesystems/fat/config new file mode 100644 index 000000000000..6cf95e787a17 --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/config @@ -0,0 +1,2 @@ +CONFIG_BLK_DEV_LOOP=y +CONFIG_VFAT_FS=y diff --git a/tools/testing/selftests/filesystems/fat/rename_exchange.c b/tools/testing/selftests/filesystems/fat/rename_exchange.c new file mode 100644 index 000000000000..e488ad354fce --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/rename_exchange.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Program that atomically exchanges two paths using + * the renameat2() system call RENAME_EXCHANGE flag. + * + * Copyright 2022 Red Hat Inc. 
+ * Author: Javier Martinez Canillas + */ + +#define _GNU_SOURCE +#include +#include +#include + +void print_usage(const char *program) +{ + printf("Usage: %s [oldpath] [newpath]\n", program); + printf("Atomically exchange oldpath and newpath\n"); +} + +int main(int argc, char *argv[]) +{ + int ret; + + if (argc != 3) { + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + ret = renameat2(AT_FDCWD, argv[1], AT_FDCWD, argv[2], RENAME_EXCHANGE); + if (ret) { + perror("rename exchange failed"); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh new file mode 100644 index 000000000000..7f35dc3d15df --- /dev/null +++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run filesystem operations tests on an 1 MiB disk image that is formatted with +# a vfat filesystem and mounted in a temporary directory using a loop device. +# +# Copyright 2022 Red Hat Inc. +# Author: Javier Martinez Canillas + +set -e +set -u +set -o pipefail + +BASE_DIR="$(dirname $0)" +TMP_DIR="$(mktemp -d /tmp/fat_tests_tmp.XXXX)" +IMG_PATH="${TMP_DIR}/fat.img" +MNT_PATH="${TMP_DIR}/mnt" + +cleanup() +{ + mountpoint -q "${MNT_PATH}" && unmount_image + rm -rf "${TMP_DIR}" +} +trap cleanup SIGINT SIGTERM EXIT + +create_loopback() +{ + touch "${IMG_PATH}" + chattr +C "${IMG_PATH}" >/dev/null 2>&1 || true + + truncate -s 1M "${IMG_PATH}" + mkfs.vfat "${IMG_PATH}" >/dev/null 2>&1 +} + +mount_image() +{ + mkdir -p "${MNT_PATH}" + sudo mount -o loop "${IMG_PATH}" "${MNT_PATH}" +} + +rename_exchange_test() +{ + local rename_exchange="${BASE_DIR}/rename_exchange" + local old_path="${MNT_PATH}/old_file" + local new_path="${MNT_PATH}/new_file" + + echo old | sudo tee "${old_path}" >/dev/null 2>&1 + echo new | sudo tee "${new_path}" >/dev/null 2>&1 + sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1 + sudo sync -f "${MNT_PATH}" + grep new "${old_path}" >/dev/null 2>&1 + grep old "${new_path}" >/dev/null 2>&1 +} + +rename_exchange_subdir_test() +{ + local rename_exchange="${BASE_DIR}/rename_exchange" + local dir_path="${MNT_PATH}/subdir" + local old_path="${MNT_PATH}/old_file" + local new_path="${dir_path}/new_file" + + sudo mkdir -p "${dir_path}" + echo old | sudo tee "${old_path}" >/dev/null 2>&1 + echo new | sudo tee "${new_path}" >/dev/null 2>&1 + sudo "${rename_exchange}" "${old_path}" "${new_path}" >/dev/null 2>&1 + sudo sync -f "${MNT_PATH}" + grep new "${old_path}" >/dev/null 2>&1 + grep old "${new_path}" >/dev/null 2>&1 +} + +unmount_image() +{ + sudo umount "${MNT_PATH}" &> /dev/null +} + +create_loopback +mount_image +rename_exchange_test +rename_exchange_subdir_test +unmount_image + +exit 0 -- cgit v1.2.3 From ac80287a6af9fc3f3d189d6d1f523889a0a9e1bc Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:42 +0300 Subject: bpf: Fix documentation of th_len in bpf_tcp_{gen,check}_syncookie bpf_tcp_gen_syncookie expects the full length of the TCP header (with all options), and bpf_tcp_check_syncookie accepts lengths bigger than sizeof(struct tcphdr). Fix the documentation that says these lengths should be exactly sizeof(struct tcphdr). While at it, fix a typo in the name of struct ipv6hdr. 
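To illustrate the corrected contract, here is a hedged sketch (not part
of the patch) of how a BPF program can derive a valid *th_len*: TCP's
*doff* field counts 32-bit words, so the full header length including
options is doff * 4, checked against the packet end:

    #include <linux/types.h>
    #include <linux/tcp.h>
    #include <bpf/bpf_helpers.h>

    /* Return the full TCP header length (options included) suitable
     * for th_len, or 0 if doff is malformed or the header would
     * overrun the packet.
     */
    static __always_inline __u32 tcp_hdr_len(const struct tcphdr *th,
                                             const void *data_end)
    {
        __u32 th_len = th->doff * 4;

        if (th_len < sizeof(*th) || (const void *)th + th_len > data_end)
            return 0;
        return th_len;
    }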
Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-2-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 ++++++---- tools/include/uapi/linux/bpf.h | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4009dbdf62d..f545e39df72a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3597,10 +3597,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains **sizeof**\ (**struct tcphdr**). + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. @@ -3783,10 +3784,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains the length of the TCP header. + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4009dbdf62d..f545e39df72a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3597,10 +3597,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains **sizeof**\ (**struct tcphdr**). + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. @@ -3783,10 +3784,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains the length of the TCP header. + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, -- cgit v1.2.3 From 33bf9885040c399cf6a95bd33216644126728e14 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:44 +0300 Subject: bpf: Add helpers to issue and check SYN cookies in XDP The new helpers bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6} allow an XDP program to generate SYN cookies in response to TCP SYN packets and to check those cookies upon receiving the first ACK packet (the final packet of the TCP handshake). Unlike bpf_tcp_{gen,check}_syncookie these new helpers don't need a listening socket on the local machine, which allows to use them together with synproxy to accelerate SYN cookie generation. 
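As a usage illustration (hedged: an XDP fragment with parsing and
bounds checks elided; *iph* and *th* are assumed to point at already
validated headers):

    /* SYN path: issue a cookie with no listening socket involved.
     * The helper packs the cookie into the low 32 bits of the s64
     * return value and the MSS into the next 16 bits.
     */
    __s64 val = bpf_tcp_raw_gen_syncookie_ipv4(iph, th, th->doff * 4);
    __u32 cookie;
    __u16 mss;

    if (val < 0)
        return XDP_ABORTED;    /* e.g. -EINVAL for a bad th_len */
    cookie = (__u32)val;
    mss = val >> 32;
    /* ... build and transmit the SYNACK from cookie and mss ... */

    /* ACK path: validate the echoed cookie, again without a socket. */
    if (bpf_tcp_raw_check_syncookie_ipv4(iph, th))
        return XDP_DROP;    /* -EACCES: cookie did not check out */

The IPv6 variants work the same way and return -EPROTONOSUPPORT when
CONFIG_IPV6 is not built in.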
Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-4-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 1 + include/uapi/linux/bpf.h | 78 +++++++++++++++++++++++++++ net/core/filter.c | 118 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 3 +- scripts/bpf_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 78 +++++++++++++++++++++++++++ 6 files changed, 281 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e99f5c61f84..9a1efe23fab7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -432,6 +432,7 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f545e39df72a..e81362891596 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5251,6 +5251,80 @@ union bpf_attr { * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. 
+ * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5457,6 +5531,10 @@ union bpf_attr { FN(dynptr_read), \ FN(dynptr_write), \ FN(dynptr_data), \ + FN(tcp_raw_gen_syncookie_ipv4), \ + FN(tcp_raw_gen_syncookie_ipv6), \ + FN(tcp_raw_check_syncookie_ipv4), \ + FN(tcp_raw_check_syncookie_ipv6), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index 5af58eb48587..b62d4126a561 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7444,6 +7444,114 @@ static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_SYN_COOKIES +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th, u32, th_len) +{ + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; + cookie = __cookie_v4_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th, u32, th_len) +{ +#if IS_BUILTIN(CONFIG_IPV6) + const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr); + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; + cookie = __cookie_v6_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th) +{ + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v4_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th) +{ +#if 
IS_BUILTIN(CONFIG_IPV6) + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v6_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; +#endif /* CONFIG_SYN_COOKIES */ + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7856,6 +7964,16 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif #endif default: return bpf_sk_base_func_proto(func_id); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2e2a9ece9af2..6426f6a2e744 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3967,7 +3967,7 @@ static bool smc_parse_options(const struct tcphdr *th, /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ -static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) { const unsigned char *ptr = (const unsigned char *)(th + 1); int length = (th->doff * 4) - sizeof(struct tcphdr); @@ -4006,6 +4006,7 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) } return mss; } +EXPORT_SYMBOL_GPL(tcp_parse_mss_option); /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 855b937e7585..a0ec321469bd 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -635,6 +635,8 @@ class PrinterHelpers(Printer): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'struct iphdr', + 'struct ipv6hdr', ] known_types = { '...', @@ -686,6 +688,8 @@ class PrinterHelpers(Printer): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'struct iphdr', + 'struct ipv6hdr', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f545e39df72a..e81362891596 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5251,6 +5251,80 @@ union bpf_attr { * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. 
+ * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5457,6 +5531,10 @@ union bpf_attr { FN(dynptr_read), \ FN(dynptr_write), \ FN(dynptr_data), \ + FN(tcp_raw_gen_syncookie_ipv4), \ + FN(tcp_raw_gen_syncookie_ipv6), \ + FN(tcp_raw_check_syncookie_ipv4), \ + FN(tcp_raw_check_syncookie_ipv6), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From fb5cd0ce70d43b9bf589aa05aaa067350c3d3b26 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:45 +0300 Subject: selftests/bpf: Add selftests for raw syncookie helpers This commit adds selftests for the new BPF helpers: bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}. xdp_synproxy_kern.c is a BPF program that generates SYN cookies on allowed TCP ports and sends SYNACKs to clients, accelerating synproxy iptables module. xdp_synproxy.c is a userspace control application that allows to configure the following options in runtime: list of allowed ports, MSS, window scale, TTL. A selftest is added to prog_tests that leverages the above programs to test the functionality of the new helpers. 
Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-5-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/xdp_synproxy.c | 146 ++++ .../selftests/bpf/progs/xdp_synproxy_kern.c | 763 +++++++++++++++++++++ tools/testing/selftests/bpf/xdp_synproxy.c | 418 +++++++++++ 5 files changed, 1330 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 595565eb68c0..ca2f47f45670 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -43,3 +43,4 @@ test_cpp *.tmp xdpxceiver xdp_redirect_multi +xdp_synproxy diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e08e8e34e793..8b30bb743e24 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,7 +82,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ - xdpxceiver xdp_redirect_multi + xdpxceiver xdp_redirect_multi xdp_synproxy TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read @@ -504,6 +504,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ cap_helpers.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ + $(OUTPUT)/xdp_synproxy \ ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c new file mode 100644 index 000000000000..d9ee884c2a2b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include +#include +#include + +#define CMD_OUT_BUF_SIZE 1023 + +#define SYS(cmd) ({ \ + if (!ASSERT_OK(system(cmd), (cmd))) \ + goto out; \ +}) + +#define SYS_OUT(cmd) ({ \ + FILE *f = popen((cmd), "r"); \ + if (!ASSERT_OK_PTR(f, (cmd))) \ + goto out; \ + f; \ +}) + +/* out must be at least `size * 4 + 1` bytes long */ +static void escape_str(char *out, const char *in, size_t size) +{ + static const char *hex = "0123456789ABCDEF"; + size_t i; + + for (i = 0; i < size; i++) { + if (isprint(in[i]) && in[i] != '\\' && in[i] != '\'') { + *out++ = in[i]; + } else { + *out++ = '\\'; + *out++ = 'x'; + *out++ = hex[(in[i] >> 4) & 0xf]; + *out++ = hex[in[i] & 0xf]; + } + } + *out++ = '\0'; +} + +static bool expect_str(char *buf, size_t size, const char *str, const char *name) +{ + static char escbuf_expected[CMD_OUT_BUF_SIZE * 4]; + static char escbuf_actual[CMD_OUT_BUF_SIZE * 4]; + static int duration = 0; + bool ok; + + ok = size == strlen(str) && !memcmp(buf, str, size); + + if (!ok) { + escape_str(escbuf_expected, str, strlen(str)); + escape_str(escbuf_actual, buf, size); + } + CHECK(!ok, name, "unexpected %s: actual '%s' != expected '%s'\n", + name, escbuf_actual, escbuf_expected); + + return ok; +} + +void test_xdp_synproxy(void) +{ + int server_fd = -1, client_fd = -1, accept_fd = -1; + struct nstoken *ns = NULL; + FILE *ctrl_file = NULL; + char buf[CMD_OUT_BUF_SIZE]; + size_t size; + + SYS("ip netns add synproxy"); + + SYS("ip link add tmp0 type veth peer name tmp1"); + SYS("ip link set tmp1 netns synproxy"); + SYS("ip link set tmp0 up"); + SYS("ip addr replace 198.18.0.1/24 dev tmp0"); + + /* When checksum offload is enabled, the XDP program sees wrong + * checksums and drops packets. + */ + SYS("ethtool -K tmp0 tx off"); + /* Workaround required for veth. 
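XDP_TX frames coming back from tmp1 are only handled natively when the peer (tmp0) also has an XDP program attached, so a dummy program is loaded on it below.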
*/ + SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); + + ns = open_netns("synproxy"); + if (!ASSERT_OK_PTR(ns, "setns")) + goto out; + + SYS("ip link set lo up"); + SYS("ip link set tmp1 up"); + SYS("ip addr replace 198.18.0.2/24 dev tmp1"); + SYS("sysctl -w net.ipv4.tcp_syncookies=2"); + SYS("sysctl -w net.ipv4.tcp_timestamps=1"); + SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0"); + SYS("iptables -t raw -I PREROUTING \ + -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack"); + SYS("iptables -t filter -A INPUT \ + -i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \ + -j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460"); + SYS("iptables -t filter -A INPUT \ + -i tmp1 -m state --state INVALID -j DROP"); + + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \ + --mss4 1460 --mss6 1440 --wscale 7 --ttl 64"); + size = fread(buf, 1, sizeof(buf), ctrl_file); + pclose(ctrl_file); + if (!expect_str(buf, size, "Total SYNACKs generated: 0\n", + "initial SYNACKs")) + goto out; + + server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0); + if (!ASSERT_GE(server_fd, 0, "start_server")) + goto out; + + close_netns(ns); + ns = NULL; + + client_fd = connect_to_fd(server_fd, 10000); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto out; + + accept_fd = accept(server_fd, NULL, NULL); + if (!ASSERT_GE(accept_fd, 0, "accept")) + goto out; + + ns = open_netns("synproxy"); + if (!ASSERT_OK_PTR(ns, "setns")) + goto out; + + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); + size = fread(buf, 1, sizeof(buf), ctrl_file); + pclose(ctrl_file); + if (!expect_str(buf, size, "Total SYNACKs generated: 1\n", + "SYNACKs after connection")) + goto out; + +out: + if (accept_fd >= 0) + close(accept_fd); + if (client_fd >= 0) + close(client_fd); + if (server_fd >= 0) + close(server_fd); + if (ns) + close_netns(ns); + + system("ip link del tmp0"); + system("ip netns del synproxy"); +} diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c new file mode 100644 index 000000000000..53b9865276a4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include "vmlinux.h" + +#include +#include +#include + +#define NSEC_PER_SEC 1000000000L + +#define ETH_ALEN 6 +#define ETH_P_IP 0x0800 +#define ETH_P_IPV6 0x86DD + +#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) + +#define IP_DF 0x4000 +#define IP_MF 0x2000 +#define IP_OFFSET 0x1fff + +#define NEXTHDR_TCP 6 + +#define TCPOPT_NOP 1 +#define TCPOPT_EOL 0 +#define TCPOPT_MSS 2 +#define TCPOPT_WINDOW 3 +#define TCPOPT_SACK_PERM 4 +#define TCPOPT_TIMESTAMP 8 + +#define TCPOLEN_MSS 4 +#define TCPOLEN_WINDOW 3 +#define TCPOLEN_SACK_PERM 2 +#define TCPOLEN_TIMESTAMP 10 + +#define TCP_TS_HZ 1000 +#define TS_OPT_WSCALE_MASK 0xf +#define TS_OPT_SACK (1 << 4) +#define TS_OPT_ECN (1 << 5) +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) +#define TCP_MAX_WSCALE 14U + +#define IPV4_MAXLEN 60 +#define TCP_MAXLEN 60 + +#define DEFAULT_MSS4 1460 +#define DEFAULT_MSS6 1440 +#define DEFAULT_WSCALE 7 +#define DEFAULT_TTL 64 +#define MAX_ALLOWED_PORTS 8 + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#define __get_unaligned_t(type, ptr) ({ \ + const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \ + __pptr->x; \ +}) + +#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 2); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u16); + __uint(max_entries, MAX_ALLOWED_PORTS); +} allowed_ports SEC(".maps"); + +extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, + struct bpf_sock_tuple *bpf_tuple, + __u32 len_tuple, + struct bpf_ct_opts *opts, + __u32 len_opts) __ksym; + +extern void bpf_ct_release(struct nf_conn *ct) __ksym; + +static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) +{ + __u8 tmp[ETH_ALEN]; + + __builtin_memcpy(tmp, a, ETH_ALEN); + __builtin_memcpy(a, b, ETH_ALEN); + __builtin_memcpy(b, tmp, ETH_ALEN); +} + +static __always_inline __u16 csum_fold(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (__u16)~csum; +} + +static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, + __u32 csum) +{ + __u64 s = csum; + + s += (__u32)saddr; + s += (__u32)daddr; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + s += proto + len; +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + s += (proto + len) << 8; +#else +#error Unknown endian +#endif + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __u32 csum) +{ + __u64 sum = csum; + int i; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)saddr->in6_u.u6_addr32[i]; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)daddr->in6_u.u6_addr32[i]; + + /* Don't combine additions to avoid 32-bit overflow. 
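Both operands are 32-bit bpf_htonl() results, so a combined addition could wrap before it reaches the 64-bit accumulator.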
*/ + sum += bpf_htonl(len); + sum += bpf_htonl(proto); + + sum = (sum & 0xffffffff) + (sum >> 32); + sum = (sum & 0xffffffff) + (sum >> 32); + + return csum_fold((__u32)sum); +} + +static __always_inline __u64 tcp_clock_ns(void) +{ + return bpf_ktime_get_ns(); +} + +static __always_inline __u32 tcp_ns_to_ts(__u64 ns) +{ + return ns / (NSEC_PER_SEC / TCP_TS_HZ); +} + +static __always_inline __u32 tcp_time_stamp_raw(void) +{ + return tcp_ns_to_ts(tcp_clock_ns()); +} + +struct tcpopt_context { + __u8 *ptr; + __u8 *end; + void *data_end; + __be32 *tsecr; + __u8 wscale; + bool option_timestamp; + bool option_sack; +}; + +static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +{ + __u8 opcode, opsize; + + if (ctx->ptr >= ctx->end) + return 1; + if (ctx->ptr >= ctx->data_end) + return 1; + + opcode = ctx->ptr[0]; + + if (opcode == TCPOPT_EOL) + return 1; + if (opcode == TCPOPT_NOP) { + ++ctx->ptr; + return 0; + } + + if (ctx->ptr + 1 >= ctx->end) + return 1; + if (ctx->ptr + 1 >= ctx->data_end) + return 1; + opsize = ctx->ptr[1]; + if (opsize < 2) + return 1; + + if (ctx->ptr + opsize > ctx->end) + return 1; + + switch (opcode) { + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) + ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { + ctx->option_timestamp = true; + /* Client's tsval becomes our tsecr. */ + *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) + ctx->option_sack = true; + break; + } + + ctx->ptr += opsize; + + return 0; +} + +static int tscookie_tcpopt_parse_batch(__u32 index, void *context) +{ + int i; + + for (i = 0; i < 7; i++) + if (tscookie_tcpopt_parse(context)) + return 1; + return 0; +} + +static __always_inline bool tscookie_init(struct tcphdr *tcp_header, + __u16 tcp_len, __be32 *tsval, + __be32 *tsecr, void *data_end) +{ + struct tcpopt_context loop_ctx = { + .ptr = (__u8 *)(tcp_header + 1), + .end = (__u8 *)tcp_header + tcp_len, + .data_end = data_end, + .tsecr = tsecr, + .wscale = TS_OPT_WSCALE_MASK, + .option_timestamp = false, + .option_sack = false, + }; + u32 cookie; + + bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); + + if (!loop_ctx.option_timestamp) + return false; + + cookie = tcp_time_stamp_raw() & ~TSMASK; + cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; + if (loop_ctx.option_sack) + cookie |= TS_OPT_SACK; + if (tcp_header->ece && tcp_header->cwr) + cookie |= TS_OPT_ECN; + *tsval = bpf_htonl(cookie); + + return true; +} + +static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, + __u8 *ttl, bool ipv6) +{ + __u32 key = 0; + __u64 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value && *value != 0) { + if (ipv6) + *mss = (*value >> 32) & 0xffff; + else + *mss = *value & 0xffff; + *wscale = (*value >> 16) & 0xf; + *ttl = (*value >> 24) & 0xff; + return; + } + + *mss = ipv6 ? 
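/* no user-supplied options in the map: fall back to the compile-time defaults */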
DEFAULT_MSS6 : DEFAULT_MSS4; + *wscale = DEFAULT_WSCALE; + *ttl = DEFAULT_TTL; +} + +static __always_inline void values_inc_synacks(void) +{ + __u32 key = 1; + __u32 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value) + __sync_fetch_and_add(value, 1); +} + +static __always_inline bool check_port_allowed(__u16 port) +{ + __u32 i; + + for (i = 0; i < MAX_ALLOWED_PORTS; i++) { + __u32 key = i; + __u16 *value; + + value = bpf_map_lookup_elem(&allowed_ports, &key); + + if (!value) + break; + /* 0 is a terminator value. Check it first to avoid matching on + * a forbidden port == 0 and returning true. + */ + if (*value == 0) + break; + + if (*value == port) + return true; + } + + return false; +} + +struct header_pointers { + struct ethhdr *eth; + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + struct tcphdr *tcp; + __u16 tcp_len; +}; + +static __always_inline int tcp_dissect(void *data, void *data_end, + struct header_pointers *hdr) +{ + hdr->eth = data; + if (hdr->eth + 1 > data_end) + return XDP_DROP; + + switch (bpf_ntohs(hdr->eth->h_proto)) { + case ETH_P_IP: + hdr->ipv6 = NULL; + + hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv4 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) + return XDP_DROP; + if (hdr->ipv4->version != 4) + return XDP_DROP; + + if (hdr->ipv4->protocol != IPPROTO_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; + break; + case ETH_P_IPV6: + hdr->ipv4 = NULL; + + hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv6 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv6->version != 6) + return XDP_DROP; + + /* XXX: Extension headers are not supported and could circumvent + * XDP SYN flood protection. + */ + if (hdr->ipv6->nexthdr != NEXTHDR_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); + break; + default: + /* XXX: VLANs will circumvent XDP SYN flood protection. */ + return XDP_PASS; + } + + if (hdr->tcp + 1 > data_end) + return XDP_DROP; + hdr->tcp_len = hdr->tcp->doff * 4; + if (hdr->tcp_len < sizeof(*hdr->tcp)) + return XDP_DROP; + + return XDP_TX; +} + +static __always_inline int tcp_lookup(struct xdp_md *ctx, struct header_pointers *hdr) +{ + struct bpf_ct_opts ct_lookup_opts = { + .netns_id = BPF_F_CURRENT_NETNS, + .l4proto = IPPROTO_TCP, + }; + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + __u32 tup_size; + + if (hdr->ipv4) { + /* TCP doesn't normally use fragments, and XDP can't reassemble + * them. + */ + if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) + return XDP_DROP; + + tup.ipv4.saddr = hdr->ipv4->saddr; + tup.ipv4.daddr = hdr->ipv4->daddr; + tup.ipv4.sport = hdr->tcp->source; + tup.ipv4.dport = hdr->tcp->dest; + tup_size = sizeof(tup.ipv4); + } else if (hdr->ipv6) { + __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); + __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); + tup.ipv6.sport = hdr->tcp->source; + tup.ipv6.dport = hdr->tcp->dest; + tup_size = sizeof(tup.ipv6); + } else { + /* The verifier can't track that either ipv4 or ipv6 is not + * NULL. 
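tcp_dissect() has already ensured that one of them is set, but that guarantee is not visible at this point, so fail safe with XDP_ABORTED.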
+ */ + return XDP_ABORTED; + } + ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); + if (ct) { + unsigned long status = ct->status; + + bpf_ct_release(ct); + if (status & IPS_CONFIRMED_BIT) + return XDP_PASS; + } else if (ct_lookup_opts.error != -ENOENT) { + return XDP_ABORTED; + } + + /* error == -ENOENT || !(status & IPS_CONFIRMED_BIT) */ + return XDP_TX; +} + +static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, + __u8 wscale) +{ + __be32 *start = buf; + + *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + + if (!tsopt) + return buf - start; + + if (tsopt[0] & bpf_htonl(1 << 4)) + *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *buf++ = tsopt[0]; + *buf++ = tsopt[1]; + + if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + wscale); + + return buf - start; +} + +static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, + __u32 cookie, __be32 *tsopt, + __u16 mss, __u8 wscale) +{ + void *tcp_options; + + tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) + tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; + tcp_header->doff = 5; /* doff is part of tcp_flag_word. */ + swap(tcp_header->source, tcp_header->dest); + tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); + tcp_header->seq = bpf_htonl(cookie); + tcp_header->window = 0; + tcp_header->urg_ptr = 0; + tcp_header->check = 0; /* Calculate checksum later. */ + + tcp_options = (void *)(tcp_header + 1); + tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); +} + +static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, false); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv4->saddr, hdr->ipv4->daddr); + hdr->ipv4->check = 0; /* Calculate checksum later. */ + hdr->ipv4->tos = 0; + hdr->ipv4->id = 0; + hdr->ipv4->ttl = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); +} + +static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, true); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv6->saddr, hdr->ipv6->daddr); + *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); + hdr->ipv6->hop_limit = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); +} + +static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, + struct xdp_md *ctx, + void *data, void *data_end) +{ + __u32 old_pkt_size, new_pkt_size; + /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the + * BPF verifier if tsopt is not volatile. 
Volatile forces it to store + * the pointer value and use it directly, otherwise tcp_mkoptions is + * (mis)compiled like this: + * if (!tsopt) + * return buf - start; + * reg = stored_return_value_of_tscookie_init; + * if (reg) + * tsopt = tsopt_buf; + * else + * tsopt = NULL; + * ... + * *buf++ = tsopt[1]; + * It creates a dead branch where tsopt is assigned NULL, but the + * verifier can't prove it's dead and blocks the program. + */ + __be32 * volatile tsopt = NULL; + __be32 tsopt_buf[2] = {}; + __u16 ip_len; + __u32 cookie; + __s64 value; + + /* Checksum is not yet verified, but both checksum failure and TCP + * header checks return XDP_DROP, so the order doesn't matter. + */ + if (hdr->tcp->fin || hdr->tcp->rst) + return XDP_DROP; + + /* Issue SYN cookies on allowed ports, drop SYN packets on blocked + * ports. + */ + if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) + return XDP_DROP; + + if (hdr->ipv4) { + /* Check the IPv4 and TCP checksums before creating a SYNACK. */ + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_fold(value) != 0) + return XDP_DROP; /* Bad IPv4 checksum. */ + + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; /* Bad TCP checksum. */ + + ip_len = sizeof(*hdr->ipv4); + + value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, + hdr->tcp_len); + } else if (hdr->ipv6) { + /* Check the TCP checksum before creating a SYNACK. */ + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; /* Bad TCP checksum. */ + + ip_len = sizeof(*hdr->ipv6); + + value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, + hdr->tcp_len); + } else { + return XDP_ABORTED; + } + + if (value < 0) + return XDP_ABORTED; + cookie = (__u32)value; + + if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, + &tsopt_buf[0], &tsopt_buf[1], data_end)) + tsopt = tsopt_buf; + + /* Check that there is enough space for a SYNACK. It also covers + * the check that the destination of the __builtin_memmove below + * doesn't overflow. + */ + if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + if (hdr->ipv4) { + if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { + struct tcphdr *new_tcp_header; + + new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); + __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); + hdr->tcp = new_tcp_header; + + hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; + } + + tcpv4_gen_synack(hdr, cookie, tsopt); + } else if (hdr->ipv6) { + tcpv6_gen_synack(hdr, cookie, tsopt); + } else { + return XDP_ABORTED; + } + + /* Recalculate checksums. 
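The TCP checksum is combined with the pseudo-header via csum_tcpudp_magic() or csum_ipv6_magic(); for IPv4, the IP header checksum is then recomputed separately.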
*/ + hdr->tcp->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (hdr->ipv4) { + hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, + hdr->ipv4->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + + hdr->ipv4->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); + if (value < 0) + return XDP_ABORTED; + hdr->ipv4->check = csum_fold(value); + } else if (hdr->ipv6) { + hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, + &hdr->ipv6->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + } else { + return XDP_ABORTED; + } + + /* Set the new packet size. */ + old_pkt_size = data_end - data; + new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; + if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) + return XDP_ABORTED; + + values_inc_synacks(); + + return XDP_TX; +} + +static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) +{ + int err; + + if (hdr->tcp->rst) + return XDP_DROP; + + if (hdr->ipv4) + err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); + else if (hdr->ipv6) + err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); + else + return XDP_ABORTED; + if (err) + return XDP_DROP; + + return XDP_PASS; +} + +SEC("xdp") +int syncookie_xdp(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct header_pointers hdr; + __s64 value; + int ret; + + struct bpf_ct_opts ct_lookup_opts = { + .netns_id = BPF_F_CURRENT_NETNS, + .l4proto = IPPROTO_TCP, + }; + + ret = tcp_dissect(data, data_end, &hdr); + if (ret != XDP_TX) + return ret; + + ret = tcp_lookup(ctx, &hdr); + if (ret != XDP_TX) + return ret; + + /* Packet is TCP and doesn't belong to an established connection. */ + + if ((hdr.tcp->syn ^ hdr.tcp->ack) != 1) + return XDP_DROP; + + /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr.tcp_len + * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. + */ + if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr.tcp_len)) + return XDP_ABORTED; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + if (hdr.ipv4) { + hdr.eth = data; + hdr.ipv4 = (void *)hdr.eth + sizeof(*hdr.eth); + /* IPV4_MAXLEN is needed when calculating checksum. + * At least sizeof(struct iphdr) is needed here to access ihl. + */ + if ((void *)hdr.ipv4 + IPV4_MAXLEN > data_end) + return XDP_ABORTED; + hdr.tcp = (void *)hdr.ipv4 + hdr.ipv4->ihl * 4; + } else if (hdr.ipv6) { + hdr.eth = data; + hdr.ipv6 = (void *)hdr.eth + sizeof(*hdr.eth); + hdr.tcp = (void *)hdr.ipv6 + sizeof(*hdr.ipv6); + } else { + return XDP_ABORTED; + } + + if ((void *)hdr.tcp + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + /* We run out of registers, tcp_len gets spilled to the stack, and the + * verifier forgets its min and max values checked above in tcp_dissect. + */ + hdr.tcp_len = hdr.tcp->doff * 4; + if (hdr.tcp_len < sizeof(*hdr.tcp)) + return XDP_ABORTED; + + return hdr.tcp->syn ? syncookie_handle_syn(&hdr, ctx, data, data_end) : + syncookie_handle_ack(&hdr); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c new file mode 100644 index 000000000000..4653d4655b5f --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_synproxy.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int ifindex; +static __u32 attached_prog_id; + +static void noreturn cleanup(int sig) +{ + DECLARE_LIBBPF_OPTS(bpf_xdp_attach_opts, opts); + int prog_fd; + int err; + + if (attached_prog_id == 0) + exit(0); + + prog_fd = bpf_prog_get_fd_by_id(attached_prog_id); + if (prog_fd < 0) { + fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd)); + err = bpf_xdp_attach(ifindex, -1, 0, NULL); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err)); + fprintf(stderr, "Failed to detach XDP program\n"); + exit(1); + } + } else { + opts.old_prog_fd = prog_fd; + err = bpf_xdp_attach(ifindex, -1, XDP_FLAGS_REPLACE, &opts); + close(prog_fd); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd_opts: %s\n", strerror(-err)); + /* Not an error if already replaced by someone else. */ + if (err != -EEXIST) { + fprintf(stderr, "Failed to detach XDP program\n"); + exit(1); + } + } + } + exit(0); +} + +static noreturn void usage(const char *progname) +{ + fprintf(stderr, "Usage: %s [--iface |--prog ] [--mss4 --mss6 --wscale --ttl ] [--ports ,,...] [--single]\n", + progname); + exit(1); +} + +static unsigned long parse_arg_ul(const char *progname, const char *arg, unsigned long limit) +{ + unsigned long res; + char *endptr; + + errno = 0; + res = strtoul(arg, &endptr, 10); + if (errno != 0 || *endptr != '\0' || arg[0] == '\0' || res > limit) + usage(progname); + + return res; +} + +static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 *prog_id, + __u64 *tcpipopts, char **ports, bool *single) +{ + static struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "iface", required_argument, NULL, 'i' }, + { "prog", required_argument, NULL, 'x' }, + { "mss4", required_argument, NULL, 4 }, + { "mss6", required_argument, NULL, 6 }, + { "wscale", required_argument, NULL, 'w' }, + { "ttl", required_argument, NULL, 't' }, + { "ports", required_argument, NULL, 'p' }, + { "single", no_argument, NULL, 's' }, + { NULL, 0, NULL, 0 }, + }; + unsigned long mss4, mss6, wscale, ttl; + unsigned int tcpipopts_mask = 0; + + if (argc < 2) + usage(argv[0]); + + *ifindex = 0; + *prog_id = 0; + *tcpipopts = 0; + *ports = NULL; + *single = false; + + while (true) { + int opt; + + opt = getopt_long(argc, argv, "", long_options, NULL); + if (opt == -1) + break; + + switch (opt) { + case 'h': + usage(argv[0]); + break; + case 'i': + *ifindex = if_nametoindex(optarg); + if (*ifindex == 0) + usage(argv[0]); + break; + case 'x': + *prog_id = parse_arg_ul(argv[0], optarg, UINT32_MAX); + if (*prog_id == 0) + usage(argv[0]); + break; + case 4: + mss4 = parse_arg_ul(argv[0], optarg, UINT16_MAX); + tcpipopts_mask |= 1 << 0; + break; + case 6: + mss6 = parse_arg_ul(argv[0], optarg, UINT16_MAX); + tcpipopts_mask |= 1 << 1; + break; + case 'w': + wscale = parse_arg_ul(argv[0], optarg, 14); + tcpipopts_mask |= 1 << 2; + break; + case 't': + ttl = parse_arg_ul(argv[0], optarg, UINT8_MAX); + tcpipopts_mask |= 1 << 3; + break; + case 'p': + *ports = optarg; + break; + case 's': + *single = true; + break; + default: + usage(argv[0]); + } + } + if (optind < argc) + usage(argv[0]); + + if (tcpipopts_mask == 0xf) { + if (mss4 == 0 || mss6 == 0 || wscale == 0 || ttl == 0) + usage(argv[0]); + *tcpipopts = (mss6 << 32) | (ttl << 24) | (wscale << 16) | mss4; + } else if (tcpipopts_mask != 0) { + 
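/* A partial set of the four TCP/IP options was given: require all or none. */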
usage(argv[0]); + } + + if (*ifindex != 0 && *prog_id != 0) + usage(argv[0]); + if (*ifindex == 0 && *prog_id == 0) + usage(argv[0]); +} + +static int syncookie_attach(const char *argv0, unsigned int ifindex) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + char xdp_filename[PATH_MAX]; + struct bpf_program *prog; + struct bpf_object *obj; + int prog_fd; + int err; + + snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv0); + obj = bpf_object__open_file(xdp_filename, NULL); + err = libbpf_get_error(obj); + if (err < 0) { + fprintf(stderr, "Error: bpf_object__open_file: %s\n", strerror(-err)); + return err; + } + + err = bpf_object__load(obj); + if (err < 0) { + fprintf(stderr, "Error: bpf_object__open_file: %s\n", strerror(-err)); + return err; + } + + prog = bpf_object__find_program_by_name(obj, "syncookie_xdp"); + if (!prog) { + fprintf(stderr, "Error: bpf_object__find_program_by_name: program syncookie_xdp was not found\n"); + return -ENOENT; + } + + prog_fd = bpf_program__fd(prog); + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (err < 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + goto out; + } + attached_prog_id = info.id; + signal(SIGINT, cleanup); + signal(SIGTERM, cleanup); + err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err)); + signal(SIGINT, SIG_DFL); + signal(SIGTERM, SIG_DFL); + attached_prog_id = 0; + goto out; + } + err = 0; +out: + bpf_object__close(obj); + return err; +} + +static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports_map_fd) +{ + struct bpf_prog_info prog_info; + __u32 map_ids[8]; + __u32 info_len; + int prog_fd; + int err; + int i; + + *values_map_fd = -1; + *ports_map_fd = -1; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) { + fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd)); + return prog_fd; + } + + prog_info = (struct bpf_prog_info) { + .nr_map_ids = 8, + .map_ids = (__u64)map_ids, + }; + info_len = sizeof(prog_info); + + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); + if (err != 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + goto out; + } + + if (prog_info.type != BPF_PROG_TYPE_XDP) { + fprintf(stderr, "Error: BPF prog type is not BPF_PROG_TYPE_XDP\n"); + err = -ENOENT; + goto out; + } + if (prog_info.nr_map_ids < 2) { + fprintf(stderr, "Error: Found %u BPF maps, expected at least 2\n", + prog_info.nr_map_ids); + err = -ENOENT; + goto out; + } + + for (i = 0; i < prog_info.nr_map_ids; i++) { + struct bpf_map_info map_info = {}; + int map_fd; + + err = bpf_map_get_fd_by_id(map_ids[i]); + if (err < 0) { + fprintf(stderr, "Error: bpf_map_get_fd_by_id: %s\n", strerror(-err)); + goto err_close_map_fds; + } + map_fd = err; + + info_len = sizeof(map_info); + err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); + if (err != 0) { + fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); + close(map_fd); + goto err_close_map_fds; + } + if (strcmp(map_info.name, "values") == 0) { + *values_map_fd = map_fd; + continue; + } + if (strcmp(map_info.name, "allowed_ports") == 0) { + *ports_map_fd = map_fd; + continue; + } + close(map_fd); + } + + if (*values_map_fd != -1 && *ports_map_fd != -1) { + err = 0; + goto out; + } + + err = -ENOENT; + +err_close_map_fds: + if (*values_map_fd != -1) + close(*values_map_fd); + if (*ports_map_fd != -1) + 
close(*ports_map_fd); + *values_map_fd = -1; + *ports_map_fd = -1; + +out: + close(prog_fd); + return err; +} + +int main(int argc, char *argv[]) +{ + int values_map_fd, ports_map_fd; + __u64 tcpipopts; + bool firstiter; + __u64 prevcnt; + __u32 prog_id; + char *ports; + bool single; + int err = 0; + + parse_options(argc, argv, &ifindex, &prog_id, &tcpipopts, &ports, &single); + + if (prog_id == 0) { + err = bpf_xdp_query_id(ifindex, 0, &prog_id); + if (err < 0) { + fprintf(stderr, "Error: bpf_get_link_xdp_id: %s\n", strerror(-err)); + goto out; + } + if (prog_id == 0) { + err = syncookie_attach(argv[0], ifindex); + if (err < 0) + goto out; + prog_id = attached_prog_id; + } + } + + err = syncookie_open_bpf_maps(prog_id, &values_map_fd, &ports_map_fd); + if (err < 0) + goto out; + + if (ports) { + __u16 port_last = 0; + __u32 port_idx = 0; + char *p = ports; + + fprintf(stderr, "Replacing allowed ports\n"); + + while (p && *p != '\0') { + char *token = strsep(&p, ","); + __u16 port; + + port = parse_arg_ul(argv[0], token, UINT16_MAX); + err = bpf_map_update_elem(ports_map_fd, &port_idx, &port, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + fprintf(stderr, "Failed to add port %u (index %u)\n", + port, port_idx); + goto out_close_maps; + } + fprintf(stderr, "Added port %u\n", port); + port_idx++; + } + err = bpf_map_update_elem(ports_map_fd, &port_idx, &port_last, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + fprintf(stderr, "Failed to add the terminator value 0 (index %u)\n", + port_idx); + goto out_close_maps; + } + } + + if (tcpipopts) { + __u32 key = 0; + + fprintf(stderr, "Replacing TCP/IP options\n"); + + err = bpf_map_update_elem(values_map_fd, &key, &tcpipopts, BPF_ANY); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err)); + goto out_close_maps; + } + } + + if ((ports || tcpipopts) && attached_prog_id == 0 && !single) + goto out_close_maps; + + prevcnt = 0; + firstiter = true; + while (true) { + __u32 key = 1; + __u64 value; + + err = bpf_map_lookup_elem(values_map_fd, &key, &value); + if (err != 0) { + fprintf(stderr, "Error: bpf_map_lookup_elem: %s\n", strerror(-err)); + goto out_close_maps; + } + if (firstiter) { + prevcnt = value; + firstiter = false; + } + if (single) { + printf("Total SYNACKs generated: %llu\n", value); + break; + } + printf("SYNACKs generated: %llu (total %llu)\n", value - prevcnt, value); + prevcnt = value; + sleep(1); + } + +out_close_maps: + close(values_map_fd); + close(ports_map_fd); +out: + return err == 0 ? 0 : 1; +} -- cgit v1.2.3 From 784d5dc0efc28bd0a52ccfedb707eba71d8bc8af Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:47 +0300 Subject: selftests/bpf: Add selftests for raw syncookie helpers in TC mode This commit extends selftests for the new BPF helpers bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6} to also test the TC BPF functionality added in the previous commit. 
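Conceptually, the TC entry point added in this patch reuses the XDP code paths and translates their verdicts on return. The mapping can be summarized by this sketch (the helper name is illustrative; the patch open-codes the switch at the end of syncookie_tc()):

	static __always_inline int xdp_to_tc_verdict(struct __sk_buff *skb, int ret)
	{
		switch (ret) {
		case XDP_PASS:
			return TC_ACT_OK;
		case XDP_TX:
			/* TC has no XDP_TX equivalent: reflect the packet by
			 * redirecting it to the egress of the ingress device.
			 */
			return bpf_redirect(skb->ifindex, 0);
		default:
			return TC_ACT_SHOT;
		}
	}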
Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-7-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_synproxy.c | 55 ++++++-- .../selftests/bpf/progs/xdp_synproxy_kern.c | 142 +++++++++++++++------ tools/testing/selftests/bpf/xdp_synproxy.c | 96 ++++++++++---- 3 files changed, 224 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c index d9ee884c2a2b..fb77a123fe89 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ +#define _GNU_SOURCE #include #include #include @@ -12,9 +13,11 @@ goto out; \ }) -#define SYS_OUT(cmd) ({ \ - FILE *f = popen((cmd), "r"); \ - if (!ASSERT_OK_PTR(f, (cmd))) \ +#define SYS_OUT(cmd, ...) ({ \ + char buf[1024]; \ + snprintf(buf, sizeof(buf), (cmd), ##__VA_ARGS__); \ + FILE *f = popen(buf, "r"); \ + if (!ASSERT_OK_PTR(f, buf)) \ goto out; \ f; \ }) @@ -57,9 +60,10 @@ static bool expect_str(char *buf, size_t size, const char *str, const char *name return ok; } -void test_xdp_synproxy(void) +static void test_synproxy(bool xdp) { int server_fd = -1, client_fd = -1, accept_fd = -1; + char *prog_id, *prog_id_end; struct nstoken *ns = NULL; FILE *ctrl_file = NULL; char buf[CMD_OUT_BUF_SIZE]; @@ -76,8 +80,9 @@ void test_xdp_synproxy(void) * checksums and drops packets. */ SYS("ethtool -K tmp0 tx off"); - /* Workaround required for veth. */ - SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); + if (xdp) + /* Workaround required for veth. */ + SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null"); ns = open_netns("synproxy"); if (!ASSERT_OK_PTR(ns, "setns")) @@ -97,14 +102,34 @@ void test_xdp_synproxy(void) SYS("iptables -t filter -A INPUT \ -i tmp1 -m state --state INVALID -j DROP"); - ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \ - --mss4 1460 --mss6 1440 --wscale 7 --ttl 64"); + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 \ + --single --mss4 1460 --mss6 1440 \ + --wscale 7 --ttl 64%s", xdp ? 
"" : " --tc"); size = fread(buf, 1, sizeof(buf), ctrl_file); pclose(ctrl_file); if (!expect_str(buf, size, "Total SYNACKs generated: 0\n", "initial SYNACKs")) goto out; + if (!xdp) { + ctrl_file = SYS_OUT("tc filter show dev tmp1 ingress"); + size = fread(buf, 1, sizeof(buf), ctrl_file); + pclose(ctrl_file); + prog_id = memmem(buf, size, " id ", 4); + if (!ASSERT_OK_PTR(prog_id, "find prog id")) + goto out; + prog_id += 4; + if (!ASSERT_LT(prog_id, buf + size, "find prog id begin")) + goto out; + prog_id_end = prog_id; + while (prog_id_end < buf + size && *prog_id_end >= '0' && + *prog_id_end <= '9') + prog_id_end++; + if (!ASSERT_LT(prog_id_end, buf + size, "find prog id end")) + goto out; + *prog_id_end = '\0'; + } + server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0); if (!ASSERT_GE(server_fd, 0, "start_server")) goto out; @@ -124,7 +149,11 @@ void test_xdp_synproxy(void) if (!ASSERT_OK_PTR(ns, "setns")) goto out; - ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); + if (xdp) + ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single"); + else + ctrl_file = SYS_OUT("./xdp_synproxy --prog %s --single", + prog_id); size = fread(buf, 1, sizeof(buf), ctrl_file); pclose(ctrl_file); if (!expect_str(buf, size, "Total SYNACKs generated: 1\n", @@ -144,3 +173,11 @@ out: system("ip link del tmp0"); system("ip netns del synproxy"); } + +void test_xdp_synproxy(void) +{ + if (test__start_subtest("xdp")) + test_synproxy(true); + if (test__start_subtest("tc")) + test_synproxy(false); +} diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index 53b9865276a4..9fd62e94b5e6 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -7,6 +7,9 @@ #include #include +#define TC_ACT_OK 0 +#define TC_ACT_SHOT 2 + #define NSEC_PER_SEC 1000000000L #define ETH_ALEN 6 @@ -80,6 +83,12 @@ extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_ct_opts *opts, __u32 len_opts) __ksym; +extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, + struct bpf_sock_tuple *bpf_tuple, + u32 len_tuple, + struct bpf_ct_opts *opts, + u32 len_opts) __ksym; + extern void bpf_ct_release(struct nf_conn *ct) __ksym; static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) @@ -382,7 +391,7 @@ static __always_inline int tcp_dissect(void *data, void *data_end, return XDP_TX; } -static __always_inline int tcp_lookup(struct xdp_md *ctx, struct header_pointers *hdr) +static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) { struct bpf_ct_opts ct_lookup_opts = { .netns_id = BPF_F_CURRENT_NETNS, @@ -416,7 +425,10 @@ static __always_inline int tcp_lookup(struct xdp_md *ctx, struct header_pointers */ return XDP_ABORTED; } - ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); + if (xdp) + ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); + else + ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); if (ct) { unsigned long status = ct->status; @@ -529,8 +541,9 @@ static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, } static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, - struct xdp_md *ctx, - void *data, void *data_end) + void *ctx, + void *data, void *data_end, + bool xdp) { __u32 old_pkt_size, new_pkt_size; /* Unlike clang 10, clang 11 and 12 generate code that 
doesn't pass the @@ -666,8 +679,13 @@ static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, /* Set the new packet size. */ old_pkt_size = data_end - data; new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; - if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) - return XDP_ABORTED; + if (xdp) { + if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) + return XDP_ABORTED; + } else { + if (bpf_skb_change_tail(ctx, new_pkt_size, 0)) + return XDP_ABORTED; + } values_inc_synacks(); @@ -693,71 +711,123 @@ static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) return XDP_PASS; } -SEC("xdp") -int syncookie_xdp(struct xdp_md *ctx) +static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, + struct header_pointers *hdr, bool xdp) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct header_pointers hdr; - __s64 value; - int ret; - struct bpf_ct_opts ct_lookup_opts = { .netns_id = BPF_F_CURRENT_NETNS, .l4proto = IPPROTO_TCP, }; + int ret; - ret = tcp_dissect(data, data_end, &hdr); + ret = tcp_dissect(data, data_end, hdr); if (ret != XDP_TX) return ret; - ret = tcp_lookup(ctx, &hdr); + ret = tcp_lookup(ctx, hdr, xdp); if (ret != XDP_TX) return ret; /* Packet is TCP and doesn't belong to an established connection. */ - if ((hdr.tcp->syn ^ hdr.tcp->ack) != 1) + if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1) return XDP_DROP; - /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr.tcp_len + /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. */ - if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr.tcp_len)) - return XDP_ABORTED; + if (xdp) { + if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len)) + return XDP_ABORTED; + } else { + /* Without volatile the verifier throws this error: + * R9 32-bit pointer arithmetic prohibited + */ + volatile u64 old_len = data_end - data; - data_end = (void *)(long)ctx->data_end; - data = (void *)(long)ctx->data; + if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0)) + return XDP_ABORTED; + } + + return XDP_TX; +} - if (hdr.ipv4) { - hdr.eth = data; - hdr.ipv4 = (void *)hdr.eth + sizeof(*hdr.eth); +static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end, + struct header_pointers *hdr, bool xdp) +{ + if (hdr->ipv4) { + hdr->eth = data; + hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); /* IPV4_MAXLEN is needed when calculating checksum. * At least sizeof(struct iphdr) is needed here to access ihl. */ - if ((void *)hdr.ipv4 + IPV4_MAXLEN > data_end) + if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end) return XDP_ABORTED; - hdr.tcp = (void *)hdr.ipv4 + hdr.ipv4->ihl * 4; - } else if (hdr.ipv6) { - hdr.eth = data; - hdr.ipv6 = (void *)hdr.eth + sizeof(*hdr.eth); - hdr.tcp = (void *)hdr.ipv6 + sizeof(*hdr.ipv6); + hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; + } else if (hdr->ipv6) { + hdr->eth = data; + hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); + hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); } else { return XDP_ABORTED; } - if ((void *)hdr.tcp + TCP_MAXLEN > data_end) + if ((void *)hdr->tcp + TCP_MAXLEN > data_end) return XDP_ABORTED; /* We run out of registers, tcp_len gets spilled to the stack, and the * verifier forgets its min and max values checked above in tcp_dissect. 
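Re-reading doff and re-checking the lower bound just below re-establishes that range.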
*/ - hdr.tcp_len = hdr.tcp->doff * 4; - if (hdr.tcp_len < sizeof(*hdr.tcp)) + hdr->tcp_len = hdr->tcp->doff * 4; + if (hdr->tcp_len < sizeof(*hdr->tcp)) return XDP_ABORTED; - return hdr.tcp->syn ? syncookie_handle_syn(&hdr, ctx, data, data_end) : - syncookie_handle_ack(&hdr); + return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) : + syncookie_handle_ack(hdr); +} + +SEC("xdp") +int syncookie_xdp(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct header_pointers hdr; + int ret; + + ret = syncookie_part1(ctx, data, data_end, &hdr, true); + if (ret != XDP_TX) + return ret; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + return syncookie_part2(ctx, data, data_end, &hdr, true); +} + +SEC("tc") +int syncookie_tc(struct __sk_buff *skb) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct header_pointers hdr; + int ret; + + ret = syncookie_part1(skb, data, data_end, &hdr, false); + if (ret != XDP_TX) + return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT; + + data_end = (void *)(long)skb->data_end; + data = (void *)(long)skb->data; + + ret = syncookie_part2(skb, data, data_end, &hdr, false); + switch (ret) { + case XDP_PASS: + return TC_ACT_OK; + case XDP_TX: + return bpf_redirect(skb->ifindex, 0); + default: + return TC_ACT_SHOT; + } } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c index 4653d4655b5f..d874ddfb39c4 100644 --- a/tools/testing/selftests/bpf/xdp_synproxy.c +++ b/tools/testing/selftests/bpf/xdp_synproxy.c @@ -18,16 +18,31 @@ static unsigned int ifindex; static __u32 attached_prog_id; +static bool attached_tc; static void noreturn cleanup(int sig) { - DECLARE_LIBBPF_OPTS(bpf_xdp_attach_opts, opts); + LIBBPF_OPTS(bpf_xdp_attach_opts, opts); int prog_fd; int err; if (attached_prog_id == 0) exit(0); + if (attached_tc) { + LIBBPF_OPTS(bpf_tc_hook, hook, + .ifindex = ifindex, + .attach_point = BPF_TC_INGRESS); + + err = bpf_tc_hook_destroy(&hook); + if (err < 0) { + fprintf(stderr, "Error: bpf_tc_hook_destroy: %s\n", strerror(-err)); + fprintf(stderr, "Failed to destroy the TC hook\n"); + exit(1); + } + exit(0); + } + prog_fd = bpf_prog_get_fd_by_id(attached_prog_id); if (prog_fd < 0) { fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd)); @@ -55,7 +70,7 @@ static void noreturn cleanup(int sig) static noreturn void usage(const char *progname) { - fprintf(stderr, "Usage: %s [--iface |--prog ] [--mss4 --mss6 --wscale --ttl ] [--ports ,,...] [--single]\n", + fprintf(stderr, "Usage: %s [--iface |--prog ] [--mss4 --mss6 --wscale --ttl ] [--ports ,,...] 
[--single] [--tc]\n", progname); exit(1); } @@ -74,7 +89,7 @@ static unsigned long parse_arg_ul(const char *progname, const char *arg, unsigne } static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 *prog_id, - __u64 *tcpipopts, char **ports, bool *single) + __u64 *tcpipopts, char **ports, bool *single, bool *tc) { static struct option long_options[] = { { "help", no_argument, NULL, 'h' }, @@ -86,6 +101,7 @@ static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 * { "ttl", required_argument, NULL, 't' }, { "ports", required_argument, NULL, 'p' }, { "single", no_argument, NULL, 's' }, + { "tc", no_argument, NULL, 'c' }, { NULL, 0, NULL, 0 }, }; unsigned long mss4, mss6, wscale, ttl; @@ -143,6 +159,9 @@ static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 * case 's': *single = true; break; + case 'c': + *tc = true; + break; default: usage(argv[0]); } @@ -164,7 +183,7 @@ static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 * usage(argv[0]); } -static int syncookie_attach(const char *argv0, unsigned int ifindex) +static int syncookie_attach(const char *argv0, unsigned int ifindex, bool tc) { struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); @@ -188,9 +207,9 @@ static int syncookie_attach(const char *argv0, unsigned int ifindex) return err; } - prog = bpf_object__find_program_by_name(obj, "syncookie_xdp"); + prog = bpf_object__find_program_by_name(obj, tc ? "syncookie_tc" : "syncookie_xdp"); if (!prog) { - fprintf(stderr, "Error: bpf_object__find_program_by_name: program syncookie_xdp was not found\n"); + fprintf(stderr, "Error: bpf_object__find_program_by_name: program was not found\n"); return -ENOENT; } @@ -201,21 +220,50 @@ static int syncookie_attach(const char *argv0, unsigned int ifindex) fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err)); goto out; } + attached_tc = tc; attached_prog_id = info.id; signal(SIGINT, cleanup); signal(SIGTERM, cleanup); - err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL); - if (err < 0) { - fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err)); - signal(SIGINT, SIG_DFL); - signal(SIGTERM, SIG_DFL); - attached_prog_id = 0; - goto out; + if (tc) { + LIBBPF_OPTS(bpf_tc_hook, hook, + .ifindex = ifindex, + .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, opts, + .handle = 1, + .priority = 1, + .prog_fd = prog_fd); + + err = bpf_tc_hook_create(&hook); + if (err < 0) { + fprintf(stderr, "Error: bpf_tc_hook_create: %s\n", + strerror(-err)); + goto fail; + } + err = bpf_tc_attach(&hook, &opts); + if (err < 0) { + fprintf(stderr, "Error: bpf_tc_attach: %s\n", + strerror(-err)); + goto fail; + } + + } else { + err = bpf_xdp_attach(ifindex, prog_fd, + XDP_FLAGS_UPDATE_IF_NOEXIST, NULL); + if (err < 0) { + fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", + strerror(-err)); + goto fail; + } } err = 0; out: bpf_object__close(obj); return err; +fail: + signal(SIGINT, SIG_DFL); + signal(SIGTERM, SIG_DFL); + attached_prog_id = 0; + goto out; } static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports_map_fd) @@ -248,11 +296,6 @@ static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports goto out; } - if (prog_info.type != BPF_PROG_TYPE_XDP) { - fprintf(stderr, "Error: BPF prog type is not BPF_PROG_TYPE_XDP\n"); - err = -ENOENT; - goto out; - } if (prog_info.nr_map_ids < 2) { fprintf(stderr, "Error: Found %u BPF maps, expected at least 2\n", 
prog_info.nr_map_ids); @@ -319,17 +362,22 @@ int main(int argc, char *argv[]) char *ports; bool single; int err = 0; + bool tc; - parse_options(argc, argv, &ifindex, &prog_id, &tcpipopts, &ports, &single); + parse_options(argc, argv, &ifindex, &prog_id, &tcpipopts, &ports, + &single, &tc); if (prog_id == 0) { - err = bpf_xdp_query_id(ifindex, 0, &prog_id); - if (err < 0) { - fprintf(stderr, "Error: bpf_get_link_xdp_id: %s\n", strerror(-err)); - goto out; + if (!tc) { + err = bpf_xdp_query_id(ifindex, 0, &prog_id); + if (err < 0) { + fprintf(stderr, "Error: bpf_get_link_xdp_id: %s\n", + strerror(-err)); + goto out; + } } if (prog_id == 0) { - err = syncookie_attach(argv[0], ifindex); + err = syncookie_attach(argv[0], ifindex, tc); if (err < 0) goto out; prog_id = attached_prog_id; -- cgit v1.2.3 From 08c79c9cd67fffd0d5538ddbd3a97b0a865b5eb5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 16 Jun 2022 21:55:12 -0700 Subject: selftests/bpf: Don't force lld on non-x86 architectures LLVM's lld linker doesn't have universal architecture support (e.g., it definitely doesn't work on s390x), so be safe and force lld for urandom_read and liburandom_read.so only on x86 architectures. This should fix s390x CI runs. Fixes: 3e6fe5ce4d48 ("libbpf: Fix internal USDT address translation logic for shared libraries") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220617045512.1339795-1-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8b30bb743e24..cb8e552e1418 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -168,18 +168,25 @@ $(OUTPUT)/%:%.c $(call msg,BINARY,,$@) $(Q)$(LINK.c) $^ $(LDLIBS) -o $@ +# LLVM's ld.lld doesn't support all the architectures, so use it only on x86 +ifeq ($(SRCARCH),x86) +LLD := lld +else +LLD := ld +endif + # Filter out -static for liburandom_read.so and its dependent targets so that static builds # do not fail. Static builds leave urandom_read relying on system-wide shared libraries. $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c $(call msg,LIB,,$@) $(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $^ $(LDLIBS) \ - -fuse-ld=lld -Wl,-znoseparate-code -fPIC -shared -o $@ + -fuse-ld=$(LLD) -Wl,-znoseparate-code -fPIC -shared -o $@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so $(call msg,BINARY,,$@) $(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \ liburandom_read.so $(LDLIBS) \ - -fuse-ld=lld -Wl,-znoseparate-code \ + -fuse-ld=$(LLD) -Wl,-znoseparate-code \ -Wl,-rpath=. -Wl,--build-id=sha1 -o $@ $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) -- cgit v1.2.3 From e386a527fc0893dc905464890eddfeb2cbeb44f4 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 16 Jun 2022 13:42:38 +0300 Subject: selftests: mirror_gre_bridge_1q_lag: Enslave port to bridge before other configurations With the mlxsw driver, the configurations are offloaded only if there is a physical port enslaved to the virtual device (e.g., to a bridge). In the 'mirror_gre_bridge_1q_lag' test, the bridge gets an address and route before there are ports in the bridge. This means that these configurations are not offloaded.
Until now, the test passed with the mlxsw driver even though the RIF of the bridge was not in the hardware, because the ARP packets are trapped in layer 2 and also mirrored, so there is no real need for the RIF in hardware. The previous patch changed the traps 'ARP_REQUEST' and 'ARP_RESPONSE' to be done at layer 3 instead of layer 2. With this change the ARP packets are not trapped during the test, as the RIF is not in the hardware because of the order of configurations. Reorder the configurations so that they are offloaded; the test will then pass with the changed traps. Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh index 28d568c48a73..91e431cd919e 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh @@ -141,12 +141,13 @@ switch_create() ip link set dev $swp4 up ip link add name br1 type bridge vlan_filtering 1 - ip link set dev br1 up - __addr_add_del br1 add 192.0.2.129/32 - ip -4 route add 192.0.2.130/32 dev br1 team_create lag loadbalance $swp3 $swp4 ip link set dev lag master br1 + + ip link set dev br1 up + __addr_add_del br1 add 192.0.2.129/32 + ip -4 route add 192.0.2.130/32 dev br1 } switch_destroy() -- cgit v1.2.3 From d3ffeb2dba633b99ef2602019f61a97e0163a756 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 16 Jun 2022 13:42:39 +0300 Subject: selftests: mlxsw: resource_scale: Update scale target after test setup The scale of each resource is tested in the following manner: 1. The scale target is queried. 2. The test setup is prepared. 3. The test is invoked. In some cases, the occupancy of a resource changes as part of the second step, requiring the test to return a scale target that takes this change into account. Make this more robust by re-querying the scale target after the second step. Another possible solution is to swap the first and second steps, but when a test needs to be skipped (i.e., scale target is zero), the setup would have been in vain. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh | 3 +++ tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 3 +++ 2 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index e9f65bd2e299..22f761442bad 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -38,6 +38,9 @@ for current_test in ${TESTS:-$ALL_TESTS}; do target=$(${current_test}_get_target "$should_fail") ${current_test}_setup_prepare setup_wait $num_netifs + # Update target in case occupancy of a certain resource changed + # following the test setup.
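+ # (e.g. when the test setup itself consumes instances of the tested resource)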
+ target=$(${current_test}_get_target "$should_fail") ${current_test}_test "$target" "$should_fail" ${current_test}_cleanup devlink_reload diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index dea33dc93790..12201acc00b9 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -43,6 +43,9 @@ for current_test in ${TESTS:-$ALL_TESTS}; do target=$(${current_test}_get_target "$should_fail") ${current_test}_setup_prepare setup_wait $num_netifs + # Update target in case occupancy of a certain resource + # changed following the test setup. + target=$(${current_test}_get_target "$should_fail") ${current_test}_test "$target" "$should_fail" ${current_test}_cleanup if [[ "$should_fail" -eq 0 ]]; then -- cgit v1.2.3 From 3128b9f51ee7ec7d091496379247489aab3007bb Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 16 Jun 2022 13:42:40 +0300 Subject: selftests: mlxsw: resource_scale: Introduce traffic tests The scale tests are currently testing two things: that some number of instances of a given resource can actually be created; and that when an attempt is made to create more than the supported amount, the failures are noted and handled gracefully. However, the ability to allocate the resource does not mean that the resource actually works when passing traffic. For that, make it possible for a given scale test to also test traffic. The traffic test is only run on the positive leg of the scale test (no point trying to pass traffic when the expected outcome is that the resource will not be allocated). Traffic tests are opt-in; if a given test does not expose one, it is not run. To this end, delay the test cleanup until after the traffic test is run. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh | 12 ++++++++++-- .../selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 11 ++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 22f761442bad..6d7814ba3c03 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -42,13 +42,21 @@ for current_test in ${TESTS:-$ALL_TESTS}; do # following the test setup.
target=$(${current_test}_get_target "$should_fail") ${current_test}_test "$target" "$should_fail" - ${current_test}_cleanup - devlink_reload if [[ "$should_fail" -eq 0 ]]; then log_test "'$current_test' $target" + + if ((!RET)); then + tt=${current_test}_traffic_test + if [[ $(type -t $tt) == "function" ]]; then + $tt "$target" + log_test "'$current_test' $target traffic test" + fi + fi else log_test "'$current_test' overflow $target" fi + ${current_test}_cleanup + devlink_reload RET_FIN=$(( RET_FIN || RET )) done done diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index 12201acc00b9..a1bc93b966ae 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -47,12 +47,21 @@ for current_test in ${TESTS:-$ALL_TESTS}; do # changed following the test setup. target=$(${current_test}_get_target "$should_fail") ${current_test}_test "$target" "$should_fail" - ${current_test}_cleanup if [[ "$should_fail" -eq 0 ]]; then log_test "'$current_test' [$profile] $target" + + if ((!RET)); then + tt=${current_test}_traffic_test + if [[ $(type -t $tt) == "function" ]] + then + $tt "$target" + log_test "'$current_test' [$profile] $target traffic test" + fi + fi else log_test "'$current_test' [$profile] overflow $target" fi + ${current_test}_cleanup RET_FIN=$(( RET_FIN || RET )) done done -- cgit v1.2.3 From 8cad339db339a39cb82b1188e4be4070a433abac Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 16 Jun 2022 13:42:41 +0300 Subject: selftests: mlxsw: resource_scale: Allow skipping a test The scale tests are currently testing two things: that some number of instances of a given resource can actually be created; and that when an attempt is made to create more than the supported amount, the failures are noted and handled gracefully. Sometimes the scale test depends on more than one resource. In particular, a following patch will add a RIF counter scale test, which depends on the number of RIF counters that can be bound, and also on the number of RIFs that can be created. When the test is limited by the auxiliary resource and not by the primary one, there's no point trying to run the overflow test, because it would be testing exhaustion of the wrong resource. To support this use case, when the $test_get_target yields 0, skip the test instead. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- .../testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh | 5 +++++ tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 6d7814ba3c03..afe17b108b46 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -36,6 +36,11 @@ for current_test in ${TESTS:-$ALL_TESTS}; do for should_fail in 0 1; do RET=0 target=$(${current_test}_get_target "$should_fail") + if ((target == 0)); then + log_test_skip "'$current_test' should_fail=$should_fail test" + continue + fi + ${current_test}_setup_prepare setup_wait $num_netifs # Update target in case occupancy of a certain resource changed diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index a1bc93b966ae..c0da22cd7d20 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -41,6 +41,10 @@ for current_test in ${TESTS:-$ALL_TESTS}; do for should_fail in 0 1; do RET=0 target=$(${current_test}_get_target "$should_fail") + if ((target == 0)); then + log_test_skip "'$current_test' [$profile] should_fail=$should_fail test" + continue + fi ${current_test}_setup_prepare setup_wait $num_netifs # Update target in case occupancy of a certain resource -- cgit v1.2.3 From 35d5829e86c29892136ca96bd4d809d4429f1510 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 16 Jun 2022 13:42:42 +0300 Subject: selftests: mlxsw: resource_scale: Pass target count to cleanup The scale tests are verifying behavior of mlxsw when number of instances of some resource reaches the ASIC capacity. The number of instances is referred to as "target" number. No scale tests so far needed to know this target number to clean up. E.g. the tc_flower simply removes the clsact qdisc that all the tested filters are hooked onto, and that takes care of collecting all the filters. However, for the RIF counter test, which is being added in a future patch, VLAN netdevices are created. These are created as part of the test, but of course the cleanup needs to undo them again. For that it needs to know how many there were. To support this usage, pass the target number to the cleanup callback. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh | 2 +- tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index afe17b108b46..1a7a472edfd0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -60,7 +60,7 @@ for current_test in ${TESTS:-$ALL_TESTS}; do else log_test "'$current_test' overflow $target" fi - ${current_test}_cleanup + ${current_test}_cleanup $target devlink_reload RET_FIN=$(( RET_FIN || RET )) done diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index c0da22cd7d20..70c9da8fe303 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -65,7 +65,7 @@ for current_test in ${TESTS:-$ALL_TESTS}; do else log_test "'$current_test' [$profile] overflow $target" fi - ${current_test}_cleanup + ${current_test}_cleanup $target RET_FIN=$(( RET_FIN || RET )) done done -- cgit v1.2.3 From dd5d20e17c960dc5c8b8c585dfae79cf39660867 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 16 Jun 2022 13:42:43 +0300 Subject: selftests: mlxsw: tc_flower_scale: Add a traffic test Add a test that checks that the created filters do actually trigger on matching traffic. Exercising all the rules would be a very lengthy process. Instead, take a log2 subset of rules. The logic behind picking log2 rules is that then every bit of the instantiated item's number is exercised. This should catch issues whether they happen at the high end, low end, or somewhere in between. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/tc_flower_scale.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh index aa74be9f47c8..d3d9e60d6ddf 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh @@ -77,6 +77,7 @@ tc_flower_rules_create() filter add dev $h2 ingress \ prot ipv6 \ pref 1000 \ + handle 42$i \ flower $tcflags dst_ip $(tc_flower_addr $i) \ action drop EOF @@ -121,3 +122,19 @@ tc_flower_test() tcflags="skip_sw" __tc_flower_test $count $should_fail } + +tc_flower_traffic_test() +{ + local count=$1; shift + local i; + + for ((i = count - 1; i > 0; i /= 2)); do + $MZ -6 $h1 -c 1 -d 20msec -p 100 -a own -b $(mac_get $h2) \ + -A $(tc_flower_addr 0) -B $(tc_flower_addr $i) \ + -q -t udp sp=54321,dp=12345 + done + for ((i = count - 1; i > 0; i /= 2)); do + tc_check_packets "dev $h2 ingress" 42$i 1 + check_err $? "Traffic not seen at rule #$i" + done +} -- cgit v1.2.3 From be00853bfd2e704893916bc349e7ab1d50615cb4 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 16 Jun 2022 13:42:44 +0300 Subject: selftests: mlxsw: Add a RIF counter scale test This test creates as many RIFs as possible, ideally more than there can be RIF counters (though that is currently only possible on Spectrum-1).
It then tries to enable L3 HW stats on each of the RIFs. It also contains the traffic test, which tries to run traffic through a log2 of those counters and checks that the traffic is shown in the counter values. Like with tc_flower traffic test, take a log2 subset of rules. The logic behind picking log2 rules is that then every bit of the instantiated item's number is exercised. This should catch issues whether they happen at the high end, low end, or somewhere in between. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../drivers/net/mlxsw/rif_counter_scale.sh | 107 +++++++++++++++++++++ .../drivers/net/mlxsw/spectrum-2/resource_scale.sh | 11 ++- .../net/mlxsw/spectrum-2/rif_counter_scale.sh | 1 + .../drivers/net/mlxsw/spectrum/resource_scale.sh | 11 ++- .../net/mlxsw/spectrum/rif_counter_scale.sh | 34 +++++++ 5 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/drivers/net/mlxsw/rif_counter_scale.sh create mode 120000 tools/testing/selftests/drivers/net/mlxsw/spectrum-2/rif_counter_scale.sh create mode 100644 tools/testing/selftests/drivers/net/mlxsw/spectrum/rif_counter_scale.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/rif_counter_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/rif_counter_scale.sh new file mode 100644 index 000000000000..a43a9926e690 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/rif_counter_scale.sh @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: GPL-2.0 + +RIF_COUNTER_NUM_NETIFS=2 + +rif_counter_addr4() +{ + local i=$1; shift + local p=$1; shift + + printf 192.0.%d.%d $((i / 64)) $(((4 * i % 256) + p)) +} + +rif_counter_addr4pfx() +{ + rif_counter_addr4 $@ + printf /30 +} + +rif_counter_h1_create() +{ + simple_if_init $h1 +} + +rif_counter_h1_destroy() +{ + simple_if_fini $h1 +} + +rif_counter_h2_create() +{ + simple_if_init $h2 +} + +rif_counter_h2_destroy() +{ + simple_if_fini $h2 +} + +rif_counter_setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + vrf_prepare + + rif_counter_h1_create + rif_counter_h2_create +} + +rif_counter_cleanup() +{ + local count=$1; shift + + pre_cleanup + + for ((i = 1; i <= count; i++)); do + vlan_destroy $h2 $i + done + + rif_counter_h2_destroy + rif_counter_h1_destroy + + vrf_cleanup + + if [[ -v RIF_COUNTER_BATCH_FILE ]]; then + rm -f $RIF_COUNTER_BATCH_FILE + fi +} + + +rif_counter_test() +{ + local count=$1; shift + local should_fail=$1; shift + + RIF_COUNTER_BATCH_FILE="$(mktemp)" + + for ((i = 1; i <= count; i++)); do + vlan_create $h2 $i v$h2 $(rif_counter_addr4pfx $i 2) + done + for ((i = 1; i <= count; i++)); do + cat >> $RIF_COUNTER_BATCH_FILE <<-EOF + stats set dev $h2.$i l3_stats on + EOF + done + + ip -b $RIF_COUNTER_BATCH_FILE + check_err_fail $should_fail $? "RIF counter enablement" +} + +rif_counter_traffic_test() +{ + local count=$1; shift + local i; + + for ((i = count; i > 0; i /= 2)); do + $MZ $h1 -Q $i -c 1 -d 20msec -p 100 -a own -b $(mac_get $h2) \ + -A $(rif_counter_addr4 $i 1) \ + -B $(rif_counter_addr4 $i 2) \ + -q -t udp sp=54321,dp=12345 + done + for ((i = count; i > 0; i /= 2)); do + busywait "$TC_HIT_TIMEOUT" until_counter_is "== 1" \ + hw_stats_get l3_stats $h2.$i rx packets > /dev/null + check_err $? 
"Traffic not seen at RIF $h2.$i" + done +} diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 1a7a472edfd0..688338bbeb97 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -25,7 +25,16 @@ cleanup() trap cleanup EXIT -ALL_TESTS="router tc_flower mirror_gre tc_police port rif_mac_profile" +ALL_TESTS=" + router + tc_flower + mirror_gre + tc_police + port + rif_mac_profile + rif_counter +" + for current_test in ${TESTS:-$ALL_TESTS}; do RET_FIN=0 source ${current_test}_scale.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/rif_counter_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/rif_counter_scale.sh new file mode 120000 index 000000000000..1f5752e8ffc0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/rif_counter_scale.sh @@ -0,0 +1 @@ +../spectrum/rif_counter_scale.sh \ No newline at end of file diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index 70c9da8fe303..95d9f710a630 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -22,7 +22,16 @@ cleanup() devlink_sp_read_kvd_defaults trap cleanup EXIT -ALL_TESTS="router tc_flower mirror_gre tc_police port rif_mac_profile" +ALL_TESTS=" + router + tc_flower + mirror_gre + tc_police + port + rif_mac_profile + rif_counter +" + for current_test in ${TESTS:-$ALL_TESTS}; do RET_FIN=0 source ${current_test}_scale.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/rif_counter_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/rif_counter_scale.sh new file mode 100644 index 000000000000..d44536276e8a --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/rif_counter_scale.sh @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0 +source ../rif_counter_scale.sh + +rif_counter_get_target() +{ + local should_fail=$1; shift + local max_cnts + local max_rifs + local target + + max_rifs=$(devlink_resource_size_get rifs) + max_cnts=$(devlink_resource_size_get counters rif) + + # Remove already allocated RIFs. + ((max_rifs -= $(devlink_resource_occ_get rifs))) + + # 10 KVD slots per counter, ingress+egress counters per RIF + ((max_cnts /= 20)) + + # Pointless to run the overflow test if we don't have enough RIFs to + # host all the counters. + if ((max_cnts > max_rifs && should_fail)); then + echo 0 + return + fi + + target=$((max_rifs < max_cnts ? max_rifs : max_cnts)) + + if ((! should_fail)); then + echo $target + else + echo $((target + 1)) + fi +} -- cgit v1.2.3 From ed62af45467a6786cbdeef42a7b4e7ced374f593 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 16 Jun 2022 13:42:45 +0300 Subject: selftests: spectrum-2: tc_flower_scale: Dynamically set scale target Instead of hard coding the scale target in the test, dynamically set it based on the maximum number of flow counters and their current occupancy. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. 
Miller --- .../drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh index efd798a85931..4444bbace1a9 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh @@ -4,17 +4,22 @@ source ../tc_flower_scale.sh tc_flower_get_target() { local should_fail=$1; shift + local max_cnts # The driver associates a counter with each tc filter, which means the # number of supported filters is bounded by the number of available # counters. - # Currently, the driver supports 30K (30,720) flow counters and six of - # these are used for multicast routing. - local target=30714 + max_cnts=$(devlink_resource_size_get counters flow) + + # Remove already allocated counters. + ((max_cnts -= $(devlink_resource_occ_get counters flow))) + + # Each rule uses two counters, for packets and bytes. + ((max_cnts /= 2)) if ((! should_fail)); then - echo $target + echo $max_cnts else - echo $((target + 1)) + echo $((max_cnts + 1)) fi } -- cgit v1.2.3 From e068c0776b0bbd3fc5a283b0e0eaa1cbb9ef0e3d Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 20 Jun 2022 13:49:39 +0300 Subject: selftests/bpf: Enable config options needed for xdp_synproxy test This commit adds the kernel config options needed to run the recently added xdp_synproxy test. Users without these options will hit errors like this: test_synproxy:FAIL:iptables -t raw -I PREROUTING -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack unexpected error: 256 (errno 22) Suggested-by: Alexei Starovoitov Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220620104939.4094104-1-maximmi@nvidia.com --- tools/testing/selftests/bpf/config | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 3b3edc0fc8a6..c05904d631ec 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -57,3 +57,9 @@ CONFIG_FPROBE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_MPTCP=y +CONFIG_NETFILTER_SYNPROXY=y +CONFIG_NETFILTER_XT_TARGET_CT=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_SYNPROXY=y +CONFIG_IP_NF_RAW=y -- cgit v1.2.3 From 3b23054cd3f5106aed31ccf71a7bc14518a768eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jun 2022 22:45:13 +0000 Subject: KVM: selftests: Add x86-64 support for exception fixup Add x86-64 support for exception fixup on single instructions, without forcing tests to install their own fault handlers. Use registers r9-r11 to flag the instruction as "safe" and pass fixup/vector information, i.e. introduce yet another flavor of fixup (versus the kernel's in-memory tables and KUT's per-CPU area) to take advantage of KVM selftests being 64-bit only. Using only registers avoids the need to allocate fixup tables, ensure FS or GS base is valid for the guest, ensure memory is mapped into the guest, etc..., and also reduces the potential for recursive faults due to accessing memory. Providing exception fixup trivializes tests that just want to verify that an instruction faults, e.g. no need to track start/end using global labels, no need to install a dedicated handler, etc...
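As a rough sketch of the resulting usage (illustrative only; rdmsr_safe() is the helper introduced below, and msr_idx stands in for whatever MSR a test targets), a guest that only cares that an access faults boils down to:

	uint64_t ignored;
	uint8_t vector;

	/* Returns the fault vector, or 0 if the access did not fault. */
	vector = rdmsr_safe(msr_idx, &ignored);
	GUEST_ASSERT(vector == GP_VECTOR);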
Deliberately do not support #DE in exception fixup so that the fixup glue doesn't need to account for a fault with vector == 0, i.e. the vector can also indicate that a fault occurred. KVM injects #DE only for esoteric emulation scenarios, i.e. there's very, very little value in testing #DE. Force any test that wants to generate #DEs to install its own handler(s). Use kvm_pv_test as a guinea pig for the new fixup, as it has a very straightforward use case of wanting to verify that RDMSR and WRMSR fault. Signed-off-by: Sean Christopherson Message-Id: <20220608224516.3788274-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 74 ++++++++++++++++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 17 +++++ tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 74 ++++------------------ 3 files changed, 102 insertions(+), 63 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 32964d7b2218..79dcf6be1b47 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -15,6 +15,8 @@ #include <asm/msr-index.h> #include <asm/prctl.h> +#include <linux/stringify.h> + #include "../kvm_util.h" #define NMI_VECTOR 0x02 @@ -541,6 +543,78 @@ void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); +/* If a toddler were to say "abracadabra". */ +#define KVM_EXCEPTION_MAGIC 0xabacadabaull + +/* + * KVM selftest exception fixup uses registers to coordinate with the exception + * handler, versus the kernel's in-memory tables and KVM-Unit-Tests's in-memory + * per-CPU data. Using only registers avoids having to map memory into the + * guest, doesn't require a valid, stable GS.base, and reduces the risk of + * recursive faults when accessing memory in the handler. The downside to + * using registers is that it restricts what registers can be used by the actual + * instruction. But, selftests are 64-bit only, making register pressure a + * minor concern. Use r9-r11 as they are volatile, i.e. don't need + * to be saved + * by the callee, and except for r11 are not implicit parameters to any + * instructions. Ideally, fixup would use r8-r10 and thus avoid implicit + * parameters entirely, but Hyper-V's hypercall ABI uses r8 and testing Hyper-V + * is higher priority than testing non-faulting SYSCALL/SYSRET. + * + * Note, the fixup handler deliberately does not handle #DE, i.e. the vector + * is guaranteed to be non-zero on fault. + * + * REGISTER INPUTS: + * r9 = MAGIC + * r10 = RIP + * r11 = new RIP on fault + * + * REGISTER OUTPUTS: + * r9 = exception vector (non-zero) + */ +#define KVM_ASM_SAFE(insn) \ + "mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \ + "lea 1f(%%rip), %%r10\n\t" \ + "lea 2f(%%rip), %%r11\n\t" \ + "1: " insn "\n\t" \ + "mov $0, %[vector]\n\t" \ + "jmp 3f\n\t" \ + "2:\n\t" \ + "mov %%r9b, %[vector]\n\t" \ + "3:\n\t" + +#define KVM_ASM_SAFE_OUTPUTS(v) [vector] "=qm"(v) +#define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11" + +#define kvm_asm_safe(insn, inputs...)
\ +({ \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +static inline uint8_t rdmsr_safe(uint32_t msr, uint64_t *val) +{ + uint8_t vector; + uint32_t a, d; + + asm volatile(KVM_ASM_SAFE("rdmsr") + : "=a"(a), "=d"(d), KVM_ASM_SAFE_OUTPUTS(vector) + : "c"(msr) + : KVM_ASM_SAFE_CLOBBERS); + + *val = (uint64_t)a | ((uint64_t)d << 32); + return vector; +} + +static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) +{ + return kvm_asm_safe("wrmsr", "A"(val), "c"(msr)); +} + uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr); void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 906132e70fa4..1a32b1c75e9a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1127,6 +1127,20 @@ static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, e->offset2 = addr >> 32; } + +static bool kvm_fixup_exception(struct ex_regs *regs) +{ + if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) + return false; + + if (regs->vector == DE_VECTOR) + return false; + + regs->rip = regs->r11; + regs->r9 = regs->vector; + return true; +} + void kvm_exit_unexpected_vector(uint32_t value) { ucall(UCALL_UNHANDLED, 1, value); @@ -1142,6 +1156,9 @@ void route_exception(struct ex_regs *regs) return; } + if (kvm_fixup_exception(regs)) + return; + kvm_exit_unexpected_vector(regs->vector); } diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 5901ccec7079..feff85e43be3 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -12,55 +12,6 @@ #include "kvm_util.h" #include "processor.h" -extern unsigned char rdmsr_start; -extern unsigned char rdmsr_end; - -static u64 do_rdmsr(u32 idx) -{ - u32 lo, hi; - - asm volatile("rdmsr_start: rdmsr;" - "rdmsr_end:" - : "=a"(lo), "=c"(hi) - : "c"(idx)); - - return (((u64) hi) << 32) | lo; -} - -extern unsigned char wrmsr_start; -extern unsigned char wrmsr_end; - -static void do_wrmsr(u32 idx, u64 val) -{ - u32 lo, hi; - - lo = val; - hi = val >> 32; - - asm volatile("wrmsr_start: wrmsr;" - "wrmsr_end:" - : : "a"(lo), "c"(idx), "d"(hi)); -} - -static int nr_gp; - -static void guest_gp_handler(struct ex_regs *regs) -{ - unsigned char *rip = (unsigned char *)regs->rip; - bool r, w; - - r = rip == &rdmsr_start; - w = rip == &wrmsr_start; - GUEST_ASSERT(r || w); - - nr_gp++; - - if (r) - regs->rip = (uint64_t)&rdmsr_end; - else - regs->rip = (uint64_t)&wrmsr_end; -} - struct msr_data { uint32_t idx; const char *name; @@ -89,14 +40,16 @@ static struct msr_data msrs_to_test[] = { static void test_msr(struct msr_data *msr) { + uint64_t ignored; + uint8_t vector; + PR_MSR(msr); - do_rdmsr(msr->idx); - GUEST_ASSERT(READ_ONCE(nr_gp) == 1); - nr_gp = 0; - do_wrmsr(msr->idx, 0); - GUEST_ASSERT(READ_ONCE(nr_gp) == 1); - nr_gp = 0; + vector = rdmsr_safe(msr->idx, &ignored); + GUEST_ASSERT_1(vector == GP_VECTOR, vector); + + vector = wrmsr_safe(msr->idx, 0); + GUEST_ASSERT_1(vector == GP_VECTOR, vector); } struct hcall_data { @@ -165,12 +118,6 @@ static void pr_hcall(struct ucall *uc) pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr); } -static void handle_abort(struct ucall *uc) -{ - 
TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], - __FILE__, uc->args[1]); -} - static void enter_guest(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; @@ -190,7 +137,9 @@ static void enter_guest(struct kvm_vcpu *vcpu) pr_hcall(&uc); break; case UCALL_ABORT: - handle_abort(&uc); + TEST_FAIL("%s at %s:%ld, vector = %lu", + (const char *)uc.args[0], __FILE__, + uc.args[1], uc.args[2]); return; case UCALL_DONE: return; @@ -216,7 +165,6 @@ int main(void) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); enter_guest(vcpu); kvm_vm_free(vm); -- cgit v1.2.3 From 9f88d062c3db13164a0d2007b88e1e430f523a06 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jun 2022 22:45:14 +0000 Subject: KVM: selftests: Mostly fix broken Hyper-V Features test Explicitly do all setup at every stage of the Hyper-V Features test, e.g. set the MSR/hypercall, enable capabilities, etc... Now that the VM is recreated for every stage, values that are written into the VM's address space, i.e. shared with the guest, are reset between sub-tests, as are any capabilities, etc... Fix the hypercall params as well, which were broken in the same rework. The "hcall" struct/pointer needs to point at the hcall_params object, not the set of hypercall pages. The goofs were hidden by the test's dubious behavior of using '0' to signal "done", i.e. the MSR test ran exactly one sub-test, and the hypercall test was a gigantic nop. Fixes: 6c1186430a80 ("KVM: selftests: Avoid KVM_SET_CPUID2 after KVM_RUN in hyperv_features test") Signed-off-by: Sean Christopherson Message-Id: <20220608224516.3788274-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/hyperv_features.c | 142 ++++++++++++--------- 1 file changed, 81 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index d5f37495ade8..1a2a06c3027a 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -101,52 +101,44 @@ struct hcall_data { static void guest_msr(struct msr_data *msr) { - int i = 0; + GUEST_ASSERT(msr->idx); - while (msr->idx) { - WRITE_ONCE(nr_gp, 0); - if (!msr->write) - do_rdmsr(msr->idx); - else - do_wrmsr(msr->idx, msr->write_val); - - if (msr->available) - GUEST_ASSERT(READ_ONCE(nr_gp) == 0); - else - GUEST_ASSERT(READ_ONCE(nr_gp) == 1); - - GUEST_SYNC(i++); - } + WRITE_ONCE(nr_gp, 0); + if (!msr->write) + do_rdmsr(msr->idx); + else + do_wrmsr(msr->idx, msr->write_val); + if (msr->available) + GUEST_ASSERT(READ_ONCE(nr_gp) == 0); + else + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); GUEST_DONE(); } static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) { - int i = 0; u64 res, input, output; + GUEST_ASSERT(hcall->control); + wrmsr(HV_X64_MSR_GUEST_OS_ID, LINUX_OS_ID); wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); - while (hcall->control) { - nr_ud = 0; - if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { - input = pgs_gpa; - output = pgs_gpa + 4096; - } else { - input = output = 0; - } - - res = hypercall(hcall->control, input, output); - if (hcall->ud_expected) - GUEST_ASSERT(nr_ud == 1); - else - GUEST_ASSERT(res == hcall->expect); - - GUEST_SYNC(i++); + nr_ud = 0; + if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { + input = pgs_gpa; + output = pgs_gpa + 4096; + } else { + input = output = 0; } + res = hypercall(hcall->control, input, output); + 
if (hcall->ud_expected) + GUEST_ASSERT(nr_ud == 1); + else + GUEST_ASSERT(res == hcall->expect); + GUEST_DONE(); } @@ -204,6 +196,10 @@ static void guest_test_msrs_access(void) run = vcpu->run; + /* TODO: Make this entire test easier to maintain. */ + if (stage >= 21) + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); + switch (stage) { case 0: /* @@ -247,11 +243,13 @@ static void guest_test_msrs_access(void) break; case 6: feat.eax |= HV_MSR_VP_RUNTIME_AVAILABLE; + msr->idx = HV_X64_MSR_VP_RUNTIME; msr->write = 0; msr->available = 1; break; case 7: /* Read only */ + msr->idx = HV_X64_MSR_VP_RUNTIME; msr->write = 1; msr->write_val = 1; msr->available = 0; @@ -264,11 +262,13 @@ static void guest_test_msrs_access(void) break; case 9: feat.eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; + msr->idx = HV_X64_MSR_TIME_REF_COUNT; msr->write = 0; msr->available = 1; break; case 10: /* Read only */ + msr->idx = HV_X64_MSR_TIME_REF_COUNT; msr->write = 1; msr->write_val = 1; msr->available = 0; @@ -281,11 +281,13 @@ static void guest_test_msrs_access(void) break; case 12: feat.eax |= HV_MSR_VP_INDEX_AVAILABLE; + msr->idx = HV_X64_MSR_VP_INDEX; msr->write = 0; msr->available = 1; break; case 13: /* Read only */ + msr->idx = HV_X64_MSR_VP_INDEX; msr->write = 1; msr->write_val = 1; msr->available = 0; @@ -298,10 +300,12 @@ static void guest_test_msrs_access(void) break; case 15: feat.eax |= HV_MSR_RESET_AVAILABLE; + msr->idx = HV_X64_MSR_RESET; msr->write = 0; msr->available = 1; break; case 16: + msr->idx = HV_X64_MSR_RESET; msr->write = 1; msr->write_val = 0; msr->available = 1; @@ -314,10 +318,12 @@ static void guest_test_msrs_access(void) break; case 18: feat.eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; + msr->idx = HV_X64_MSR_REFERENCE_TSC; msr->write = 0; msr->available = 1; break; case 19: + msr->idx = HV_X64_MSR_REFERENCE_TSC; msr->write = 1; msr->write_val = 0; msr->available = 1; @@ -333,14 +339,18 @@ static void guest_test_msrs_access(void) * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 * capability enabled and guest visible CPUID bit unset. 
*/ - vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); + msr->idx = HV_X64_MSR_EOM; + msr->write = 0; + msr->available = 0; break; case 22: feat.eax |= HV_MSR_SYNIC_AVAILABLE; + msr->idx = HV_X64_MSR_EOM; msr->write = 0; msr->available = 1; break; case 23: + msr->idx = HV_X64_MSR_EOM; msr->write = 1; msr->write_val = 0; msr->available = 1; @@ -353,22 +363,28 @@ static void guest_test_msrs_access(void) break; case 25: feat.eax |= HV_MSR_SYNTIMER_AVAILABLE; + msr->idx = HV_X64_MSR_STIMER0_CONFIG; msr->write = 0; msr->available = 1; break; case 26: + msr->idx = HV_X64_MSR_STIMER0_CONFIG; msr->write = 1; msr->write_val = 0; msr->available = 1; break; case 27: /* Direct mode test */ + msr->idx = HV_X64_MSR_STIMER0_CONFIG; msr->write = 1; msr->write_val = 1 << 12; msr->available = 0; break; case 28: feat.edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = 1; + msr->write_val = 1 << 12; msr->available = 1; break; @@ -379,6 +395,7 @@ static void guest_test_msrs_access(void) break; case 30: feat.eax |= HV_MSR_APIC_ACCESS_AVAILABLE; + msr->idx = HV_X64_MSR_EOI; msr->write = 1; msr->write_val = 1; msr->available = 1; @@ -391,11 +408,13 @@ static void guest_test_msrs_access(void) break; case 32: feat.eax |= HV_ACCESS_FREQUENCY_MSRS; + msr->idx = HV_X64_MSR_TSC_FREQUENCY; msr->write = 0; msr->available = 1; break; case 33: /* Read only */ + msr->idx = HV_X64_MSR_TSC_FREQUENCY; msr->write = 1; msr->write_val = 1; msr->available = 0; @@ -408,10 +427,12 @@ static void guest_test_msrs_access(void) break; case 35: feat.eax |= HV_ACCESS_REENLIGHTENMENT; + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; msr->write = 0; msr->available = 1; break; case 36: + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; msr->write = 1; msr->write_val = 1; msr->available = 1; @@ -431,10 +452,12 @@ static void guest_test_msrs_access(void) break; case 39: feat.edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + msr->idx = HV_X64_MSR_CRASH_P0; msr->write = 0; msr->available = 1; break; case 40: + msr->idx = HV_X64_MSR_CRASH_P0; msr->write = 1; msr->write_val = 1; msr->available = 1; @@ -448,28 +471,26 @@ static void guest_test_msrs_access(void) case 42: feat.edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + msr->idx = HV_X64_MSR_SYNDBG_STATUS; msr->write = 0; msr->available = 1; break; case 43: + msr->idx = HV_X64_MSR_SYNDBG_STATUS; msr->write = 1; msr->write_val = 0; msr->available = 1; break; case 44: - /* END */ - msr->idx = 0; - break; + kvm_vm_free(vm); + return; } hv_set_cpuid(vcpu, best, &feat, &recomm, &dbg); - if (msr->idx) - pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, - msr->idx, msr->write ? "write" : "read"); - else - pr_debug("Stage %d: finish\n", stage); + pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, + msr->idx, msr->write ? 
"write" : "read"); vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, @@ -477,16 +498,14 @@ static void guest_test_msrs_access(void) run->exit_reason, exit_reason_str(run->exit_reason)); switch (get_ucall(vcpu, &uc)) { - case UCALL_SYNC: - TEST_ASSERT(uc.args[1] == 0, - "Unexpected stage: %ld (0 expected)\n", - uc.args[1]); - break; case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); return; case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); return; } @@ -525,11 +544,11 @@ static void guest_test_hcalls_access(void) /* Hypercall input/output */ hcall_page = vm_vaddr_alloc_pages(vm, 2); - hcall = addr_gva2hva(vm, hcall_page); memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); hcall_params = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); + hcall = addr_gva2hva(vm, hcall_params); vcpu_args_set(vcpu, 2, addr_gva2gpa(vm, hcall_page), hcall_params); vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); @@ -552,6 +571,7 @@ static void guest_test_hcalls_access(void) break; case 2: feat.ebx |= HV_POST_MESSAGES; + hcall->control = HVCALL_POST_MESSAGE; hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; break; @@ -561,6 +581,7 @@ static void guest_test_hcalls_access(void) break; case 4: feat.ebx |= HV_SIGNAL_EVENTS; + hcall->control = HVCALL_SIGNAL_EVENT; hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; break; @@ -570,10 +591,12 @@ static void guest_test_hcalls_access(void) break; case 6: dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + hcall->control = HVCALL_RESET_DEBUG_SESSION; hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 7: feat.ebx |= HV_DEBUGGING; + hcall->control = HVCALL_RESET_DEBUG_SESSION; hcall->expect = HV_STATUS_OPERATION_DENIED; break; @@ -583,6 +606,7 @@ static void guest_test_hcalls_access(void) break; case 9: recomm.eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; hcall->expect = HV_STATUS_SUCCESS; break; case 10: @@ -591,6 +615,7 @@ static void guest_test_hcalls_access(void) break; case 11: recomm.eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; hcall->expect = HV_STATUS_SUCCESS; break; @@ -600,6 +625,7 @@ static void guest_test_hcalls_access(void) break; case 13: recomm.eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + hcall->control = HVCALL_SEND_IPI; hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; break; case 14: @@ -614,6 +640,7 @@ static void guest_test_hcalls_access(void) break; case 16: recomm.ebx = 0xfff; + hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; hcall->expect = HV_STATUS_SUCCESS; break; case 17: @@ -623,23 +650,18 @@ static void guest_test_hcalls_access(void) break; case 18: feat.edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; hcall->ud_expected = false; hcall->expect = HV_STATUS_SUCCESS; break; - case 19: - /* END */ - hcall->control = 0; - break; + kvm_vm_free(vm); + return; } hv_set_cpuid(vcpu, best, &feat, &recomm, &dbg); - if (hcall->control) - pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, - hcall->control); - else - pr_debug("Stage %d: finish\n", stage); + pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, hcall->control); vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, @@ -647,16 +669,14 @@ static void guest_test_hcalls_access(void) run->exit_reason, exit_reason_str(run->exit_reason)); switch (get_ucall(vcpu, &uc)) { - case 
UCALL_SYNC: - TEST_ASSERT(uc.args[1] == 0, - "Unexpected stage: %ld (0 expected)\n", - uc.args[1]); - break; case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]); return; case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); return; } -- cgit v1.2.3 From ab69d3c8b99435ebd2ac4a241a5d931024c83c02 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Apr 2022 15:17:37 -0700 Subject: torture: Make kvm-remote.sh announce which system is being waited on If a remote system fails in certain ways, for example, if it is rebooted without removing the contents of the /tmp directory, its remote.run file never will be removed and the kvm-remote.sh script will loop waiting forever. The manual workaround for this (hopefully!) rare event is to manually remove the file, which will cause the results up to the reboot to be collected and evaluated. Unfortunately, to work out which system is holding things up, the user must refer to the name of the last system whose results were collected, then look up the name of the next system in sequence, then manually remove the remote.run file. Even more unfortunately, this procedure can be fooled in runs where each system handles more than one batch should a given system take longer than expected, causing the systems to be handled out of order. This commit therefore causes kvm-remote.sh to print out the name of the system it will wait on next, allowing the user to refer directly to that name. Making the kvm-remote.sh script automatically handle unscheduled termination of the qemu processes is left as future work. Quite possibly deep future work. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 0ff59bd8b640..9f0a5d5ff2dd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -262,6 +262,7 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log" # Wait for all remaining scenarios to complete and collect results. for i in $systems do + echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log" while checkremotefile "$i" "$resdir/$ds/remote.run" do sleep 30 -- cgit v1.2.3 From cc5851c6be864c5772944e32df3da322fe3ad415 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jun 2022 22:45:15 +0000 Subject: KVM: selftests: Use exception fixup for #UD/#GP Hyper-V MSR/hcall tests Use exception fixup to verify VMCALL/RDMSR/WRMSR fault as expected in the Hyper-V Features test. No functional change intended. 
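In rough outline (a sketch only; the names are shortened from the msr_data/hcall_data fields used in the diff below), the guest-side hypercall check becomes:

	uint64_t res;
	uint8_t vector;

	/* hypercall() now reports the fault vector alongside the HV status. */
	vector = hypercall(control, input, output, &res);
	if (ud_expected)
		GUEST_ASSERT(vector == UD_VECTOR);
	else
		GUEST_ASSERT(vector == 0 && res == expected_status);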
Signed-off-by: Sean Christopherson Message-Id: <20220608224516.3788274-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/hyperv_features.c | 115 ++++++--------------- 1 file changed, 32 insertions(+), 83 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 1a2a06c3027a..c05acd78548f 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -15,75 +15,20 @@ #define LINUX_OS_ID ((u64)0x8100 << 48) -extern unsigned char rdmsr_start; -extern unsigned char rdmsr_end; - -static u64 do_rdmsr(u32 idx) -{ - u32 lo, hi; - - asm volatile("rdmsr_start: rdmsr;" - "rdmsr_end:" - : "=a"(lo), "=c"(hi) - : "c"(idx)); - - return (((u64) hi) << 32) | lo; -} - -extern unsigned char wrmsr_start; -extern unsigned char wrmsr_end; - -static void do_wrmsr(u32 idx, u64 val) -{ - u32 lo, hi; - - lo = val; - hi = val >> 32; - - asm volatile("wrmsr_start: wrmsr;" - "wrmsr_end:" - : : "a"(lo), "c"(idx), "d"(hi)); -} - -static int nr_gp; -static int nr_ud; - -static inline u64 hypercall(u64 control, vm_vaddr_t input_address, - vm_vaddr_t output_address) +static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address, uint64_t *hv_status) { - u64 hv_status; - - asm volatile("mov %3, %%r8\n" - "vmcall" - : "=a" (hv_status), - "+c" (control), "+d" (input_address) - : "r" (output_address) - : "cc", "memory", "r8", "r9", "r10", "r11"); - - return hv_status; -} - -static void guest_gp_handler(struct ex_regs *regs) -{ - unsigned char *rip = (unsigned char *)regs->rip; - bool r, w; - - r = rip == &rdmsr_start; - w = rip == &wrmsr_start; - GUEST_ASSERT(r || w); - - nr_gp++; - - if (r) - regs->rip = (uint64_t)&rdmsr_end; - else - regs->rip = (uint64_t)&wrmsr_end; -} - -static void guest_ud_handler(struct ex_regs *regs) -{ - nr_ud++; - regs->rip += 3; + uint8_t vector; + + /* Note both the hypercall and the "asm safe" clobber r9-r11. 
*/ + asm volatile("mov %[output_address], %%r8\n\t" + KVM_ASM_SAFE("vmcall") + : "=a" (*hv_status), + "+c" (control), "+d" (input_address), + KVM_ASM_SAFE_OUTPUTS(vector) + : [output_address] "r"(output_address) + : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); + return vector; } struct msr_data { @@ -101,31 +46,33 @@ struct hcall_data { static void guest_msr(struct msr_data *msr) { + uint64_t ignored; + uint8_t vector; + GUEST_ASSERT(msr->idx); - WRITE_ONCE(nr_gp, 0); if (!msr->write) - do_rdmsr(msr->idx); + vector = rdmsr_safe(msr->idx, &ignored); else - do_wrmsr(msr->idx, msr->write_val); + vector = wrmsr_safe(msr->idx, msr->write_val); if (msr->available) - GUEST_ASSERT(READ_ONCE(nr_gp) == 0); + GUEST_ASSERT_2(!vector, msr->idx, vector); else - GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + GUEST_ASSERT_2(vector == GP_VECTOR, msr->idx, vector); GUEST_DONE(); } static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) { u64 res, input, output; + uint8_t vector; GUEST_ASSERT(hcall->control); wrmsr(HV_X64_MSR_GUEST_OS_ID, LINUX_OS_ID); wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); - nr_ud = 0; if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { input = pgs_gpa; output = pgs_gpa + 4096; @@ -133,12 +80,14 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) input = output = 0; } - res = hypercall(hcall->control, input, output); + vector = hypercall(hcall->control, input, output, &res); if (hcall->ud_expected) - GUEST_ASSERT(nr_ud == 1); + GUEST_ASSERT_2(vector == UD_VECTOR, hcall->control, vector); else - GUEST_ASSERT(res == hcall->expect); + GUEST_ASSERT_2(!vector, hcall->control, vector); + GUEST_ASSERT_2(!hcall->ud_expected || res == hcall->expect, + hcall->expect, res); GUEST_DONE(); } @@ -192,7 +141,6 @@ static void guest_test_msrs_access(void) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); run = vcpu->run; @@ -499,8 +447,9 @@ static void guest_test_msrs_access(void) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + TEST_FAIL("%s at %s:%ld, MSR = %lx, vector = %lx", + (const char *)uc.args[0], __FILE__, + uc.args[1], uc.args[2], uc.args[3]); return; case UCALL_DONE: break; @@ -540,7 +489,6 @@ static void guest_test_hcalls_access(void) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); /* Hypercall input/output */ hcall_page = vm_vaddr_alloc_pages(vm, 2); @@ -670,8 +618,9 @@ static void guest_test_hcalls_access(void) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + TEST_FAIL("%s at %s:%ld, arg1 = %lx, arg2 = %lx", + (const char *)uc.args[0], __FILE__, + uc.args[1], uc.args[2], uc.args[3]); return; case UCALL_DONE: break; -- cgit v1.2.3 From 1ef150cf40be986747c3fa0c0e75acaec412e85e Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Fri, 20 May 2022 00:21:15 +0700 Subject: tools/nolibc/stdlib: Support overflow checking for older compiler versions Previously, we used __builtin_mul_overflow() to check for overflow in the multiplication operation in the calloc() function. However, older compiler versions don't support this built-in. This patch changes the overflow checking mechanism to make it work on any compiler version by using a division method to check for overflow. No functional change intended. While in there, remove the unused variable `void *orig`. 
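The division-based check is easy to reason about in isolation; a minimal standalone sketch (the function name is illustrative, not part of nolibc):

	#include <stddef.h>

	/* Returns nonzero iff size * nmemb overflows size_t. */
	static int mul_overflows(size_t size, size_t nmemb)
	{
		size_t x = size * nmemb;

		/*
		 * If the product wrapped modulo SIZE_MAX + 1, dividing it by
		 * one non-zero factor no longer yields the other factor.
		 */
		return size && ((x / size) != nmemb);
	}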
Link: https://lore.kernel.org/lkml/20220330024114.GA18892@1wt.eu Suggested-by: Willy Tarreau Cc: Alviro Iskandar Setiawan Signed-off-by: Ammar Faizi Acked-by: Willy Tarreau Reviewed-by: Alviro Iskandar Setiawan Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdlib.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 8fd32eaf8037..92378c4b9660 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -128,10 +128,9 @@ void *malloc(size_t len) static __attribute__((unused)) void *calloc(size_t size, size_t nmemb) { - void *orig; - size_t res = 0; + size_t x = size * nmemb; - if (__builtin_expect(__builtin_mul_overflow(nmemb, size, &res), 0)) { + if (__builtin_expect(size && ((x / size) != nmemb), 0)) { SET_ERRNO(ENOMEM); return NULL; } @@ -140,7 +139,7 @@ void *calloc(size_t size, size_t nmemb) * No need to zero the heap, the MAP_ANONYMOUS in malloc() * already does it. */ - return malloc(res); + return malloc(x); } static __attribute__((unused)) -- cgit v1.2.3 From 4f2c9703a128d21db7d0bf2ea14c5fe50d0dfe7e Mon Sep 17 00:00:00 2001 From: Alviro Iskandar Setiawan Date: Fri, 20 May 2022 00:21:16 +0700 Subject: tools/nolibc/stdio: Add format attribute to enable printf warnings When we use the printf and fprintf functions from nolibc, we don't get any warning from the compiler if we pass the wrong arguments. For example, the following calls will compile silently: ``` printf("%s %s\n", "aaa"); fprintf(stdout, "%s %s\n", "xxx", 1); ``` (Note the wrong arguments). Those calls are undefined behavior. The compiler can help us warn about the above mistakes by adding a `printf` format attribute to those functions' declarations. This patch adds it, and now it yields these warnings for those mistakes: ``` warning: format `%s` expects a matching `char *` argument [-Wformat=] warning: format `%s` expects argument of type `char *`, but argument 4 has type `int` [-Wformat=] ``` [ ammarfaizi2: Simplify the attribute placement. ] Signed-off-by: Alviro Iskandar Setiawan Signed-off-by: Ammar Faizi Acked-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/stdio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 15dedf8d0902..a3cebc4bc3ac 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -273,7 +273,7 @@ int vfprintf(FILE *stream, const char *fmt, va_list args) return written; } -static __attribute__((unused)) +static __attribute__((unused, format(printf, 2, 3))) int fprintf(FILE *stream, const char *fmt, ...) { va_list args; @@ -285,7 +285,7 @@ int fprintf(FILE *stream, const char *fmt, ...) return ret; } -static __attribute__((unused)) +static __attribute__((unused, format(printf, 1, 2))) int printf(const char *fmt, ...) { va_list args; -- cgit v1.2.3 From 6a3ad243b29be01d0c30f8c3b35a1149e1a0139a Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 28 May 2022 17:45:44 +0200 Subject: tools/nolibc: fix the makefile to also work as "make -C tools ..." As reported by Linus, nolibc's makefile is currently broken when invoked as per the documented method (make -C tools nolibc_), because it now relies on the ARCH and OUTPUT variables that are not set in this case. This patch addresses this by sourcing subarch.include, and by presetting OUTPUT to the current directory if not set.
This is sufficient to make the commands work both as a standalone target and as a tools/ sub-target. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/Makefile | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index 7a16d917c185..e8bac6ef3653 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -7,6 +7,22 @@ ifeq ($(srctree),) srctree := $(patsubst %/tools/include/,%,$(dir $(CURDIR))) endif +# when run as make -C tools/ nolibc_ the arch is not set +ifeq ($(ARCH),) +include $(srctree)/scripts/subarch.include +ARCH = $(SUBARCH) +endif + +# OUTPUT is only set when run from the main makefile, otherwise +# it defaults to this nolibc directory. +OUTPUT ?= $(CURDIR)/ + +ifeq ($(V),1) +Q= +else +Q=@ +endif + nolibc_arch := $(patsubst arm64,aarch64,$(ARCH)) arch_file := arch-$(nolibc_arch).h all_files := ctype.h errno.h nolibc.h signal.h std.h stdio.h stdlib.h string.h \ @@ -36,7 +52,7 @@ headers: headers_standalone: headers $(Q)$(MAKE) -C $(srctree) headers - $(Q)$(MAKE) -C $(srctree) headers_install INSTALL_HDR_PATH=$(OUTPUT)/sysroot + $(Q)$(MAKE) -C $(srctree) headers_install INSTALL_HDR_PATH=$(OUTPUT)sysroot clean: $(call QUIET_CLEAN, nolibc) rm -rf "$(OUTPUT)sysroot" -- cgit v1.2.3 From fe20cad47e6c79febd323c8db0a1f85896acbcdd Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 28 May 2022 17:45:45 +0200 Subject: tools/nolibc: make the default target build the headers The help in "make -C tools" enumerates nolibc as a valid target, so we must at least make it do something. Let's make it do the equivalent of "make headers" in that it will prepare a sysroot with the arch's headers, but will not install the kernel's headers. This is the minimum some tools will need when built with a full-blown toolchain anyway. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/Makefile | 3 +++ tools/include/nolibc/Makefile | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/Makefile b/tools/Makefile index c074e42fd92f..e497875fc7e3 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -78,6 +78,9 @@ bpf/%: FORCE libapi: FORCE $(call descend,lib/api) +nolibc: FORCE + $(call descend,include/nolibc) + nolibc_%: FORCE $(call descend,include/nolibc,$(patsubst nolibc_%,%,$@)) diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index e8bac6ef3653..9768819abd55 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -29,7 +29,7 @@ all_files := ctype.h errno.h nolibc.h signal.h std.h stdio.h stdlib.h string.h \ sys.h time.h types.h unistd.h # install all headers needed to support a bare-metal compiler -all: +all: headers # Note: when ARCH is "x86" we concatenate both x86_64 and i386 headers: -- cgit v1.2.3 From 4f8126f3a665456b68ae923cd7a7e9b9eb98547d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 28 May 2022 17:45:46 +0200 Subject: tools/nolibc: add a help target to list supported targets The "help" target simply presents the list of supported targets and the current set of variables being used to build the sysroot. Since the help in tools/ suggests using "install", which is supported by most tools even though such a target is not really relevant here, an "install" target was also added, redirecting to "help". Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- tools/include/nolibc/Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index 9768819abd55..cfd06764b5ae 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -31,6 +31,23 @@ all_files := ctype.h errno.h nolibc.h signal.h std.h stdio.h stdlib.h string.h \ # install all headers needed to support a bare-metal compiler all: headers +install: help + +help: + @echo "Supported targets under nolibc:" + @echo " all call \"headers\"" + @echo " clean clean the sysroot" + @echo " headers prepare a sysroot in tools/include/nolibc/sysroot" + @echo " headers_standalone like \"headers\", and also install kernel headers" + @echo " help this help" + @echo "" + @echo "These targets may also be called from tools as \"make nolibc_\"." + @echo "" + @echo "Currently using the following variables:" + @echo " ARCH = $(ARCH)" + @echo " OUTPUT = $(OUTPUT)" + @echo "" + # Note: when ARCH is "x86" we concatenate both x86_64 and i386 headers: $(Q)mkdir -p $(OUTPUT)sysroot -- cgit v1.2.3 From 933ff53191eb7a8492370bad6339a1ca6da2d939 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:40 +0300 Subject: selftests/bpf: specify expected instructions in test_verifier tests Allow specifying expected and unexpected instruction sequences in test_verifier test cases. The instructions are requested from the kernel after BPF program loading, which makes it possible to check some of the transformations applied by the BPF verifier. - the `expected_insns` field specifies a sequence of instructions expected to be found in the program; - the `unexpected_insns` field specifies a sequence of instructions that are not expected to be found in the program; - the `INSN_OFF_MASK` and `INSN_IMM_MASK` values can be used to mask the `off` and `imm` fields. - `SKIP_INSNS` can be used to specify that some instructions in the (un)expected pattern are not important (behavior similar to the usage of `\t` in the `errstr` field). The intended usage is as follows: { "inline simple bpf_loop call", .insns = { /* main */ BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 6), ... BPF_EXIT_INSN(), /* callback */ BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), BPF_EXIT_INSN(), }, .expected_insns = { BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), SKIP_INSNS(), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_CALL, 8, 1) }, .unexpected_insns = { BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, INSN_OFF_MASK, INSN_IMM_MASK), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, .result = ACCEPT, .runs = 0, }, Here it is expected that the move of 1 to register 1 will remain in place and that the helper function call instruction will be replaced by a relative call instruction.
Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 234 ++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 372579c9f45e..1f24eae9e16e 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -51,6 +51,8 @@ #endif #define MAX_INSNS BPF_MAXINSNS +#define MAX_EXPECTED_INSNS 32 +#define MAX_UNEXPECTED_INSNS 32 #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 #define MAX_NR_MAPS 23 @@ -58,6 +60,10 @@ #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 +#define INSN_OFF_MASK ((__s16)0xFFFF) +#define INSN_IMM_MASK ((__s32)0xFFFFFFFF) +#define SKIP_INSNS() BPF_RAW_INSN(0xde, 0xa, 0xd, 0xbeef, 0xdeadbeef) + #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) #define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) @@ -79,6 +85,23 @@ struct bpf_test { const char *descr; struct bpf_insn insns[MAX_INSNS]; struct bpf_insn *fill_insns; + /* If specified, test engine looks for this sequence of + * instructions in the BPF program after loading. Allows to + * test rewrites applied by verifier. Use values + * INSN_OFF_MASK and INSN_IMM_MASK to mask `off` and `imm` + * fields if content does not matter. The test case fails if + * specified instructions are not found. + * + * The sequence could be split into sub-sequences by adding + * SKIP_INSNS instruction at the end of each sub-sequence. In + * such case sub-sequences are searched for one after another. + */ + struct bpf_insn expected_insns[MAX_EXPECTED_INSNS]; + /* If specified, test engine applies same pattern matching + * logic as for `expected_insns`. If the specified pattern is + * matched test case is marked as failed. 
+ */ + struct bpf_insn unexpected_insns[MAX_UNEXPECTED_INSNS]; int fixup_map_hash_8b[MAX_FIXUPS]; int fixup_map_hash_48b[MAX_FIXUPS]; int fixup_map_hash_16b[MAX_FIXUPS]; @@ -1126,6 +1149,214 @@ static bool cmp_str_seq(const char *log, const char *exp) return true; } +static int get_xlated_program(int fd_prog, struct bpf_insn **buf, int *cnt) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 xlated_prog_len; + __u32 buf_element_size = sizeof(struct bpf_insn); + + if (bpf_obj_get_info_by_fd(fd_prog, &info, &info_len)) { + perror("bpf_obj_get_info_by_fd failed"); + return -1; + } + + xlated_prog_len = info.xlated_prog_len; + if (xlated_prog_len % buf_element_size) { + printf("Program length %d is not multiple of %d\n", + xlated_prog_len, buf_element_size); + return -1; + } + + *cnt = xlated_prog_len / buf_element_size; + *buf = calloc(*cnt, buf_element_size); + if (!buf) { + perror("can't allocate xlated program buffer"); + return -ENOMEM; + } + + bzero(&info, sizeof(info)); + info.xlated_prog_len = xlated_prog_len; + info.xlated_prog_insns = (__u64)*buf; + if (bpf_obj_get_info_by_fd(fd_prog, &info, &info_len)) { + perror("second bpf_obj_get_info_by_fd failed"); + goto out_free_buf; + } + + return 0; + +out_free_buf: + free(*buf); + return -1; +} + +static bool is_null_insn(struct bpf_insn *insn) +{ + struct bpf_insn null_insn = {}; + + return memcmp(insn, &null_insn, sizeof(null_insn)) == 0; +} + +static bool is_skip_insn(struct bpf_insn *insn) +{ + struct bpf_insn skip_insn = SKIP_INSNS(); + + return memcmp(insn, &skip_insn, sizeof(skip_insn)) == 0; +} + +static int null_terminated_insn_len(struct bpf_insn *seq, int max_len) +{ + int i; + + for (i = 0; i < max_len; ++i) { + if (is_null_insn(&seq[i])) + return i; + } + return max_len; +} + +static bool compare_masked_insn(struct bpf_insn *orig, struct bpf_insn *masked) +{ + struct bpf_insn orig_masked; + + memcpy(&orig_masked, orig, sizeof(orig_masked)); + if (masked->imm == INSN_IMM_MASK) + orig_masked.imm = INSN_IMM_MASK; + if (masked->off == INSN_OFF_MASK) + orig_masked.off = INSN_OFF_MASK; + + return memcmp(&orig_masked, masked, sizeof(orig_masked)) == 0; +} + +static int find_insn_subseq(struct bpf_insn *seq, struct bpf_insn *subseq, + int seq_len, int subseq_len) +{ + int i, j; + + if (subseq_len > seq_len) + return -1; + + for (i = 0; i < seq_len - subseq_len + 1; ++i) { + bool found = true; + + for (j = 0; j < subseq_len; ++j) { + if (!compare_masked_insn(&seq[i + j], &subseq[j])) { + found = false; + break; + } + } + if (found) + return i; + } + + return -1; +} + +static int find_skip_insn_marker(struct bpf_insn *seq, int len) +{ + int i; + + for (i = 0; i < len; ++i) + if (is_skip_insn(&seq[i])) + return i; + + return -1; +} + +/* Return true if all sub-sequences in `subseqs` could be found in + * `seq` one after another. Sub-sequences are separated by a single + * nil instruction. + */ +static bool find_all_insn_subseqs(struct bpf_insn *seq, struct bpf_insn *subseqs, + int seq_len, int max_subseqs_len) +{ + int subseqs_len = null_terminated_insn_len(subseqs, max_subseqs_len); + + while (subseqs_len > 0) { + int skip_idx = find_skip_insn_marker(subseqs, subseqs_len); + int cur_subseq_len = skip_idx < 0 ? 
subseqs_len : skip_idx; + int subseq_idx = find_insn_subseq(seq, subseqs, + seq_len, cur_subseq_len); + + if (subseq_idx < 0) + return false; + seq += subseq_idx + cur_subseq_len; + seq_len -= subseq_idx + cur_subseq_len; + subseqs += cur_subseq_len + 1; + subseqs_len -= cur_subseq_len + 1; + } + + return true; +} + +static void print_insn(struct bpf_insn *buf, int cnt) +{ + int i; + + printf(" addr op d s off imm\n"); + for (i = 0; i < cnt; ++i) { + struct bpf_insn *insn = &buf[i]; + + if (is_null_insn(insn)) + break; + + if (is_skip_insn(insn)) + printf(" ...\n"); + else + printf(" %04x: %02x %1x %x %04hx %08x\n", + i, insn->code, insn->dst_reg, + insn->src_reg, insn->off, insn->imm); + } +} + +static bool check_xlated_program(struct bpf_test *test, int fd_prog) +{ + struct bpf_insn *buf; + int cnt; + bool result = true; + bool check_expected = !is_null_insn(test->expected_insns); + bool check_unexpected = !is_null_insn(test->unexpected_insns); + + if (!check_expected && !check_unexpected) + goto out; + + if (get_xlated_program(fd_prog, &buf, &cnt)) { + printf("FAIL: can't get xlated program\n"); + result = false; + goto out; + } + + if (check_expected && + !find_all_insn_subseqs(buf, test->expected_insns, + cnt, MAX_EXPECTED_INSNS)) { + printf("FAIL: can't find expected subsequence of instructions\n"); + result = false; + if (verbose) { + printf("Program:\n"); + print_insn(buf, cnt); + printf("Expected subsequence:\n"); + print_insn(test->expected_insns, MAX_EXPECTED_INSNS); + } + } + + if (check_unexpected && + find_all_insn_subseqs(buf, test->unexpected_insns, + cnt, MAX_UNEXPECTED_INSNS)) { + printf("FAIL: found unexpected subsequence of instructions\n"); + result = false; + if (verbose) { + printf("Program:\n"); + print_insn(buf, cnt); + printf("Un-expected subsequence:\n"); + print_insn(test->unexpected_insns, MAX_UNEXPECTED_INSNS); + } + } + + free(buf); + out: + return result; +} + static void do_test_single(struct bpf_test *test, bool unpriv, int *passes, int *errors) { @@ -1262,6 +1493,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv, if (verbose) printf(", verifier log:\n%s", bpf_vlog); + if (!check_xlated_program(test, fd_prog)) + goto fail_log; + run_errs = 0; run_successes = 0; if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) { -- cgit v1.2.3 From 7a42008ca5c700819e4b3003025e5e1695fd1f86 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:41 +0300 Subject: selftests/bpf: allow BTF specs and func infos in test_verifier tests The BTF and func_info specification for test_verifier tests follows the same notation as in prog_tests/btf.c tests. E.g.: ... .func_info = { { 0, 6 }, { 8, 7 } }, .func_info_cnt = 2, .btf_strings = "\0int\0", .btf_types = { BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), BTF_PTR_ENC(1), }, ... The BTF specification is loaded only when specified. 
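For readers unfamiliar with the raw BTF macros, the example above can
be read roughly as follows (my annotation, not text from the patch;
string offsets index into btf_strings and type ids are 1-based):

  /* btf_strings = "\0int\0": the name "int" starts at string offset 1 */
  BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* type 1: signed 32-bit "int" */
  BTF_PTR_ENC(1),                                /* type 2: pointer to type 1, i.e. int * */

Each { insn_off, type_id } pair in func_info attaches a BTF function
type to the subprogram starting at instruction insn_off, mirroring the
struct bpf_func_info records accepted by BPF_PROG_LOAD; type ids 6 and
7 in the example refer to FUNC entries among the elided types.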
Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 1 - tools/testing/selftests/bpf/test_btf.h | 2 + tools/testing/selftests/bpf/test_verifier.c | 94 +++++++++++++++++++++++----- 3 files changed, 79 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index edb387163baa..1fd792a92a1c 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -34,7 +34,6 @@ static bool always_log; #undef CHECK #define CHECK(condition, format...) _CHECK(condition, "check", duration, format) -#define BTF_END_RAW 0xdeadbeef #define NAME_TBD 0xdeadb33f #define NAME_NTH(N) (0xfffe0000 | N) diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h index 38782bd47fdc..fb4f4714eeb4 100644 --- a/tools/testing/selftests/bpf/test_btf.h +++ b/tools/testing/selftests/bpf/test_btf.h @@ -4,6 +4,8 @@ #ifndef _TEST_BTF_H #define _TEST_BTF_H +#define BTF_END_RAW 0xdeadbeef + #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1f24eae9e16e..7fe897c66d81 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -59,11 +59,17 @@ #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 +#define MAX_FUNC_INFOS 8 +#define MAX_BTF_STRINGS 256 +#define MAX_BTF_TYPES 256 #define INSN_OFF_MASK ((__s16)0xFFFF) #define INSN_IMM_MASK ((__s32)0xFFFFFFFF) #define SKIP_INSNS() BPF_RAW_INSN(0xde, 0xa, 0xd, 0xbeef, 0xdeadbeef) +#define DEFAULT_LIBBPF_LOG_LEVEL 4 +#define VERBOSE_LIBBPF_LOG_LEVEL 1 + #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) #define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) @@ -158,6 +164,14 @@ struct bpf_test { }; enum bpf_attach_type expected_attach_type; const char *kfunc; + struct bpf_func_info func_info[MAX_FUNC_INFOS]; + int func_info_cnt; + char btf_strings[MAX_BTF_STRINGS]; + /* A set of BTF types to load when specified, + * use macro definitions from test_btf.h, + * must end with BTF_END_RAW + */ + __u32 btf_types[MAX_BTF_TYPES]; }; /* Note we want this to be 64 bit aligned so that the end of our array is @@ -687,34 +701,66 @@ static __u32 btf_raw_types[] = { BTF_MEMBER_ENC(71, 13, 128), /* struct prog_test_member __kptr_ref *ptr; */ }; -static int load_btf(void) +static char bpf_vlog[UINT_MAX >> 8]; + +static int load_btf_spec(__u32 *types, int types_len, + const char *strings, int strings_len) { struct btf_header hdr = { .magic = BTF_MAGIC, .version = BTF_VERSION, .hdr_len = sizeof(struct btf_header), - .type_len = sizeof(btf_raw_types), - .str_off = sizeof(btf_raw_types), - .str_len = sizeof(btf_str_sec), + .type_len = types_len, + .str_off = types_len, + .str_len = strings_len, }; void *ptr, *raw_btf; int btf_fd; + LIBBPF_OPTS(bpf_btf_load_opts, opts, + .log_buf = bpf_vlog, + .log_size = sizeof(bpf_vlog), + .log_level = (verbose + ? 
VERBOSE_LIBBPF_LOG_LEVEL + : DEFAULT_LIBBPF_LOG_LEVEL), + ); - ptr = raw_btf = malloc(sizeof(hdr) + sizeof(btf_raw_types) + - sizeof(btf_str_sec)); + raw_btf = malloc(sizeof(hdr) + types_len + strings_len); + ptr = raw_btf; memcpy(ptr, &hdr, sizeof(hdr)); ptr += sizeof(hdr); - memcpy(ptr, btf_raw_types, hdr.type_len); + memcpy(ptr, types, hdr.type_len); ptr += hdr.type_len; - memcpy(ptr, btf_str_sec, hdr.str_len); + memcpy(ptr, strings, hdr.str_len); ptr += hdr.str_len; - btf_fd = bpf_btf_load(raw_btf, ptr - raw_btf, NULL); - free(raw_btf); + btf_fd = bpf_btf_load(raw_btf, ptr - raw_btf, &opts); if (btf_fd < 0) - return -1; - return btf_fd; + printf("Failed to load BTF spec: '%s'\n", strerror(errno)); + + free(raw_btf); + + return btf_fd < 0 ? -1 : btf_fd; +} + +static int load_btf(void) +{ + return load_btf_spec(btf_raw_types, sizeof(btf_raw_types), + btf_str_sec, sizeof(btf_str_sec)); +} + +static int load_btf_for_test(struct bpf_test *test) +{ + int types_num = 0; + + while (types_num < MAX_BTF_TYPES && + test->btf_types[types_num] != BTF_END_RAW) + ++types_num; + + int types_len = types_num * sizeof(test->btf_types[0]); + + return load_btf_spec(test->btf_types, types_len, + test->btf_strings, sizeof(test->btf_strings)); } static int create_map_spin_lock(void) @@ -793,8 +839,6 @@ static int create_map_kptr(void) return fd; } -static char bpf_vlog[UINT_MAX >> 8]; - static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, struct bpf_insn *prog, int *map_fds) { @@ -1360,7 +1404,7 @@ static bool check_xlated_program(struct bpf_test *test, int fd_prog) static void do_test_single(struct bpf_test *test, bool unpriv, int *passes, int *errors) { - int fd_prog, expected_ret, alignment_prevented_execution; + int fd_prog, btf_fd, expected_ret, alignment_prevented_execution; int prog_len, prog_type = test->prog_type; struct bpf_insn *prog = test->insns; LIBBPF_OPTS(bpf_prog_load_opts, opts); @@ -1372,8 +1416,10 @@ static void do_test_single(struct bpf_test *test, bool unpriv, __u32 pflags; int i, err; + fd_prog = -1; for (i = 0; i < MAX_NR_MAPS; i++) map_fds[i] = -1; + btf_fd = -1; if (!prog_type) prog_type = BPF_PROG_TYPE_SOCKET_FILTER; @@ -1406,11 +1452,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv, opts.expected_attach_type = test->expected_attach_type; if (verbose) - opts.log_level = 1; + opts.log_level = VERBOSE_LIBBPF_LOG_LEVEL; else if (expected_ret == VERBOSE_ACCEPT) opts.log_level = 2; else - opts.log_level = 4; + opts.log_level = DEFAULT_LIBBPF_LOG_LEVEL; opts.prog_flags = pflags; if (prog_type == BPF_PROG_TYPE_TRACING && test->kfunc) { @@ -1428,6 +1474,19 @@ static void do_test_single(struct bpf_test *test, bool unpriv, opts.attach_btf_id = attach_btf_id; } + if (test->btf_types[0] != 0) { + btf_fd = load_btf_for_test(test); + if (btf_fd < 0) + goto fail_log; + opts.prog_btf_fd = btf_fd; + } + + if (test->func_info_cnt != 0) { + opts.func_info = test->func_info; + opts.func_info_cnt = test->func_info_cnt; + opts.func_info_rec_size = sizeof(test->func_info[0]); + } + opts.log_buf = bpf_vlog; opts.log_size = sizeof(bpf_vlog); fd_prog = bpf_prog_load(prog_type, NULL, "GPL", prog, prog_len, &opts); @@ -1539,6 +1598,7 @@ close_fds: if (test->fill_insns) free(test->fill_insns); close(fd_prog); + close(btf_fd); for (i = 0; i < MAX_NR_MAPS; i++) close(map_fds[i]); sched_yield(); -- cgit v1.2.3 From f8acfdd04410d26b096a7082444cdc402df10f89 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:43 +0300 Subject: selftests/bpf: BPF 
test_verifier selftests for bpf_loop inlining A number of test cases for BPF selftests test_verifier to check how bpf_loop inline transformation rewrites the BPF program. The following cases are covered: - happy path - no-rewrite when flags is non-zero - no-rewrite when callback is non-constant - subprogno in insn_aux is updated correctly when dead sub-programs are removed - check that correct stack offsets are assigned for spilling of R6-R8 registers Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/bpf_loop_inline.c | 252 +++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 tools/testing/selftests/bpf/verifier/bpf_loop_inline.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c new file mode 100644 index 000000000000..232da07c93b5 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c @@ -0,0 +1,252 @@ +#define BTF_TYPES \ + .btf_strings = "\0int\0i\0ctx\0callback\0main\0", \ + .btf_types = { \ + /* 1: int */ BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), \ + /* 2: int* */ BTF_PTR_ENC(1), \ + /* 3: void* */ BTF_PTR_ENC(0), \ + /* 4: int __(void*) */ BTF_FUNC_PROTO_ENC(1, 1), \ + BTF_FUNC_PROTO_ARG_ENC(7, 3), \ + /* 5: int __(int, int*) */ BTF_FUNC_PROTO_ENC(1, 2), \ + BTF_FUNC_PROTO_ARG_ENC(5, 1), \ + BTF_FUNC_PROTO_ARG_ENC(7, 2), \ + /* 6: main */ BTF_FUNC_ENC(20, 4), \ + /* 7: callback */ BTF_FUNC_ENC(11, 5), \ + BTF_END_RAW \ + } + +#define MAIN_TYPE 6 +#define CALLBACK_TYPE 7 + +/* can't use BPF_CALL_REL, jit_subprogs adjusts IMM & OFF + * fields for pseudo calls + */ +#define PSEUDO_CALL_INSN() \ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_CALL, \ + INSN_OFF_MASK, INSN_IMM_MASK) + +/* can't use BPF_FUNC_loop constant, + * do_mix_fixups adjusts the IMM field + */ +#define HELPER_CALL_INSN() \ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, INSN_OFF_MASK, INSN_IMM_MASK) + +{ + "inline simple bpf_loop call", + .insns = { + /* main */ + /* force verifier state branching to verify logic on first and + * subsequent bpf_loop insn processing steps + */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 777, 2), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 2), + + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 6), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { PSEUDO_CALL_INSN() }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { { 0, MAIN_TYPE }, { 12, CALLBACK_TYPE } }, + .func_info_cnt = 2, + BTF_TYPES +}, +{ + "don't inline bpf_loop call, flags non-zero", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_ALU64_REG(BPF_MOV, BPF_REG_7, BPF_REG_0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 9), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 
0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 7), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, -10), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { HELPER_CALL_INSN() }, + .unexpected_insns = { PSEUDO_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { { 0, MAIN_TYPE }, { 16, CALLBACK_TYPE } }, + .func_info_cnt = 2, + BTF_TYPES +}, +{ + "don't inline bpf_loop call, callback non-constant", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 777, 4), /* pick a random callback */ + + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 10), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 8), + BPF_RAW_INSN(0, 0, 0, 0, 0), + + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* callback #2 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { HELPER_CALL_INSN() }, + .unexpected_insns = { PSEUDO_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { + { 0, MAIN_TYPE }, + { 14, CALLBACK_TYPE }, + { 16, CALLBACK_TYPE } + }, + .func_info_cnt = 3, + BTF_TYPES +}, +{ + "bpf_loop_inline and a dead func", + .insns = { + /* main */ + + /* A reference to callback #1 to make verifier count it as a func. + * This reference is overwritten below and callback #1 is dead. 
+ */ + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 9), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 8), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* callback #2 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { PSEUDO_CALL_INSN() }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { + { 0, MAIN_TYPE }, + { 10, CALLBACK_TYPE }, + { 12, CALLBACK_TYPE } + }, + .func_info_cnt = 3, + BTF_TYPES +}, +{ + "bpf_loop_inline stack locations for loop vars", + .insns = { + /* main */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -12, 0x77), + /* bpf_loop call #1 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 22), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + /* bpf_loop call #2 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 2), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 16), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + /* call func and exit */ + BPF_CALL_REL(2), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* func */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -32, 0x55), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 2), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 6), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -12, 0x77), + SKIP_INSNS(), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -24), + SKIP_INSNS(), + /* offsets are the same as in the first call */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -24), + SKIP_INSNS(), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -32, 0x55), + SKIP_INSNS(), + /* offsets differ from main because of different offset + * in BPF_ST_MEM instruction + */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -40), + }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .func_info = { + { 0, MAIN_TYPE }, + { 16, MAIN_TYPE }, + { 25, CALLBACK_TYPE }, + }, + .func_info_cnt = 3, + BTF_TYPES +}, + +#undef HELPER_CALL_INSN +#undef PSEUDO_CALL_INSN +#undef CALLBACK_TYPE +#undef MAIN_TYPE +#undef BTF_TYPES -- cgit v1.2.3 From 0e1bf9ed2000c16fa8e0703e255a23d64a4adb27 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:44 +0300 Subject: selftests/bpf: 
BPF test_prog selftests for bpf_loop inlining

Add two new test BPF programs to the test_prog selftests, checking
bpf_loop behavior. Both are corner cases for the bpf_loop inlining
transformation:
- check that bpf_loop behaves correctly when the callback function is
  not a compile-time constant
- check that local function variables are not affected by allocating
  additional stack storage for registers spilled by loop inlining

Signed-off-by: Eduard Zingerman
Acked-by: Song Liu
Link: https://lore.kernel.org/r/20220620235344.569325-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 tools/testing/selftests/bpf/prog_tests/bpf_loop.c |  62 ++++++++++++
 tools/testing/selftests/bpf/progs/bpf_loop.c      | 114 ++++++++++++++++++++++
 2 files changed, 176 insertions(+)
(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_loop.c b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c
index 380d7a2072e3..4cd8a25afe68 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_loop.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c
@@ -120,6 +120,64 @@ static void check_nested_calls(struct bpf_loop *skel)
 	bpf_link__destroy(link);
 }
 
+static void check_non_constant_callback(struct bpf_loop *skel)
+{
+	struct bpf_link *link =
+		bpf_program__attach(skel->progs.prog_non_constant_callback);
+
+	if (!ASSERT_OK_PTR(link, "link"))
+		return;
+
+	skel->bss->callback_selector = 0x0F;
+	usleep(1);
+	ASSERT_EQ(skel->bss->g_output, 0x0F, "g_output #1");
+
+	skel->bss->callback_selector = 0xF0;
+	usleep(1);
+	ASSERT_EQ(skel->bss->g_output, 0xF0, "g_output #2");
+
+	bpf_link__destroy(link);
+}
+
+static void check_stack(struct bpf_loop *skel)
+{
+	struct bpf_link *link = bpf_program__attach(skel->progs.stack_check);
+	const int max_key = 12;
+	int key;
+	int map_fd;
+
+	if (!ASSERT_OK_PTR(link, "link"))
+		return;
+
+	map_fd = bpf_map__fd(skel->maps.map1);
+
+	if (!ASSERT_GE(map_fd, 0, "bpf_map__fd"))
+		goto out;
+
+	for (key = 1; key <= max_key; ++key) {
+		int val = key;
+		int err = bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST);
+
+		if (!ASSERT_OK(err, "bpf_map_update_elem"))
+			goto out;
+	}
+
+	usleep(1);
+
+	for (key = 1; key <= max_key; ++key) {
+		int val;
+		int err = bpf_map_lookup_elem(map_fd, &key, &val);
+
+		if (!ASSERT_OK(err, "bpf_map_lookup_elem"))
+			goto out;
+		if (!ASSERT_EQ(val, key + 1, "bad value in the map"))
+			goto out;
+	}
+
+out:
+	bpf_link__destroy(link);
+}
+
 void test_bpf_loop(void)
 {
 	struct bpf_loop *skel;
@@ -140,6 +198,10 @@ void test_bpf_loop(void)
 		check_invalid_flags(skel);
 	if (test__start_subtest("check_nested_calls"))
 		check_nested_calls(skel);
+	if (test__start_subtest("check_non_constant_callback"))
+		check_non_constant_callback(skel);
+	if (test__start_subtest("check_stack"))
+		check_stack(skel);
 
 	bpf_loop__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/progs/bpf_loop.c b/tools/testing/selftests/bpf/progs/bpf_loop.c
index e08565282759..de1fc82d2710 100644
--- a/tools/testing/selftests/bpf/progs/bpf_loop.c
+++ b/tools/testing/selftests/bpf/progs/bpf_loop.c
@@ -11,11 +11,19 @@ struct callback_ctx {
 	int output;
 };
 
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 32);
+	__type(key, int);
+	__type(value, int);
+} map1 SEC(".maps");
+
 /* These should be set by the user program */
 u32 nested_callback_nr_loops;
 u32 stop_index = -1;
 u32 nr_loops;
 int pid;
+int callback_selector;
 
 /* Making these global variables so that the userspace program
  * can verify the output through the skeleton
@@ -111,3 +119,109 @@ int prog_nested_calls(void *ctx)
 
 	return 0;
 }
+
+static
int callback_set_f0(int i, void *ctx) +{ + g_output = 0xF0; + return 0; +} + +static int callback_set_0f(int i, void *ctx) +{ + g_output = 0x0F; + return 0; +} + +/* + * non-constant callback is a corner case for bpf_loop inline logic + */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int prog_non_constant_callback(void *ctx) +{ + struct callback_ctx data = {}; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + int (*callback)(int i, void *ctx); + + g_output = 0; + + if (callback_selector == 0x0F) + callback = callback_set_0f; + else + callback = callback_set_f0; + + bpf_loop(1, callback, NULL, 0); + + return 0; +} + +static int stack_check_inner_callback(void *ctx) +{ + return 0; +} + +static int map1_lookup_elem(int key) +{ + int *val = bpf_map_lookup_elem(&map1, &key); + + return val ? *val : -1; +} + +static void map1_update_elem(int key, int val) +{ + bpf_map_update_elem(&map1, &key, &val, BPF_ANY); +} + +static int stack_check_outer_callback(void *ctx) +{ + int a = map1_lookup_elem(1); + int b = map1_lookup_elem(2); + int c = map1_lookup_elem(3); + int d = map1_lookup_elem(4); + int e = map1_lookup_elem(5); + int f = map1_lookup_elem(6); + + bpf_loop(1, stack_check_inner_callback, NULL, 0); + + map1_update_elem(1, a + 1); + map1_update_elem(2, b + 1); + map1_update_elem(3, c + 1); + map1_update_elem(4, d + 1); + map1_update_elem(5, e + 1); + map1_update_elem(6, f + 1); + + return 0; +} + +/* Some of the local variables in stack_check and + * stack_check_outer_callback would be allocated on stack by + * compiler. This test should verify that stack content for these + * variables is preserved between calls to bpf_loop (might be an issue + * if loop inlining allocates stack slots incorrectly). + */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int stack_check(void *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + int a = map1_lookup_elem(7); + int b = map1_lookup_elem(8); + int c = map1_lookup_elem(9); + int d = map1_lookup_elem(10); + int e = map1_lookup_elem(11); + int f = map1_lookup_elem(12); + + bpf_loop(1, stack_check_outer_callback, NULL, 0); + + map1_update_elem(7, a + 1); + map1_update_elem(8, b + 1); + map1_update_elem(9, c + 1); + map1_update_elem(10, d + 1); + map1_update_elem(11, e + 1); + map1_update_elem(12, f + 1); + + return 0; +} -- cgit v1.2.3 From 5c92d7501699a5deb72a579f808500db5bb6f92a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 13:22:28 -0700 Subject: torture: Adjust to again produce debugging information A recent change to the DEBUG_INFO Kconfig option means that simply adding CONFIG_DEBUG_INFO=y to the .config file and running "make oldconfig" no longer works. It is instead necessary to add CONFIG_DEBUG_INFO_NONE=n and (for example) CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y. This combination will then result in CONFIG_DEBUG_INFO being selected. This commit therefore updates the Kconfig options produced in response to the kvm.sh --gdb, --kasan, and --kcsan Kconfig options. Fixes: f9b3cd245784 ("Kconfig.debug: make DEBUG_INFO selectable from a choice") Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 263e16aeca0e..6c734818a875 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -164,7 +164,7 @@ do shift ;; --gdb) - TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG + TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG ;; @@ -180,7 +180,7 @@ do shift ;; --kasan) - TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG if test -n "$torture_qemu_mem_default" then TORTURE_QEMU_MEM=2G @@ -192,7 +192,7 @@ do shift ;; --kcsan) - TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG + TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' -- cgit v1.2.3 From 148df92fb14e5aab1c84b8ca4d2181e7b117290d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jun 2022 19:53:52 -0700 Subject: torture: Create kvm-check-branches.sh output in proper location Currently, kvm-check-branches.sh causes each kvm.sh invocation create a separate date-stamped directory, then after that invocation completes, moves it into the *-group/NNNN directory. This works, but makes it more difficult to monitor an ongoing run. This commit therefore uses the kvm.sh --datestamp argument to make kvm.sh put the output in the right place to start with, and also dispenses with the additional level of datestamping. (Those wanting datestamps can find them in the log files.) Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh index f17000a2ccf1..ed0ec7f0927e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh @@ -35,7 +35,7 @@ then exit 1 fi -# Remember where we started so that we can get back and the end. +# Remember where we started so that we can get back at the end. curcommit="`git status | head -1 | awk '{ print $NF }'`" nfail=0 @@ -73,15 +73,10 @@ do # Test the specified commit. git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1 echo git checkout return code: $? 
"(Commit $ntry: $i)" - kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 + kvm.sh --allcpus --duration 3 --trust-make --datestamp "$ds/$idir" > $resdir/$ds/$idir/kvm.sh.out 2>&1 ret=$? echo kvm.sh return code $ret for commit $i from branch $gitbr - - # Move the build products to their resting place. - runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`" - mv $runresdir $resdir/$ds/$idir - rrd="`echo $runresdir | sed -e 's,^.*/,,'`" - echo Run results: $resdir/$ds/$idir/$rrd + echo Run results: $resdir/$ds/$idir if test "$ret" -ne 0 then # Failure, so leave all evidence intact. -- cgit v1.2.3 From 2325d4dd7321cd569f996c5d091f4f83efb25693 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jun 2022 22:45:16 +0000 Subject: KVM: selftests: Add MONITOR/MWAIT quirk test Add a test to verify the "MONITOR/MWAIT never fault" quirk, and as a bonus, also verify the related "MISC_ENABLES ignores ENABLE_MWAIT" quirk. If the "never fault" quirk is enabled, MONITOR/MWAIT should always be emulated as NOPs, even if they're reported as disabled in guest CPUID. Use the MISC_ENABLES quirk to coerce KVM into toggling the MWAIT CPUID enable, as KVM now disallows manually toggling CPUID bits after running the vCPU. Signed-off-by: Sean Christopherson Message-Id: <20220608224516.3788274-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/x86_64/monitor_mwait_test | Bin 1485656 -> 0 bytes .../selftests/kvm/x86_64/monitor_mwait_test.c | 137 +++++++++++++++++++++ 4 files changed, 139 insertions(+) delete mode 100755 tools/testing/selftests/kvm/x86_64/monitor_mwait_test create mode 100644 tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index dd5c88c11059..f3c2f074b948 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -28,6 +28,7 @@ /x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/mmu_role_test +/x86_64/monitor_mwait_test /x86_64/platform_info_test /x86_64/pmu_event_filter_test /x86_64/set_boot_cpu_id diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index b52c130f7b2f..ad1634e5659f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -84,6 +84,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test +TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test deleted file mode 100755 index 598b597e1eec..000000000000 Binary files a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test and /dev/null differ diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c new file mode 100644 index 000000000000..49f2ed1c53fe --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include 
+#include + +#include "kvm_util.h" +#include "processor.h" + +#define X86_FEATURE_MWAIT (1u << 3) + +enum monitor_mwait_testcases { + MWAIT_QUIRK_DISABLED = BIT(0), + MISC_ENABLES_QUIRK_DISABLED = BIT(1), + MWAIT_DISABLED = BIT(2), +}; + +static void guest_monitor_wait(int testcase) +{ + /* + * If both MWAIT and its quirk are disabled, MONITOR/MWAIT should #UD, + * in all other scenarios KVM should emulate them as nops. + */ + bool fault_wanted = (testcase & MWAIT_QUIRK_DISABLED) && + (testcase & MWAIT_DISABLED); + u8 vector; + + GUEST_SYNC(testcase); + + vector = kvm_asm_safe("monitor"); + if (fault_wanted) + GUEST_ASSERT_2(vector == UD_VECTOR, testcase, vector); + else + GUEST_ASSERT_2(!vector, testcase, vector); + + vector = kvm_asm_safe("monitor"); + if (fault_wanted) + GUEST_ASSERT_2(vector == UD_VECTOR, testcase, vector); + else + GUEST_ASSERT_2(!vector, testcase, vector); +} + +static void guest_code(void) +{ + guest_monitor_wait(MWAIT_DISABLED); + + guest_monitor_wait(MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); + + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_DISABLED); + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED); + + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + uint64_t disabled_quirks; + struct kvm_cpuid2 *cpuid; + struct kvm_cpuid_entry2 *entry; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + int testcase; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); + + cpuid = kvm_get_supported_cpuid(); + + entry = kvm_get_supported_cpuid_index(1, 0); + entry->ecx &= ~X86_FEATURE_MWAIT; + set_cpuid(cpuid, entry); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_set_cpuid(vcpu, cpuid); + + run = vcpu->run; + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + while (1) { + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + testcase = uc.args[1]; + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld, testcase = %lx, vector = %ld", + (const char *)uc.args[0], __FILE__, + uc.args[1], uc.args[2], uc.args[3]); + goto done; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + goto done; + } + + disabled_quirks = 0; + if (testcase & MWAIT_QUIRK_DISABLED) + disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_FAULTS; + if (testcase & MISC_ENABLES_QUIRK_DISABLED) + disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); + + /* + * If the MISC_ENABLES quirk (KVM neglects to update CPUID to + * enable/disable MWAIT) is disabled, toggle the ENABLE_MWAIT + * bit in MISC_ENABLES accordingly. If the quirk is enabled, + * the only valid configuration is MWAIT disabled, as CPUID + * can't be manually changed after running the vCPU. + */ + if (!(testcase & MISC_ENABLES_QUIRK_DISABLED)) { + TEST_ASSERT(testcase & MWAIT_DISABLED, + "Can't toggle CPUID features after running vCPU"); + continue; + } + + vcpu_set_msr(vcpu, MSR_IA32_MISC_ENABLE, + (testcase & MWAIT_DISABLED) ? 
0 : MSR_IA32_MISC_ENABLE_MWAIT);
+	}
+
+done:
+	kvm_vm_free(vm);
+	return 0;
+}
--
cgit v1.2.3


From 73087489250def7cdda2dee5ba685bdeae73b8af Mon Sep 17 00:00:00 2001
From: Dave Marchevsky
Date: Mon, 20 Jun 2022 15:25:54 -0700
Subject: selftests/bpf: Add benchmark for local_storage get

Add benchmarks to demonstrate the performance cliff for local_storage
get as the number of local_storage maps increases beyond the current
local_storage implementation's cache size.

"sequential get" and "interleaved get" benchmarks are added, both of
which do many bpf_task_storage_get calls on sets of task local_storage
maps of various counts, while considering a single specific map to be
'important' and counting task_storage_gets to the important map
separately in addition to the normal 'hits' count of all gets. The
goal here is to mimic a scenario where a particular program using one
map - the important one - is running on a system where many other
local_storage maps exist and are accessed often.

While the "sequential get" benchmark does bpf_task_storage_get for map
0, 1, ..., {9, 99, 999} in order, the "interleaved" benchmark
interleaves 4 bpf_task_storage_gets for the important map for every 10
map gets. This is meant to highlight performance differences when the
important map is accessed far more frequently than non-important maps.

A "hashmap control" benchmark is also included for easy comparison of
standard bpf hashmap lookup vs local_storage get. The benchmark is
similar to "sequential get", but creates and uses BPF_MAP_TYPE_HASH
instead of local storage. Only one inner map is created - a hashmap
meant to hold tid -> data mapping for all tasks. Size of the hashmap
is hardcoded to my system's PID_MAX_LIMIT (4,194,304). The number of
these keys which are actually fetched as part of the benchmark is
configurable.

Addition of this benchmark is inspired by conversation with Alexei in
a previous patchset's thread [0], which highlighted the need for such
a benchmark to motivate and validate improvements to the local_storage
implementation. My approach in that series focused on improving
performance for explicitly-marked 'important' maps and was rejected
with feedback to make more generally-applicable improvements while
avoiding explicitly marking maps as important. Thus the benchmark
reports both general and important-map-focused metrics, so the effect
of future work on both is clear.
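As a rough sketch, the two local_storage access patterns described
above have the following shape (illustrative pseudo-C only, assuming a
loop of this form; the actual BPF program is added as
progs/local_storage_bench.c in this patch):

  /* one benchmark iteration; maps[0] is the 'important' map */
  for (i = 0; i < num_maps; i++) {
	bpf_task_storage_get(maps[i], task, 0, BPF_LOCAL_STORAGE_GET_F_CREATE);
	hits++;
	if (i == 0)
		important_hits++;
	if (interleave && i % 3 == 0) {
		/* extra important-map gets: 4 for every 10 map gets */
		bpf_task_storage_get(maps[0], task, 0, BPF_LOCAL_STORAGE_GET_F_CREATE);
		hits++;
		important_hits++;
	}
  }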
Regarding the benchmark results, on a powerful system (Skylake, 20
cores, 256gb ram):

Hashmap Control
===============
num keys: 10
hashmap (control) sequential get: hits throughput: 20.900 ± 0.334 M ops/s, hits latency: 47.847 ns/op, important_hits throughput: 20.900 ± 0.334 M ops/s

num keys: 1000
hashmap (control) sequential get: hits throughput: 13.758 ± 0.219 M ops/s, hits latency: 72.683 ns/op, important_hits throughput: 13.758 ± 0.219 M ops/s

num keys: 10000
hashmap (control) sequential get: hits throughput: 6.995 ± 0.034 M ops/s, hits latency: 142.959 ns/op, important_hits throughput: 6.995 ± 0.034 M ops/s

num keys: 100000
hashmap (control) sequential get: hits throughput: 4.452 ± 0.371 M ops/s, hits latency: 224.635 ns/op, important_hits throughput: 4.452 ± 0.371 M ops/s

num keys: 4194304
hashmap (control) sequential get: hits throughput: 3.043 ± 0.033 M ops/s, hits latency: 328.587 ns/op, important_hits throughput: 3.043 ± 0.033 M ops/s

Local Storage
=============
num_maps: 1
local_storage cache sequential get: hits throughput: 47.298 ± 0.180 M ops/s, hits latency: 21.142 ns/op, important_hits throughput: 47.298 ± 0.180 M ops/s
local_storage cache interleaved get: hits throughput: 55.277 ± 0.888 M ops/s, hits latency: 18.091 ns/op, important_hits throughput: 55.277 ± 0.888 M ops/s

num_maps: 10
local_storage cache sequential get: hits throughput: 40.240 ± 0.802 M ops/s, hits latency: 24.851 ns/op, important_hits throughput: 4.024 ± 0.080 M ops/s
local_storage cache interleaved get: hits throughput: 48.701 ± 0.722 M ops/s, hits latency: 20.533 ns/op, important_hits throughput: 17.393 ± 0.258 M ops/s

num_maps: 16
local_storage cache sequential get: hits throughput: 44.515 ± 0.708 M ops/s, hits latency: 22.464 ns/op, important_hits throughput: 2.782 ± 0.044 M ops/s
local_storage cache interleaved get: hits throughput: 49.553 ± 2.260 M ops/s, hits latency: 20.181 ns/op, important_hits throughput: 15.767 ± 0.719 M ops/s

num_maps: 17
local_storage cache sequential get: hits throughput: 38.778 ± 0.302 M ops/s, hits latency: 25.788 ns/op, important_hits throughput: 2.284 ± 0.018 M ops/s
local_storage cache interleaved get: hits throughput: 43.848 ± 1.023 M ops/s, hits latency: 22.806 ns/op, important_hits throughput: 13.349 ± 0.311 M ops/s

num_maps: 24
local_storage cache sequential get: hits throughput: 19.317 ± 0.568 M ops/s, hits latency: 51.769 ns/op, important_hits throughput: 0.806 ± 0.024 M ops/s
local_storage cache interleaved get: hits throughput: 24.397 ± 0.272 M ops/s, hits latency: 40.989 ns/op, important_hits throughput: 6.863 ± 0.077 M ops/s

num_maps: 32
local_storage cache sequential get: hits throughput: 13.333 ± 0.135 M ops/s, hits latency: 75.000 ns/op, important_hits throughput: 0.417 ± 0.004 M ops/s
local_storage cache interleaved get: hits throughput: 16.898 ± 0.383 M ops/s, hits latency: 59.178 ns/op, important_hits throughput: 4.717 ± 0.107 M ops/s

num_maps: 100
local_storage cache sequential get: hits throughput: 6.360 ± 0.107 M ops/s, hits latency: 157.233 ns/op, important_hits throughput: 0.064 ± 0.001 M ops/s
local_storage cache interleaved get: hits throughput: 7.303 ± 0.362 M ops/s, hits latency: 136.930 ns/op, important_hits throughput: 1.907 ± 0.094 M ops/s

num_maps: 1000
local_storage cache sequential get: hits throughput: 0.452 ± 0.010 M ops/s, hits latency: 2214.022 ns/op, important_hits throughput: 0.000 ± 0.000 M ops/s
local_storage cache interleaved get: hits throughput: 0.542 ± 0.007 M ops/s, hits latency: 1843.341 ns/op, important_hits throughput: 0.136 ± 0.002 M ops/s

Looking at the "sequential get" results, it's clear that as the number
of task local_storage maps grows beyond the current cache size (16),
there's a significant reduction in hits throughput. Note that the
current local_storage implementation assigns a cache_idx to maps as
they are created. Since "sequential get" is creating maps 0..n in
order and then doing bpf_task_storage_get calls in the same order, the
benchmark is effectively ensuring that a map will not be in cache when
the program tries to access it.

For "interleaved get" results, important-map hits throughput is
greatly increased as the important map is more likely to be in cache
by virtue of being accessed far more frequently. Throughput still
reduces as # maps increases, though.

To get a sense of the overhead of the benchmark program, I commented
out bpf_task_storage_get/bpf_map_lookup_elem in local_storage_bench.c
and ran the benchmark on the same host as the 'real' run. Results:

Hashmap Control
===============
num keys: 10
hashmap (control) sequential get: hits throughput: 54.288 ± 0.655 M ops/s, hits latency: 18.420 ns/op, important_hits throughput: 54.288 ± 0.655 M ops/s

num keys: 1000
hashmap (control) sequential get: hits throughput: 52.913 ± 0.519 M ops/s, hits latency: 18.899 ns/op, important_hits throughput: 52.913 ± 0.519 M ops/s

num keys: 10000
hashmap (control) sequential get: hits throughput: 53.480 ± 1.235 M ops/s, hits latency: 18.699 ns/op, important_hits throughput: 53.480 ± 1.235 M ops/s

num keys: 100000
hashmap (control) sequential get: hits throughput: 54.982 ± 1.902 M ops/s, hits latency: 18.188 ns/op, important_hits throughput: 54.982 ± 1.902 M ops/s

num keys: 4194304
hashmap (control) sequential get: hits throughput: 50.858 ± 0.707 M ops/s, hits latency: 19.662 ns/op, important_hits throughput: 50.858 ± 0.707 M ops/s

Local Storage
=============
num_maps: 1
local_storage cache sequential get: hits throughput: 110.990 ± 4.828 M ops/s, hits latency: 9.010 ns/op, important_hits throughput: 110.990 ± 4.828 M ops/s
local_storage cache interleaved get: hits throughput: 161.057 ± 4.090 M ops/s, hits latency: 6.209 ns/op, important_hits throughput: 161.057 ± 4.090 M ops/s

num_maps: 10
local_storage cache sequential get: hits throughput: 112.930 ± 1.079 M ops/s, hits latency: 8.855 ns/op, important_hits throughput: 11.293 ± 0.108 M ops/s
local_storage cache interleaved get: hits throughput: 115.841 ± 2.088 M ops/s, hits latency: 8.633 ns/op, important_hits throughput: 41.372 ± 0.746 M ops/s

num_maps: 16
local_storage cache sequential get: hits throughput: 115.653 ± 0.416 M ops/s, hits latency: 8.647 ns/op, important_hits throughput: 7.228 ± 0.026 M ops/s
local_storage cache interleaved get: hits throughput: 138.717 ± 1.649 M ops/s, hits latency: 7.209 ns/op, important_hits throughput: 44.137 ± 0.525 M ops/s

num_maps: 17
local_storage cache sequential get: hits throughput: 112.020 ± 1.649 M ops/s, hits latency: 8.927 ns/op, important_hits throughput: 6.598 ± 0.097 M ops/s
local_storage cache interleaved get: hits throughput: 128.089 ± 1.960 M ops/s, hits latency: 7.807 ns/op, important_hits throughput: 38.995 ± 0.597 M ops/s

num_maps: 24
local_storage cache sequential get: hits throughput: 92.447 ± 5.170 M ops/s, hits latency: 10.817 ns/op, important_hits throughput: 3.855 ± 0.216 M ops/s
local_storage cache interleaved get: hits throughput: 128.844 ± 2.808 M ops/s, hits latency: 7.761 ns/op, important_hits throughput: 36.245 ± 0.790 M ops/s

num_maps: 32
local_storage cache sequential get: hits throughput: 102.042 ± 1.462 M ops/s, hits latency: 9.800 ns/op, important_hits throughput: 3.194 ± 0.046 M ops/s
local_storage cache interleaved get: hits throughput: 126.577 ± 1.818 M ops/s, hits latency: 7.900 ns/op, important_hits throughput: 35.332 ± 0.507 M ops/s

num_maps: 100
local_storage cache sequential get: hits throughput: 111.327 ± 1.401 M ops/s, hits latency: 8.983 ns/op, important_hits throughput: 1.113 ± 0.014 M ops/s
local_storage cache interleaved get: hits throughput: 131.327 ± 1.339 M ops/s, hits latency: 7.615 ns/op, important_hits throughput: 34.302 ± 0.350 M ops/s

num_maps: 1000
local_storage cache sequential get: hits throughput: 101.978 ± 0.563 M ops/s, hits latency: 9.806 ns/op, important_hits throughput: 0.102 ± 0.001 M ops/s
local_storage cache interleaved get: hits throughput: 141.084 ± 1.098 M ops/s, hits latency: 7.088 ns/op, important_hits throughput: 35.430 ± 0.276 M ops/s

Adjusting for overhead, latency numbers for "hashmap control" and
"sequential get" are:

hashmap_control_1k:   ~53.8ns
hashmap_control_10k:  ~124.2ns
hashmap_control_100k: ~206.5ns
sequential_get_1:     ~12.1ns
sequential_get_10:    ~16.0ns
sequential_get_16:    ~13.8ns
sequential_get_17:    ~16.8ns
sequential_get_24:    ~40.9ns
sequential_get_32:    ~65.2ns
sequential_get_100:   ~148.2ns
sequential_get_1000:  ~2204ns

This clearly demonstrates a cliff. In the discussion for v1 of this
patch, Alexei noted that local_storage was 2.5x faster than a large
hashmap when initially implemented [1]. The benchmark results show
that local_storage is 5-10x faster: a long-running BPF application
putting some pid-specific info into a hashmap for each pid it sees
will probably see on the order of 10-100k pids. Bench numbers for
hashmaps of this size are ~10x slower than sequential_get_16, but as
the number of local_storage maps grows far past the local_storage
cache size the performance advantage shrinks and eventually reverses.

When running the benchmarks it may be necessary to bump 'open files'
ulimit for a successful run.
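For clarity, the "adjusting for overhead" step in the list above is a
plain per-op latency subtraction: the measured latency minus the
corresponding latency from the overhead-only run. For example:

  hashmap_control_1k:    72.683 ns/op - 18.899 ns/op ≈  53.8 ns
  sequential_get_16:     22.464 ns/op -  8.647 ns/op ≈  13.8 ns
  sequential_get_1000: 2214.022 ns/op -  9.806 ns/op ≈ 2204 ns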
[0]: https://lore.kernel.org/all/20220420002143.1096548-1-davemarchevsky@fb.com [1]: https://lore.kernel.org/bpf/20220511173305.ftldpn23m4ski3d3@MBP-98dd607d3435.dhcp.thefacebook.com/ Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20220620222554.270578-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 55 ++++ tools/testing/selftests/bpf/bench.h | 4 + .../selftests/bpf/benchs/bench_local_storage.c | 287 +++++++++++++++++++++ .../bpf/benchs/run_bench_local_storage.sh | 24 ++ tools/testing/selftests/bpf/benchs/run_common.sh | 17 ++ .../selftests/bpf/progs/local_storage_bench.c | 104 ++++++++ 7 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_local_storage.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh create mode 100644 tools/testing/selftests/bpf/progs/local_storage_bench.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index cb8e552e1418..4fbd88a8ed9e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -571,6 +571,7 @@ $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h +$(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -583,7 +584,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bloom_filter_map.o \ $(OUTPUT)/bench_bpf_loop.o \ $(OUTPUT)/bench_strncmp.o \ - $(OUTPUT)/bench_bpf_hashmap_full_update.o + $(OUTPUT)/bench_bpf_hashmap_full_update.o \ + $(OUTPUT)/bench_local_storage.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index d8aa62be996b..1e7b5d4b1f11 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -150,6 +150,53 @@ void ops_report_final(struct bench_res res[], int res_cnt) printf("latency %8.3lf ns/op\n", 1000.0 / hits_mean * env.producer_cnt); } +void local_storage_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double important_hits_per_sec, hits_per_sec; + double delta_sec = delta_ns / 1000000000.0; + + hits_per_sec = res->hits / 1000000.0 / delta_sec; + important_hits_per_sec = res->important_hits / 1000000.0 / delta_sec; + + printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0); + + printf("hits %8.3lfM/s ", hits_per_sec); + printf("important_hits %8.3lfM/s\n", important_hits_per_sec); +} + +void local_storage_report_final(struct bench_res res[], int res_cnt) +{ + double important_hits_mean = 0.0, important_hits_stddev = 0.0; + double hits_mean = 0.0, hits_stddev = 0.0; + int i; + + for (i = 0; i < res_cnt; i++) { + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); + important_hits_mean += res[i].important_hits / 1000000.0 / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * + (hits_mean - res[i].hits / 1000000.0) / + (res_cnt - 1.0); + important_hits_stddev += + (important_hits_mean - 
res[i].important_hits / 1000000.0) * + (important_hits_mean - res[i].important_hits / 1000000.0) / + (res_cnt - 1.0); + } + + hits_stddev = sqrt(hits_stddev); + important_hits_stddev = sqrt(important_hits_stddev); + } + printf("Summary: hits throughput %8.3lf \u00B1 %5.3lf M ops/s, ", + hits_mean, hits_stddev); + printf("hits latency %8.3lf ns/op, ", 1000.0 / hits_mean); + printf("important_hits throughput %8.3lf \u00B1 %5.3lf M ops/s\n", + important_hits_mean, important_hits_stddev); +} + const char *argp_program_version = "benchmark"; const char *argp_program_bug_address = ""; const char argp_program_doc[] = @@ -188,12 +235,14 @@ static const struct argp_option opts[] = { extern struct argp bench_ringbufs_argp; extern struct argp bench_bloom_map_argp; extern struct argp bench_bpf_loop_argp; +extern struct argp bench_local_storage_argp; extern struct argp bench_strncmp_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, { &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 }, { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 }, + { &bench_local_storage_argp, 0, "local_storage benchmark", 0 }, { &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 }, {}, }; @@ -397,6 +446,9 @@ extern const struct bench bench_bpf_loop; extern const struct bench bench_strncmp_no_helper; extern const struct bench bench_strncmp_helper; extern const struct bench bench_bpf_hashmap_full_update; +extern const struct bench bench_local_storage_cache_seq_get; +extern const struct bench bench_local_storage_cache_interleaved_get; +extern const struct bench bench_local_storage_cache_hashmap_control; static const struct bench *benchs[] = { &bench_count_global, @@ -432,6 +484,9 @@ static const struct bench *benchs[] = { &bench_strncmp_no_helper, &bench_strncmp_helper, &bench_bpf_hashmap_full_update, + &bench_local_storage_cache_seq_get, + &bench_local_storage_cache_interleaved_get, + &bench_local_storage_cache_hashmap_control, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index fb3e213df3dc..4b15286753ba 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -34,6 +34,7 @@ struct bench_res { long hits; long drops; long false_hits; + long important_hits; }; struct bench { @@ -61,6 +62,9 @@ void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns); void false_hits_report_final(struct bench_res res[], int res_cnt); void ops_report_progress(int iter, struct bench_res *res, long delta_ns); void ops_report_final(struct bench_res res[], int res_cnt); +void local_storage_report_progress(int iter, struct bench_res *res, + long delta_ns); +void local_storage_report_final(struct bench_res res[], int res_cnt); static inline __u64 get_time_ns(void) { diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage.c b/tools/testing/selftests/bpf/benchs/bench_local_storage.c new file mode 100644 index 000000000000..5a378c84e81f --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_local_storage.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ + +#include <argp.h> +#include <linux/btf.h> + +#include "local_storage_bench.skel.h" +#include "bench.h" + +#include <test_btf.h> + +static struct { + __u32 nr_maps; + __u32 hashmap_nr_keys_used; +} args = { + .nr_maps = 1000, + .hashmap_nr_keys_used = 1000, +}; + +enum { + ARG_NR_MAPS = 6000, + ARG_HASHMAP_NR_KEYS_USED = 6001, +}; + +static const struct argp_option opts[] = { + { "nr_maps", ARG_NR_MAPS, "NR_MAPS", 0, + "Set number of local_storage maps"}, + { "hashmap_nr_keys_used", ARG_HASHMAP_NR_KEYS_USED, "NR_KEYS", + 0, "When doing hashmap test, set number of hashmap keys test uses"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_NR_MAPS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "invalid nr_maps"); + argp_usage(state); + } + args.nr_maps = ret; + break; + case ARG_HASHMAP_NR_KEYS_USED: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "invalid hashmap_nr_keys_used"); + argp_usage(state); + } + args.hashmap_nr_keys_used = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_local_storage_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* Keep in sync w/ array of maps in bpf */ +#define MAX_NR_MAPS 1000 +/* keep in sync w/ same define in bpf */ +#define HASHMAP_SZ 4194304 + +static void validate(void) +{ + if (env.producer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); + exit(1); + } + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } + + if (args.nr_maps > MAX_NR_MAPS) { + fprintf(stderr, "nr_maps must be <= 1000\n"); + exit(1); + } + + if (args.hashmap_nr_keys_used > HASHMAP_SZ) { + fprintf(stderr, "hashmap_nr_keys_used must be <= %u\n", HASHMAP_SZ); + exit(1); + } +} + +static struct { + struct local_storage_bench *skel; + void *bpf_obj; + struct bpf_map *array_of_maps; +} ctx; + +static void prepopulate_hashmap(int fd) +{ + int i, key, val; + + /* local_storage gets will have BPF_LOCAL_STORAGE_GET_F_CREATE flag set, so + * populate the hashmap for a similar comparison + */ + for (i = 0; i < HASHMAP_SZ; i++) { + key = val = i; + if (bpf_map_update_elem(fd, &key, &val, 0)) { + fprintf(stderr, "Error prepopulating hashmap (key %d)\n", key); + exit(1); + } + } +} + +static void __setup(struct bpf_program *prog, bool hashmap) +{ + struct bpf_map *inner_map; + int i, fd, mim_fd, err; + + LIBBPF_OPTS(bpf_map_create_opts, create_opts); + + if (!hashmap) + create_opts.map_flags = BPF_F_NO_PREALLOC; + + ctx.skel->rodata->num_maps = args.nr_maps; + ctx.skel->rodata->hashmap_num_keys = args.hashmap_nr_keys_used; + inner_map = bpf_map__inner_map(ctx.array_of_maps); + create_opts.btf_key_type_id = bpf_map__btf_key_type_id(inner_map); + create_opts.btf_value_type_id = bpf_map__btf_value_type_id(inner_map); + + err = local_storage_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "Error loading skeleton\n"); + goto err_out; + } + + create_opts.btf_fd = bpf_object__btf_fd(ctx.skel->obj); + + mim_fd = bpf_map__fd(ctx.array_of_maps); + if (mim_fd < 0) { + fprintf(stderr, "Error getting map_in_map fd\n"); + goto err_out; + } + + for (i = 0; i < args.nr_maps; i++) { + if (hashmap) + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(int), + sizeof(int), HASHMAP_SZ, &create_opts); + else + fd = bpf_map_create(BPF_MAP_TYPE_TASK_STORAGE, NULL, sizeof(int), + sizeof(int), 0, &create_opts); + if (fd < 0) { + fprintf(stderr, "Error 
creating map %d: %d\n", i, fd); + goto err_out; + } + + if (hashmap) + prepopulate_hashmap(fd); + + err = bpf_map_update_elem(mim_fd, &i, &fd, 0); + if (err) { + fprintf(stderr, "Error updating array-of-maps w/ map %d\n", i); + goto err_out; + } + } + + if (!bpf_program__attach(prog)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + return; +err_out: + exit(1); +} + +static void hashmap_setup(void) +{ + struct local_storage_bench *skel; + + setup_libbpf(); + + skel = local_storage_bench__open(); + ctx.skel = skel; + ctx.array_of_maps = skel->maps.array_of_hash_maps; + skel->rodata->use_hashmap = 1; + skel->rodata->interleave = 0; + + __setup(skel->progs.get_local, true); +} + +static void local_storage_cache_get_setup(void) +{ + struct local_storage_bench *skel; + + setup_libbpf(); + + skel = local_storage_bench__open(); + ctx.skel = skel; + ctx.array_of_maps = skel->maps.array_of_local_storage_maps; + skel->rodata->use_hashmap = 0; + skel->rodata->interleave = 0; + + __setup(skel->progs.get_local, false); +} + +static void local_storage_cache_get_interleaved_setup(void) +{ + struct local_storage_bench *skel; + + setup_libbpf(); + + skel = local_storage_bench__open(); + ctx.skel = skel; + ctx.array_of_maps = skel->maps.array_of_local_storage_maps; + skel->rodata->use_hashmap = 0; + skel->rodata->interleave = 1; + + __setup(skel->progs.get_local, false); +} + +static void measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); + res->important_hits = atomic_swap(&ctx.skel->bss->important_hits, 0); +} + +static inline void trigger_bpf_program(void) +{ + syscall(__NR_getpgid); +} + +static void *consumer(void *input) +{ + return NULL; +} + +static void *producer(void *input) +{ + while (true) + trigger_bpf_program(); + + return NULL; +} + +/* cache sequential and interleaved get benchs test local_storage get + * performance, specifically they demonstrate performance cliff of + * current list-plus-cache local_storage model. + * + * cache sequential get: call bpf_task_storage_get on n maps in order + * cache interleaved get: like "sequential get", but interleave 4 calls to the + * 'important' map (idx 0 in array_of_maps) for every 10 calls. 
Goal + * is to mimic environment where many progs are accessing their local_storage + * maps, with 'our' prog needing to access its map more often than others + */ +const struct bench bench_local_storage_cache_seq_get = { + .name = "local-storage-cache-seq-get", + .validate = validate, + .setup = local_storage_cache_get_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = local_storage_report_progress, + .report_final = local_storage_report_final, +}; + +const struct bench bench_local_storage_cache_interleaved_get = { + .name = "local-storage-cache-int-get", + .validate = validate, + .setup = local_storage_cache_get_interleaved_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = local_storage_report_progress, + .report_final = local_storage_report_final, +}; + +const struct bench bench_local_storage_cache_hashmap_control = { + .name = "local-storage-cache-hashmap-control", + .validate = validate, + .setup = hashmap_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = local_storage_report_progress, + .report_final = local_storage_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh b/tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh new file mode 100755 index 000000000000..2eb2b513a173 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./benchs/run_common.sh + +set -eufo pipefail + +header "Hashmap Control" +for i in 10 1000 10000 100000 4194304; do +subtitle "num keys: $i" + summarize_local_storage "hashmap (control) sequential get: "\ + "$(./bench --nr_maps 1 --hashmap_nr_keys_used=$i local-storage-cache-hashmap-control)" + printf "\n" +done + +header "Local Storage" +for i in 1 10 16 17 24 32 100 1000; do +subtitle "num_maps: $i" + summarize_local_storage "local_storage cache sequential get: "\ + "$(./bench --nr_maps $i local-storage-cache-seq-get)" + summarize_local_storage "local_storage cache interleaved get: "\ + "$(./bench --nr_maps $i local-storage-cache-int-get)" + printf "\n" +done diff --git a/tools/testing/selftests/bpf/benchs/run_common.sh b/tools/testing/selftests/bpf/benchs/run_common.sh index 6c5e6023a69f..d9f40af82006 100644 --- a/tools/testing/selftests/bpf/benchs/run_common.sh +++ b/tools/testing/selftests/bpf/benchs/run_common.sh @@ -41,6 +41,16 @@ function ops() echo "$*" | sed -E "s/.*latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/" } +function local_storage() +{ + echo -n "hits throughput: " + echo -n "$*" | sed -E "s/.* hits throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/" + echo -n -e ", hits latency: " + echo -n "$*" | sed -E "s/.* hits latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/" + echo -n ", important_hits throughput: " + echo "$*" | sed -E "s/.*important_hits throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/" +} + function total() { echo "$*" | sed -E "s/.*total operations\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" @@ -67,6 +77,13 @@ function summarize_ops() printf "%-20s %s\n" "$bench" "$(ops $summary)" } +function summarize_local_storage() +{ + bench="$1" + summary=$(echo $2 | tail -n1) + printf "%-20s %s\n" "$bench" "$(local_storage $summary)" +} + function summarize_total() { bench="$1" diff --git a/tools/testing/selftests/bpf/progs/local_storage_bench.c 
b/tools/testing/selftests/bpf/progs/local_storage_bench.c new file mode 100644 index 000000000000..2c3234c5b73a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/local_storage_bench.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +#define HASHMAP_SZ 4194304 + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 1000); + __type(key, int); + __type(value, int); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); + }); +} array_of_local_storage_maps SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 1000); + __type(key, int); + __type(value, int); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, HASHMAP_SZ); + __type(key, int); + __type(value, int); + }); +} array_of_hash_maps SEC(".maps"); + +long important_hits; +long hits; + +/* set from user-space */ +const volatile unsigned int use_hashmap; +const volatile unsigned int hashmap_num_keys; +const volatile unsigned int num_maps; +const volatile unsigned int interleave; + +struct loop_ctx { + struct task_struct *task; + long loop_hits; + long loop_important_hits; +}; + +static int do_lookup(unsigned int elem, struct loop_ctx *lctx) +{ + void *map, *inner_map; + int idx = 0; + + if (use_hashmap) + map = &array_of_hash_maps; + else + map = &array_of_local_storage_maps; + + inner_map = bpf_map_lookup_elem(map, &elem); + if (!inner_map) + return -1; + + if (use_hashmap) { + idx = bpf_get_prandom_u32() % hashmap_num_keys; + bpf_map_lookup_elem(inner_map, &idx); + } else { + bpf_task_storage_get(inner_map, lctx->task, &idx, + BPF_LOCAL_STORAGE_GET_F_CREATE); + } + + lctx->loop_hits++; + if (!elem) + lctx->loop_important_hits++; + return 0; +} + +static long loop(u32 index, void *ctx) +{ + struct loop_ctx *lctx = (struct loop_ctx *)ctx; + unsigned int map_idx = index % num_maps; + + do_lookup(map_idx, lctx); + if (interleave && map_idx % 3 == 0) + do_lookup(0, lctx); + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int get_local(void *ctx) +{ + struct loop_ctx lctx; + + lctx.task = bpf_get_current_task_btf(); + lctx.loop_hits = 0; + lctx.loop_important_hits = 0; + bpf_loop(10000, &loop, &lctx, 0); + __sync_add_and_fetch(&hits, lctx.loop_hits); + __sync_add_and_fetch(&important_hits, lctx.loop_important_hits); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 924a221581db557f6c61b93fe5f25893a74bc3ea Mon Sep 17 00:00:00 2001 From: "Shawn M. Chapla" Date: Thu, 26 May 2022 16:14:47 -0400 Subject: perf data convert: Prefer sampled CPU when exporting JSON When CPU has been explicitly sampled (via --sample-cpu), prefer this sampled value over the thread CPU value when exporting to JSON. Signed-off-by: Shawn M. 
Chapla Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220526201506.2028281-1-schapla@codeweavers.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-json.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index f1ab6edba446..613d6ae82663 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -149,6 +149,7 @@ static int process_sample_event(struct perf_tool *tool, struct convert_json *c = container_of(tool, struct convert_json, tool); FILE *out = c->out; struct addr_location al, tal; + u64 sample_type = __evlist__combined_sample_type(evsel->evlist); u8 cpumode = PERF_RECORD_MISC_USER; if (machine__resolve(machine, &al, sample) < 0) { @@ -168,7 +169,9 @@ static int process_sample_event(struct perf_tool *tool, output_json_key_format(out, true, 3, "pid", "%i", al.thread->pid_); output_json_key_format(out, true, 3, "tid", "%i", al.thread->tid); - if (al.thread->cpu >= 0) + if ((sample_type & PERF_SAMPLE_CPU)) + output_json_key_format(out, true, 3, "cpu", "%i", sample->cpu); + else if (al.thread->cpu >= 0) output_json_key_format(out, true, 3, "cpu", "%i", al.thread->cpu); output_json_key_string(out, true, 3, "comm", thread__comm_str(al.thread)); -- cgit v1.2.3 From f42c0ce573df79d1b8bd169008c994dcdd43585a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 10 Jun 2022 14:33:12 +0300 Subject: perf record: Always get text_poke events with --kcore option kcore provides a copy of the running kernel including any modified code. A trace that benefits from that also benefits from text_poke events, so enable them. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220610113316.6682-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 9a71f0330137..3959a1b86afb 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -3805,6 +3805,9 @@ int cmd_record(int argc, const char **argv) goto out_opts; } + if (rec->opts.kcore) + rec->opts.text_poke = true; + if (rec->opts.kcore || record__threads_enabled(rec)) rec->data.is_dir = true; -- cgit v1.2.3 From 6b080312fc821658a479e74bdb0c3f7d9ac5838f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 10 Jun 2022 14:33:13 +0300 Subject: perf record: Always record id index In preparation for recording sideband events in a virtual machine guest so that they can be injected into a host perf.data file. Adjust the logic so that if there are IDs then the id index is recorded. 
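For reference, each entry in the synthesized id index maps one sample id back to its ring buffer, CPU and thread; the layout below follows struct id_index_entry in tools/lib/perf/include/perf/event.h (shown as a sketch for orientation, not part of this patch):

	struct id_index_entry {
		__u64 id;	/* the id emitted in each event's id sample */
		__u64 idx;	/* index of the originating mmap/ring buffer */
		__u64 cpu;	/* cpu the event was opened on */
		__u64 tid;	/* thread the event was opened for */
	};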
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220610113316.6682-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 12 +++++------- tools/perf/util/synthetic-events.c | 7 +++++-- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 3959a1b86afb..00c2a6cdf1be 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1834,13 +1834,11 @@ static int record__synthesize(struct record *rec, bool tail) goto out; /* Synthesize id_index before auxtrace_info */ - if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) { - err = perf_event__synthesize_id_index(tool, - process_synthesized_event, - session->evlist, machine); - if (err) - goto out; - } + err = perf_event__synthesize_id_index(tool, + process_synthesized_event, + session->evlist, machine); + if (err) + goto out; if (rec->opts.full_auxtrace) { err = perf_event__synthesize_auxtrace_info(rec->itr, tool, diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 27acdc5e5723..d75074486a55 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1719,14 +1719,17 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_ size_t nr = 0, i = 0, sz, max_nr, n; int err; - pr_debug2("Synthesizing id index\n"); - max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) / sizeof(struct id_index_entry); evlist__for_each_entry(evlist, evsel) nr += evsel->core.ids; + if (!nr) + return 0; + + pr_debug2("Synthesizing id index\n"); + n = nr > max_nr ? max_nr : nr; sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry); ev = zalloc(sz); -- cgit v1.2.3 From 61110883a02090cb5fd1f890978e238cc99f0164 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 15 Jun 2022 08:25:11 +0300 Subject: perf record: Add new option to sample identifier In preparation for recording sideband events in a virtual machine guest so that they can be injected into a host perf.data file. Add an option to always include sample type PERF_SAMPLE_IDENTIFIER. 
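The point of forcing PERF_SAMPLE_IDENTIFIER is that the id then sits at a fixed position regardless of the rest of the sample_type, so records can be demultiplexed before their owning evsel is known. A rough sketch of the layout rule (illustrative only, not perf source):

	#include <linux/perf_event.h>

	/* PERF_RECORD_SAMPLE: the identifier is the first u64 after the header */
	static __u64 sample_id(const struct perf_event_header *hdr)
	{
		return *(const __u64 *)(hdr + 1);
	}

	/* non-sample records with sample_id_all: it is the last u64 of the record */
	static __u64 trailer_id(const struct perf_event_header *hdr)
	{
		return *(const __u64 *)((const char *)hdr + hdr->size - sizeof(__u64));
	}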
Committer testing: # perf record sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.020 MB perf.data (7 samples) ] # perf evlist -v cycles: size: 128, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 # # # perf record --sample-identifier sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.022 MB perf.data (7 samples) ] # perf evlist -v cycles: size: 128, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 # Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220615052511.4441-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 5 +++++ tools/perf/builtin-record.c | 2 ++ tools/perf/util/record.c | 2 +- tools/perf/util/record.h | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index cf8ad50f3de1..6bd6d07021ba 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -313,6 +313,11 @@ OPTIONS --sample-cpu:: Record the sample cpu. +--sample-identifier:: + Record the sample identifier i.e. PERF_SAMPLE_IDENTIFIER bit set in + the sample_type member of the struct perf_event_attr argument to the + perf_event_open system call. + -n:: --no-samples:: Don't sample. 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 00c2a6cdf1be..40dca1fba4e3 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -3191,6 +3191,8 @@ static struct option __record_options[] = { OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size, "Record the sampled code address (ip) page size"), OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), + OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier, + "Record the sample identifier"), OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, &record.opts.sample_time_set, "Record the sample timestamps"), diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 5b09ecbb05dc..b529636ab3ea 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -121,7 +121,7 @@ void evlist__config(struct evlist *evlist, struct record_opts *opts, struct call evlist__for_each_entry(evlist, evsel) evsel__config_leader_sampling(evsel, evlist); - if (opts->full_auxtrace) { + if (opts->full_auxtrace || opts->sample_identifier) { /* * Need to be able to synthesize and parse selected events with * arbitrary sample types, which requires always being able to diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h index be9a957501f4..4269e916f450 100644 --- a/tools/perf/util/record.h +++ b/tools/perf/util/record.h @@ -28,6 +28,7 @@ struct record_opts { bool sample_time; bool sample_time_set; bool sample_cpu; + bool sample_identifier; bool period; bool period_set; bool running_time; -- cgit v1.2.3 From 3812d2987733c5a00e103be4e23d63ec9342043a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 10 Jun 2022 14:33:15 +0300 Subject: perf record: Add finished init event In preparation for recording sideband events in a virtual machine guest so that they can be injected into a host perf.data file. This is needed to enable injecting events after the initial synthesized user events (that have an all zero id sample) but before regular events. Committer notes: Add entry about PERF_RECORD_FINISHED_INIT to tools/perf/Documentation/perf.data-file-format.txt. Committer testing: Before: # perf report -D | grep FINISHED 0 0x5910 [0x8]: PERF_RECORD_FINISHED_ROUND FINISHED_ROUND events: 1 ( 0.5%) # After: # perf record -- sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.020 MB perf.data (7 samples) ] # perf report -D | grep FINISHED 0 0x5068 [0x8]: PERF_RECORD_FINISHED_INIT: unhandled! 
0 0x5390 [0x8]: PERF_RECORD_FINISHED_ROUND FINISHED_ROUND events: 1 ( 0.5%) FINISHED_INIT events: 1 ( 0.5%) # Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220610113316.6682-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/event.h | 1 + tools/perf/Documentation/perf.data-file-format.txt | 10 ++++++++ tools/perf/builtin-inject.c | 1 + tools/perf/builtin-record.c | 27 ++++++++++++++++++++++ tools/perf/util/event.c | 1 + tools/perf/util/session.c | 4 ++++ tools/perf/util/tool.h | 3 ++- 7 files changed, 46 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index e7758707cadd..9f7ca070da87 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -389,6 +389,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_TIME_CONV = 79, PERF_RECORD_HEADER_FEATURE = 80, PERF_RECORD_COMPRESSED = 81, + PERF_RECORD_FINISHED_INIT = 82, PERF_RECORD_HEADER_MAX }; diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index f56d0e0fbff6..bc9f0aa113d8 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -607,6 +607,16 @@ struct compressed_event { char data[]; }; + PERF_RECORD_FINISHED_INIT = 82, + +Marks the end of records for the system, pre-existing threads in system wide +sessions, etc. Those are the ones prefixed PERF_RECORD_USER_*. + +This is used, for instance, to 'perf inject' events after init and before +regular events, those emitted by the kernel, to support combining guest and +host records. + + The header is followed by compressed data frame that can be decompressed into array of perf trace records. The size of the entire compressed event record including the header is limited by the max value of header.size. 
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a75bf11585b5..42e2918fd1cc 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -1059,6 +1059,7 @@ int cmd_inject(int argc, const char **argv) .stat = perf_event__repipe_op2_synth, .stat_round = perf_event__repipe_op2_synth, .feature = perf_event__repipe_op2_synth, + .finished_init = perf_event__repipe_op2_synth, .compressed = perf_event__repipe_op4_synth, .auxtrace = perf_event__repipe_auxtrace, }, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 40dca1fba4e3..cf5c5379ceaa 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1388,6 +1388,11 @@ static struct perf_event_header finished_round_event = { .type = PERF_RECORD_FINISHED_ROUND, }; +static struct perf_event_header finished_init_event = { + .size = sizeof(struct perf_event_header), + .type = PERF_RECORD_FINISHED_INIT, +}; + static void record__adjust_affinity(struct record *rec, struct mmap *map) { if (rec->opts.affinity != PERF_AFFINITY_SYS && @@ -1696,6 +1701,14 @@ static int record__synthesize_workload(struct record *rec, bool tail) return err; } +static int write_finished_init(struct record *rec, bool tail) +{ + if (rec->opts.tail_synthesize != tail) + return 0; + + return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event)); +} + static int record__synthesize(struct record *rec, bool tail); static int @@ -1710,6 +1723,8 @@ record__switch_output(struct record *rec, bool at_exit) record__aio_mmap_read_sync(rec); + write_finished_init(rec, true); + record__synthesize(rec, true); if (target__none(&rec->opts.target)) record__synthesize_workload(rec, true); @@ -1764,6 +1779,7 @@ record__switch_output(struct record *rec, bool at_exit) */ if (target__none(&rec->opts.target)) record__synthesize_workload(rec, false); + write_finished_init(rec, false); } return fd; } @@ -2419,6 +2435,15 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) trigger_ready(&auxtrace_snapshot_trigger); trigger_ready(&switch_output_trigger); perf_hooks__invoke_record_start(); + + /* + * Must write FINISHED_INIT so it will be seen after all other + * synthesized user events, but before any regular events. 
+ */ + err = write_finished_init(rec, false); + if (err < 0) + goto out_child; + for (;;) { unsigned long long hits = thread->samples; @@ -2563,6 +2588,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", record__waking(rec)); + write_finished_init(rec, true); + if (target__none(&rec->opts.target)) record__synthesize_workload(rec, true); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 0476bb3a4188..1fa14598b916 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -76,6 +76,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_TIME_CONV] = "TIME_CONV", [PERF_RECORD_HEADER_FEATURE] = "FEATURE", [PERF_RECORD_COMPRESSED] = "COMPRESSED", + [PERF_RECORD_FINISHED_INIT] = "FINISHED_INIT", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0aa818977d2b..37f833c3c81b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -562,6 +562,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->feature = process_event_op2_stub; if (tool->compressed == NULL) tool->compressed = perf_session__process_compressed_event; + if (tool->finished_init == NULL) + tool->finished_init = process_event_op2_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -1706,6 +1708,8 @@ static s64 perf_session__process_user_event(struct perf_session *session, if (err) dump_event(session->evlist, event, file_offset, &sample, file_path); return err; + case PERF_RECORD_FINISHED_INIT: + return tool->finished_init(session, event); default: return -EINVAL; } diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index f2352dba1875..c957fb849ac6 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -76,7 +76,8 @@ struct perf_tool { stat_config, stat, stat_round, - feature; + feature, + finished_init; event_op4 compressed; event_op3 auxtrace; bool ordered_events; -- cgit v1.2.3 From 52f28b7bac75da9b8508f17438c9a8d83ab48e5d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 10 Jun 2022 14:33:16 +0300 Subject: perf script: Add some missing event dumps When the -D option is used, the details of thread-map, cpu-map and event-update events are not currently dumped. Add prints so that they are. Example: # perf record --kcore sleep 0.1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.021 MB perf.data (7 samples) ] # perf script -D | grep 'THREAD\|CPU' 0 0x4950 [0x28]: PERF_RECORD_THREAD_MAP nr: 1 thread: 35116 0 0x4978 [0x20]: PERF_RECORD_CPU_MAP: 0-7 # perf script -D | grep -A4 'UPDATE' 0 0x4920 [0x30]: PERF_RECORD_EVENT_UPDATE ... id: 147 ... 
0-7 Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220610113316.6682-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 6 ++++++ tools/perf/util/header.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index c689054002cc..7cf21ab16f4f 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3633,6 +3633,9 @@ int process_thread_map_event(struct perf_session *session, struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); + if (dump_trace) + perf_event__fprintf_thread_map(event, stdout); + if (script->threads) { pr_warning("Extra thread map event, ignoring.\n"); return 0; @@ -3652,6 +3655,9 @@ int process_cpu_map_event(struct perf_session *session, struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); + if (dump_trace) + perf_event__fprintf_cpu_map(event, stdout); + if (script->cpus) { pr_warning("Extra cpu map event, ignoring.\n"); return 0; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 53332da100e8..de5b7a023e9e 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4349,6 +4349,9 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, struct evsel *evsel; struct perf_cpu_map *map; + if (dump_trace) + perf_event__fprintf_event_update(event, stdout); + if (!pevlist || *pevlist == NULL) return -EINVAL; -- cgit v1.2.3 From 6e945d57cc9f6e27893d57a419434a2859ba6f3f Mon Sep 17 00:00:00 2001 From: Jörn-Thorben Hinz Date: Wed, 22 Jun 2022 21:12:25 +0200 Subject: selftests/bpf: Test a BPF CC writing sk_pacing_* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF is allowed to write sk_pacing_rate and sk_pacing_status in struct sock. This is needed when cong_control() is implemented and used. 
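Once the struct_ops map is attached (as the test below does), a socket opts in to the CC by the name given in .name; a minimal user-space sketch under that assumption (error handling elided):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <string.h>
	#include <sys/socket.h>

	static int use_bpf_cc(int fd)
	{
		const char cc[] = "bpf_w_sk_pacing";	/* matches .name in the prog below */

		/* selects the registered BPF congestion control for this socket */
		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc));
	}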
Signed-off-by: Jörn-Thorben Hinz Link: https://lore.kernel.org/r/20220622191227.898118-4-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 19 +++++++ .../selftests/bpf/progs/tcp_ca_write_sk_pacing.c | 60 ++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index e9a9a31b2ffe..e79f3f5a9d33 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -9,6 +9,7 @@ #include "bpf_cubic.skel.h" #include "bpf_tcp_nogpl.skel.h" #include "bpf_dctcp_release.skel.h" +#include "tcp_ca_write_sk_pacing.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -322,6 +323,22 @@ static void test_rel_setsockopt(void) bpf_dctcp_release__destroy(rel_skel); } +static void test_write_sk_pacing(void) +{ + struct tcp_ca_write_sk_pacing *skel; + struct bpf_link *link; + + skel = tcp_ca_write_sk_pacing__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + link = bpf_map__attach_struct_ops(skel->maps.write_sk_pacing); + ASSERT_OK_PTR(link, "attach_struct_ops"); + + bpf_link__destroy(link); + tcp_ca_write_sk_pacing__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -334,4 +351,6 @@ void test_bpf_tcp_ca(void) test_dctcp_fallback(); if (test__start_subtest("rel_setsockopt")) test_rel_setsockopt(); + if (test__start_subtest("write_sk_pacing")) + test_write_sk_pacing(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c new file mode 100644 index 000000000000..43447704cf0e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +#define USEC_PER_SEC 1000000UL + +#define min(a, b) ((a) < (b) ? 
(a) : (b)) + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +SEC("struct_ops/write_sk_pacing_init") +void BPF_PROG(write_sk_pacing_init, struct sock *sk) +{ +#ifdef ENABLE_ATOMICS_TESTS + __sync_bool_compare_and_swap(&sk->sk_pacing_status, SK_PACING_NONE, + SK_PACING_NEEDED); +#else + sk->sk_pacing_status = SK_PACING_NEEDED; +#endif +} + +SEC("struct_ops/write_sk_pacing_cong_control") +void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, + const struct rate_sample *rs) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned long rate = + ((tp->snd_cwnd * tp->mss_cache * USEC_PER_SEC) << 3) / + (tp->srtt_us ?: 1U << 3); + sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); +} + +SEC("struct_ops/write_sk_pacing_ssthresh") +__u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +SEC("struct_ops/write_sk_pacing_undo_cwnd") +__u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +SEC(".struct_ops") +struct tcp_congestion_ops write_sk_pacing = { + .init = (void *)write_sk_pacing_init, + .cong_control = (void *)write_sk_pacing_cong_control, + .ssthresh = (void *)write_sk_pacing_ssthresh, + .undo_cwnd = (void *)write_sk_pacing_undo_cwnd, + .name = "bpf_w_sk_pacing", +}; -- cgit v1.2.3 From 0735627d78caa56f219dc14608ce0bdbd045e07e Mon Sep 17 00:00:00 2001 From: Jörn-Thorben Hinz Date: Wed, 22 Jun 2022 21:12:26 +0200 Subject: selftests/bpf: Test an incomplete BPF CC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF providing neither cong_avoid() nor cong_control() is correctly rejected. This check solely depends on tcp_register_congestion_control() now, which is invoked during bpf_map__attach_struct_ops(). 
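The failure the test expects comes from tcp_register_congestion_control(), which demands a complete set of mandatory callbacks; roughly (paraphrased from memory of net/ipv4/tcp_cong.c, not a verbatim quote):

	/* paraphrase: a CC must provide ssthresh, undo_cwnd, and one of
	 * cong_avoid() or cong_control()
	 */
	if (!ca->ssthresh || !ca->undo_cwnd ||
	    !(ca->cong_avoid || ca->cong_control))
		return -EINVAL;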
Signed-off-by: Jörn-Thorben Hinz Link: https://lore.kernel.org/r/20220622191227.898118-5-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 22 ++++++++++++++ .../selftests/bpf/progs/tcp_ca_incompl_cong_ops.c | 35 ++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index e79f3f5a9d33..194d07310531 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -10,6 +10,7 @@ #include "bpf_tcp_nogpl.skel.h" #include "bpf_dctcp_release.skel.h" #include "tcp_ca_write_sk_pacing.skel.h" +#include "tcp_ca_incompl_cong_ops.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -339,6 +340,25 @@ static void test_write_sk_pacing(void) tcp_ca_write_sk_pacing__destroy(skel); } +static void test_incompl_cong_ops(void) +{ + struct tcp_ca_incompl_cong_ops *skel; + struct bpf_link *link; + + skel = tcp_ca_incompl_cong_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + /* That cong_avoid() and cong_control() are missing is only reported at + * this point: + */ + link = bpf_map__attach_struct_ops(skel->maps.incompl_cong_ops); + ASSERT_ERR_PTR(link, "attach_struct_ops"); + + bpf_link__destroy(link); + tcp_ca_incompl_cong_ops__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -353,4 +373,6 @@ void test_bpf_tcp_ca(void) test_rel_setsockopt(); if (test__start_subtest("write_sk_pacing")) test_write_sk_pacing(); + if (test__start_subtest("incompl_cong_ops")) + test_incompl_cong_ops(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c new file mode 100644 index 000000000000..7bb872fb22dd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +SEC("struct_ops/incompl_cong_ops_ssthresh") +__u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +SEC("struct_ops/incompl_cong_ops_undo_cwnd") +__u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +SEC(".struct_ops") +struct tcp_congestion_ops incompl_cong_ops = { + /* Intentionally leaving out any of the required cong_avoid() and + * cong_control() here. + */ + .ssthresh = (void *)incompl_cong_ops_ssthresh, + .undo_cwnd = (void *)incompl_cong_ops_undo_cwnd, + .name = "bpf_incompl_ops", +}; -- cgit v1.2.3 From f14a3f644a1c5a2e2dbe6073f51793119a12e6ce Mon Sep 17 00:00:00 2001 From: Jörn-Thorben Hinz Date: Wed, 22 Jun 2022 21:12:27 +0200 Subject: selftests/bpf: Test a BPF CC implementing the unsupported get_info() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF providing get_info() is rejected correctly. get_info() is unsupported in a BPF CC. The check for required functions in a BPF CC has moved; this test ensures unsupported functions are still rejected correctly. 
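To catch the rejection message, the test intercepts libbpf's logging via libbpf_set_print(), the same pattern bpf_tcp_ca.c already uses for its other negative tests; schematically (a sketch of the idea, not the exact test code):

	#include <stdarg.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>
	#include <bpf/libbpf.h>

	static const char *err_str;
	static bool found;

	static int libbpf_debug_print(enum libbpf_print_level level,
				      const char *format, va_list args)
	{
		char log[1024];

		/* record whether the expected error shows up in libbpf's log */
		vsnprintf(log, sizeof(log), format, args);
		if (err_str && strstr(log, err_str))
			found = true;
		return 0;
	}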
Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-6-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 20 ++++++++++++++++++++ .../selftests/bpf/progs/tcp_ca_unsupp_cong_op.c | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 194d07310531..2959a52ced06 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -11,6 +11,7 @@ #include "bpf_dctcp_release.skel.h" #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" +#include "tcp_ca_unsupp_cong_op.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -359,6 +360,23 @@ static void test_incompl_cong_ops(void) tcp_ca_incompl_cong_ops__destroy(skel); } +static void test_unsupp_cong_op(void) +{ + libbpf_print_fn_t old_print_fn; + struct tcp_ca_unsupp_cong_op *skel; + + err_str = "attach to unsupported member get_info"; + found = false; + old_print_fn = libbpf_set_print(libbpf_debug_print); + + skel = tcp_ca_unsupp_cong_op__open_and_load(); + ASSERT_NULL(skel, "open_and_load"); + ASSERT_EQ(found, true, "expected_err_msg"); + + tcp_ca_unsupp_cong_op__destroy(skel); + libbpf_set_print(old_print_fn); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -375,4 +393,6 @@ void test_bpf_tcp_ca(void) test_rel_setsockopt(); if (test__start_subtest("write_sk_pacing")) test_write_sk_pacing(); if (test__start_subtest("incompl_cong_ops")) test_incompl_cong_ops(); + if (test__start_subtest("unsupp_cong_op")) + test_unsupp_cong_op(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c new file mode 100644 index 000000000000..c06f4a41c21a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/unsupp_cong_op_get_info") +size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + return 0; +} + +SEC(".struct_ops") +struct tcp_congestion_ops unsupp_cong_op = { + .get_info = (void *)unsupp_cong_op_get_info, + .name = "bpf_unsupp_op", +}; -- cgit v1.2.3 From 6dc7a0baf1a70b7d22662d38481824c14ddd80c5 Mon Sep 17 00:00:00 2001 From: Jörn-Thorben Hinz Date: Tue, 21 Jun 2022 09:01:16 +0200 Subject: selftests/bpf: Fix rare segfault in sock_fields prog test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_sock_fields__detach() got called with a null pointer here when one of the CHECKs or ASSERTs up to the test_sock_fields__open_and_load() call resulted in a jump to the "done" label. A skeleton's *__detach() is not safe to call with a null pointer, though. This led to a segfault. Go the easy route and only call test_sock_fields__destroy() which is null-pointer safe and includes detaching. Came across this while looking[1] to introduce the usage of bpf_tcp_helpers.h (included in progs/test_sock_fields.c) together with vmlinux.h. 
[1] https://lore.kernel.org/bpf/629bc069dd807d7ac646f836e9dca28bbc1108e2.camel@mailbox.tu-berlin.de/ Fixes: 8f50f16ff39d ("selftests/bpf: Extend verifier and bpf_sock tests for dst_port loads") Signed-off-by: Jörn-Thorben Hinz Signed-off-by: Andrii Nakryiko Reviewed-by: Jakub Sitnicki Reviewed-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220621070116.307221-1-jthinz@mailbox.tu-berlin.de --- tools/testing/selftests/bpf/prog_tests/sock_fields.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_fields.c b/tools/testing/selftests/bpf/prog_tests/sock_fields.c index 9d211b5c22c4..7d23166c77af 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_fields.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_fields.c @@ -394,7 +394,6 @@ void serial_test_sock_fields(void) test(); done: - test_sock_fields__detach(skel); test_sock_fields__destroy(skel); if (child_cg_fd >= 0) close(child_cg_fd); -- cgit v1.2.3 From c07d2475f9cd7eaf6ddc60b5d284b375699d9553 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 8 Jun 2022 09:54:47 +0200 Subject: selftests/kexec: remove broken EFI_VARS secure boot fallback check Commit b433a52aa28733e0 ("selftests/kexec: update get_secureboot_mode") refactored the code that discovers the EFI secure boot mode so it only depends on either the efivars pseudo filesystem or the efivars sysfs interface, but never both. However, the latter version was not implemented correctly, given the fact that the local 'efi_vars' variable never assumes a value. This means the fallback has been dead code ever since it was introduced. So let's drop the fallback altogether. The sysfs interface has been deprecated for ~10 years now, and is only enabled on x86 to begin with, so it is time to get rid of it entirely. Reviewed-by: Mimi Zohar Signed-off-by: Ard Biesheuvel --- tools/testing/selftests/kexec/kexec_common_lib.sh | 36 ++--------------------- 1 file changed, 2 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kexec/kexec_common_lib.sh b/tools/testing/selftests/kexec/kexec_common_lib.sh index 0e114b34d5d7..641ef05863b2 100755 --- a/tools/testing/selftests/kexec/kexec_common_lib.sh +++ b/tools/testing/selftests/kexec/kexec_common_lib.sh @@ -65,32 +65,6 @@ get_efivarfs_secureboot_mode() return 0; } -get_efi_var_secureboot_mode() -{ - local efi_vars - local secure_boot_file - local setup_mode_file - local secureboot_mode - local setup_mode - - if [ ! -d "$efi_vars" ]; then - log_skip "efi_vars is not enabled\n" - fi - secure_boot_file=$(find "$efi_vars" -name SecureBoot-* 2>/dev/null) - setup_mode_file=$(find "$efi_vars" -name SetupMode-* 2>/dev/null) - if [ -f "$secure_boot_file/data" ] && \ - [ -f "$setup_mode_file/data" ]; then - secureboot_mode=`od -An -t u1 "$secure_boot_file/data"` - setup_mode=`od -An -t u1 "$setup_mode_file/data"` - - if [ $secureboot_mode -eq 1 ] && [ $setup_mode -eq 0 ]; then - log_info "secure boot mode enabled (CONFIG_EFI_VARS)" - return 1; - fi - fi - return 0; -} - # On powerpc platform, check device-tree property # /proc/device-tree/ibm,secureboot/os-secureboot-enforcing # to detect secureboot state. @@ -113,9 +87,8 @@ get_arch() } # Check efivar SecureBoot-$(the UUID) and SetupMode-$(the UUID). -# The secure boot mode can be accessed either as the last integer -# of "od -An -t u1 /sys/firmware/efi/efivars/SecureBoot-*" or from -# "od -An -t u1 /sys/firmware/efi/vars/SecureBoot-*/data". 
The efi +# The secure boot mode can be accessed as the last integer of +# "od -An -t u1 /sys/firmware/efi/efivars/SecureBoot-*". The efi # SetupMode can be similarly accessed. # Return 1 for SecureBoot mode enabled and SetupMode mode disabled. get_secureboot_mode() @@ -129,11 +102,6 @@ get_secureboot_mode() else get_efivarfs_secureboot_mode secureboot_mode=$? - # fallback to using the efi_var files - if [ $secureboot_mode -eq 0 ]; then - get_efi_var_secureboot_mode - secureboot_mode=$? - fi fi if [ $secureboot_mode -eq 0 ]; then -- cgit v1.2.3 From fcd48a213f0ac45c2187b09e19d4849e14cb59f8 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:14 +0000 Subject: KVM: selftests: Remove dynamic memory allocation for stats header There's no need to allocate dynamic memory for the stats header since its size is known at compile time. Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-2-bgardon@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/kvm_binary_stats_test.c | 58 ++++++++++------------ 1 file changed, 27 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 3c2d06b61442..ac5c3d0181e3 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -26,56 +26,53 @@ static void stats_test(int stats_fd) int i; size_t size_desc; size_t size_data = 0; - struct kvm_stats_header *header; + struct kvm_stats_header header; char *id; struct kvm_stats_desc *stats_desc; u64 *stats_data; struct kvm_stats_desc *pdesc; /* Read kvm stats header */ - header = malloc(sizeof(*header)); - TEST_ASSERT(header, "Allocate memory for stats header"); - - ret = read(stats_fd, header, sizeof(*header)); - TEST_ASSERT(ret == sizeof(*header), "Read stats header"); - size_desc = sizeof(*stats_desc) + header->name_size; + ret = read(stats_fd, &header, sizeof(header)); + TEST_ASSERT(ret == sizeof(header), "Read stats header"); + size_desc = sizeof(*stats_desc) + header.name_size; /* Read kvm stats id string */ - id = malloc(header->name_size); + id = malloc(header.name_size); TEST_ASSERT(id, "Allocate memory for id string"); - ret = read(stats_fd, id, header->name_size); - TEST_ASSERT(ret == header->name_size, "Read id string"); + ret = read(stats_fd, id, header.name_size); + TEST_ASSERT(ret == header.name_size, "Read id string"); /* Check id string, that should start with "kvm" */ - TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header->name_size, + TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header.name_size, "Invalid KVM stats type, id: %s", id); /* Sanity check for other fields in header */ - if (header->num_desc == 0) { + if (header.num_desc == 0) { printf("No KVM stats defined!"); return; } /* Check overlap */ - TEST_ASSERT(header->desc_offset > 0 && header->data_offset > 0 - && header->desc_offset >= sizeof(*header) - && header->data_offset >= sizeof(*header), + TEST_ASSERT(header.desc_offset > 0 && header.data_offset > 0 + && header.desc_offset >= sizeof(header) + && header.data_offset >= sizeof(header), "Invalid offset fields in header"); - TEST_ASSERT(header->desc_offset > header->data_offset || - (header->desc_offset + size_desc * header->num_desc <= - header->data_offset), + TEST_ASSERT(header.desc_offset > header.data_offset || + (header.desc_offset + size_desc * header.num_desc <= + header.data_offset), "Descriptor block 
is overlapped with data block"); /* Allocate memory for stats descriptors */ - stats_desc = calloc(header->num_desc, size_desc); + stats_desc = calloc(header.num_desc, size_desc); TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); /* Read kvm stats descriptors */ ret = pread(stats_fd, stats_desc, - size_desc * header->num_desc, header->desc_offset); - TEST_ASSERT(ret == size_desc * header->num_desc, + size_desc * header.num_desc, header.desc_offset); + TEST_ASSERT(ret == size_desc * header.num_desc, "Read KVM stats descriptors"); /* Sanity check for fields in descriptors */ - for (i = 0; i < header->num_desc; ++i) { + for (i = 0; i < header.num_desc; ++i) { pdesc = (void *)stats_desc + i * size_desc; /* Check type,unit,base boundaries */ TEST_ASSERT((pdesc->flags & KVM_STATS_TYPE_MASK) @@ -104,7 +101,7 @@ static void stats_test(int stats_fd) break; } /* Check name string */ - TEST_ASSERT(strlen(pdesc->name) < header->name_size, + TEST_ASSERT(strlen(pdesc->name) < header.name_size, "KVM stats name(%s) too long", pdesc->name); /* Check size field, which should not be zero */ TEST_ASSERT(pdesc->size, "KVM descriptor(%s) with size of 0", @@ -124,14 +121,14 @@ static void stats_test(int stats_fd) size_data += pdesc->size * sizeof(*stats_data); } /* Check overlap */ - TEST_ASSERT(header->data_offset >= header->desc_offset - || header->data_offset + size_data <= header->desc_offset, + TEST_ASSERT(header.data_offset >= header.desc_offset + || header.data_offset + size_data <= header.desc_offset, "Data block is overlapped with Descriptor block"); /* Check validity of all stats data size */ - TEST_ASSERT(size_data >= header->num_desc * sizeof(*stats_data), + TEST_ASSERT(size_data >= header.num_desc * sizeof(*stats_data), "Data size is not correct"); /* Check stats offset */ - for (i = 0; i < header->num_desc; ++i) { + for (i = 0; i < header.num_desc; ++i) { pdesc = (void *)stats_desc + i * size_desc; TEST_ASSERT(pdesc->offset < size_data, "Invalid offset (%u) for stats: %s", @@ -142,15 +139,15 @@ static void stats_test(int stats_fd) stats_data = malloc(size_data); TEST_ASSERT(stats_data, "Allocate memory for stats data"); /* Read kvm stats data as a bulk */ - ret = pread(stats_fd, stats_data, size_data, header->data_offset); + ret = pread(stats_fd, stats_data, size_data, header.data_offset); TEST_ASSERT(ret == size_data, "Read KVM stats data"); /* Read kvm stats data one by one */ size_data = 0; - for (i = 0; i < header->num_desc; ++i) { + for (i = 0; i < header.num_desc; ++i) { pdesc = (void *)stats_desc + i * size_desc; ret = pread(stats_fd, stats_data, pdesc->size * sizeof(*stats_data), - header->data_offset + size_data); + header.data_offset + size_data); TEST_ASSERT(ret == pdesc->size * sizeof(*stats_data), "Read data of KVM stats: %s", pdesc->name); size_data += pdesc->size * sizeof(*stats_data); @@ -159,7 +156,6 @@ static void stats_test(int stats_fd) free(stats_data); free(stats_desc); free(id); - free(header); } -- cgit v1.2.3 From 32faa0647cea46ffda9095c3a8c95b315e139f0f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:15 +0000 Subject: KVM: selftests: Read binary stats header in lib Move the code to read the binary stats header to the KVM selftests library. It will be re-used by other tests to check KVM behavior. No functional change intended. 
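Together with vm_get_stats_fd(), which is already in kvm_util_base.h, a caller then reduces to a few lines; a minimal usage sketch (both helpers appear in the diff below):

	struct kvm_stats_header header;
	int stats_fd = vm_get_stats_fd(vm);

	read_stats_header(stats_fd, &header);
	/* header.name_size, header.num_desc, header.desc_offset ... now valid */
	close(stats_fd);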
Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-3-bgardon@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util_base.h | 8 ++++++++ tools/testing/selftests/kvm/kvm_binary_stats_test.c | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 7ebfc8c7de17..669fe4b400c2 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -308,6 +308,14 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm) return fd; } +static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) +{ + ssize_t ret; + + ret = read(stats_fd, header, sizeof(*header)); + TEST_ASSERT(ret == sizeof(*header), "Read stats header"); +} + void vm_create_irqchip(struct kvm_vm *vm); void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index ac5c3d0181e3..2a90b6276fcd 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -33,8 +33,8 @@ static void stats_test(int stats_fd) struct kvm_stats_desc *pdesc; /* Read kvm stats header */ - ret = read(stats_fd, &header, sizeof(header)); - TEST_ASSERT(ret == sizeof(header), "Read stats header"); + read_stats_header(stats_fd, &header); + size_desc = sizeof(*stats_desc) + header.name_size; /* Read kvm stats id string */ -- cgit v1.2.3 From 4d0a059415708ec0f221616f187853176d2fbebc Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:16 +0000 Subject: KVM: selftests: Read binary stats desc in lib Move the code to read the binary stats descriptors to the KVM selftests library. It will be re-used by other tests to check KVM behavior. No functional change intended. Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-4-bgardon@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 25 ++++++++++++++++ .../testing/selftests/kvm/kvm_binary_stats_test.c | 16 ++++------- tools/testing/selftests/kvm/lib/kvm_util.c | 33 ++++++++++++++++++++++ 3 files changed, 63 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 669fe4b400c2..8bce6df5fd40 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -316,6 +316,31 @@ static inline void read_stats_header(int stats_fd, struct kvm_stats_header *head TEST_ASSERT(ret == sizeof(*header), "Read stats header"); } +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header); + +static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) +{ + /* + * The base size of the descriptor is defined by KVM's ABI, but the + * size of the name field is variable, as far as KVM's ABI is + * concerned. For a given instance of KVM, the name field is the same + * size for all stats and is provided in the overall stats header. 
+ */ + return sizeof(struct kvm_stats_desc) + header->name_size; +} + +static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, + int index, + struct kvm_stats_header *header) +{ + /* + * Note, size_desc includes the size of the name field, which is + * variable. i.e. this is NOT equivalent to &stats_desc[i]. + */ + return (void *)stats + index * get_stats_descriptor_size(header); +} + void vm_create_irqchip(struct kvm_vm *vm); void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 2a90b6276fcd..66cf6cdc638b 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -35,7 +35,7 @@ static void stats_test(int stats_fd) /* Read kvm stats header */ read_stats_header(stats_fd, &header); - size_desc = sizeof(*stats_desc) + header.name_size; + size_desc = get_stats_descriptor_size(&header); /* Read kvm stats id string */ id = malloc(header.name_size); @@ -62,18 +62,12 @@ static void stats_test(int stats_fd) header.data_offset), "Descriptor block is overlapped with data block"); - /* Allocate memory for stats descriptors */ - stats_desc = calloc(header.num_desc, size_desc); - TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); /* Read kvm stats descriptors */ - ret = pread(stats_fd, stats_desc, - size_desc * header.num_desc, header.desc_offset); - TEST_ASSERT(ret == size_desc * header.num_desc, - "Read KVM stats descriptors"); + stats_desc = read_stats_descriptors(stats_fd, &header); /* Sanity check for fields in descriptors */ for (i = 0; i < header.num_desc; ++i) { - pdesc = (void *)stats_desc + i * size_desc; + pdesc = get_stats_descriptor(stats_desc, i, &header); /* Check type,unit,base boundaries */ TEST_ASSERT((pdesc->flags & KVM_STATS_TYPE_MASK) <= KVM_STATS_TYPE_MAX, "Unknown KVM stats type"); @@ -129,7 +123,7 @@ static void stats_test(int stats_fd) "Data size is not correct"); /* Check stats offset */ for (i = 0; i < header.num_desc; ++i) { - pdesc = (void *)stats_desc + i * size_desc; + pdesc = get_stats_descriptor(stats_desc, i, &header); TEST_ASSERT(pdesc->offset < size_data, "Invalid offset (%u) for stats: %s", pdesc->offset, pdesc->name); @@ -144,7 +138,7 @@ static void stats_test(int stats_fd) /* Read kvm stats data one by one */ size_data = 0; for (i = 0; i < header.num_desc; ++i) { - pdesc = (void *)stats_desc + i * size_desc; + pdesc = get_stats_descriptor(stats_desc, i, &header); ret = pread(stats_fd, stats_data, pdesc->size * sizeof(*stats_data), header.data_offset + size_data); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f8c104dba258..64ef108cc270 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1850,3 +1850,36 @@ unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); return vm_adjust_num_guest_pages(mode, n); } + +/* + * Read binary stats descriptors + * + * Input Args: + * stats_fd - the file descriptor for the binary stats file from which to read + * header - the binary stats metadata header corresponding to the given FD + * + * Output Args: None + * + * Return: + * A pointer to a newly allocated series of stat descriptors. + * Caller is responsible for freeing the returned kvm_stats_desc. 
+ * + * Read the stats descriptors from the binary stats interface. + */ +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header) +{ + struct kvm_stats_desc *stats_desc; + ssize_t desc_size, total_size, ret; + + desc_size = get_stats_descriptor_size(header); + total_size = header->num_desc * desc_size; + + stats_desc = calloc(header->num_desc, desc_size); + TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); + + ret = pread(stats_fd, stats_desc, total_size, header->desc_offset); + TEST_ASSERT(ret == total_size, "Read KVM stats descriptors"); + + return stats_desc; +} -- cgit v1.2.3 From 143e7eea3d66737c73cc3aa3538123ea3bb1f640 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Jun 2022 21:25:17 +0000 Subject: KVM: selftests: Clean up coding style in binary stats test Fix a variety of code style violations and/or inconsistencies in the binary stats test. The 80 char limit is a soft limit and can and should be ignored/violated if doing so improves the overall code readability. Specifically, provide consistent indentation and don't split expressions at arbitrary points just to honor the 80 char limit. Opportunistically expand/add comments to call out the more subtle aspects of the code. Signed-off-by: Sean Christopherson Reviewed-by: David Matlack Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-5-bgardon@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/kvm_binary_stats_test.c | 79 ++++++++++++---------- 1 file changed, 45 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 66cf6cdc638b..a904873b2b86 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -40,27 +40,31 @@ static void stats_test(int stats_fd) /* Read kvm stats id string */ id = malloc(header.name_size); TEST_ASSERT(id, "Allocate memory for id string"); + ret = read(stats_fd, id, header.name_size); TEST_ASSERT(ret == header.name_size, "Read id string"); /* Check id string, that should start with "kvm" */ TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header.name_size, - "Invalid KVM stats type, id: %s", id); + "Invalid KVM stats type, id: %s", id); /* Sanity check for other fields in header */ if (header.num_desc == 0) { printf("No KVM stats defined!"); return; } - /* Check overlap */ - TEST_ASSERT(header.desc_offset > 0 && header.data_offset > 0 - && header.desc_offset >= sizeof(header) - && header.data_offset >= sizeof(header), - "Invalid offset fields in header"); + /* + * The descriptor and data offsets must be valid, they must not overlap + * the header, and the descriptor and data blocks must not overlap each + * other. Note, the data block is rechecked after its size is known. 
+ */ + TEST_ASSERT(header.desc_offset && header.desc_offset >= sizeof(header) && + header.data_offset && header.data_offset >= sizeof(header), + "Invalid offset fields in header"); + TEST_ASSERT(header.desc_offset > header.data_offset || - (header.desc_offset + size_desc * header.num_desc <= - header.data_offset), - "Descriptor block is overlapped with data block"); + (header.desc_offset + size_desc * header.num_desc <= header.data_offset), + "Descriptor block is overlapped with data block"); /* Read kvm stats descriptors */ stats_desc = read_stats_descriptors(stats_fd, &header); @@ -68,14 +72,17 @@ static void stats_test(int stats_fd) /* Sanity check for fields in descriptors */ for (i = 0; i < header.num_desc; ++i) { pdesc = get_stats_descriptor(stats_desc, i, &header); + /* Check type,unit,base boundaries */ - TEST_ASSERT((pdesc->flags & KVM_STATS_TYPE_MASK) - <= KVM_STATS_TYPE_MAX, "Unknown KVM stats type"); - TEST_ASSERT((pdesc->flags & KVM_STATS_UNIT_MASK) - <= KVM_STATS_UNIT_MAX, "Unknown KVM stats unit"); - TEST_ASSERT((pdesc->flags & KVM_STATS_BASE_MASK) - <= KVM_STATS_BASE_MAX, "Unknown KVM stats base"); - /* Check exponent for stats unit + TEST_ASSERT((pdesc->flags & KVM_STATS_TYPE_MASK) <= KVM_STATS_TYPE_MAX, + "Unknown KVM stats type"); + TEST_ASSERT((pdesc->flags & KVM_STATS_UNIT_MASK) <= KVM_STATS_UNIT_MAX, + "Unknown KVM stats unit"); + TEST_ASSERT((pdesc->flags & KVM_STATS_BASE_MASK) <= KVM_STATS_BASE_MAX, + "Unknown KVM stats base"); + + /* + * Check exponent for stats unit * Exponent for counter should be greater than or equal to 0 * Exponent for unit bytes should be greater than or equal to 0 * Exponent for unit seconds should be less than or equal to 0 @@ -86,47 +93,51 @@ static void stats_test(int stats_fd) case KVM_STATS_UNIT_NONE: case KVM_STATS_UNIT_BYTES: case KVM_STATS_UNIT_CYCLES: - TEST_ASSERT(pdesc->exponent >= 0, - "Unsupported KVM stats unit"); + TEST_ASSERT(pdesc->exponent >= 0, "Unsupported KVM stats unit"); break; case KVM_STATS_UNIT_SECONDS: - TEST_ASSERT(pdesc->exponent <= 0, - "Unsupported KVM stats unit"); + TEST_ASSERT(pdesc->exponent <= 0, "Unsupported KVM stats unit"); break; } /* Check name string */ TEST_ASSERT(strlen(pdesc->name) < header.name_size, - "KVM stats name(%s) too long", pdesc->name); + "KVM stats name(%s) too long", pdesc->name); /* Check size field, which should not be zero */ - TEST_ASSERT(pdesc->size, "KVM descriptor(%s) with size of 0", - pdesc->name); + TEST_ASSERT(pdesc->size, + "KVM descriptor(%s) with size of 0", pdesc->name); /* Check bucket_size field */ switch (pdesc->flags & KVM_STATS_TYPE_MASK) { case KVM_STATS_TYPE_LINEAR_HIST: TEST_ASSERT(pdesc->bucket_size, - "Bucket size of Linear Histogram stats (%s) is zero", - pdesc->name); + "Bucket size of Linear Histogram stats (%s) is zero", + pdesc->name); break; default: TEST_ASSERT(!pdesc->bucket_size, - "Bucket size of stats (%s) is not zero", - pdesc->name); + "Bucket size of stats (%s) is not zero", + pdesc->name); } size_data += pdesc->size * sizeof(*stats_data); } - /* Check overlap */ - TEST_ASSERT(header.data_offset >= header.desc_offset - || header.data_offset + size_data <= header.desc_offset, - "Data block is overlapped with Descriptor block"); + + /* + * Now that the size of the data block is known, verify the data block + * doesn't overlap the descriptor block. 
+ */ + TEST_ASSERT(header.data_offset >= header.desc_offset || + header.data_offset + size_data <= header.desc_offset, + "Data block is overlapped with Descriptor block"); + /* Check validity of all stats data size */ TEST_ASSERT(size_data >= header.num_desc * sizeof(*stats_data), - "Data size is not correct"); + "Data size is not correct"); + /* Check stats offset */ for (i = 0; i < header.num_desc; ++i) { pdesc = get_stats_descriptor(stats_desc, i, &header); TEST_ASSERT(pdesc->offset < size_data, - "Invalid offset (%u) for stats: %s", - pdesc->offset, pdesc->name); + "Invalid offset (%u) for stats: %s", + pdesc->offset, pdesc->name); } /* Allocate memory for stats data */ -- cgit v1.2.3 From ed6b53ec9090133cd744705a09811cf627f3e9cb Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:18 +0000 Subject: KVM: selftests: Read binary stat data in lib Move the code to read the binary stats data to the KVM selftests library. It will be re-used by other tests to check KVM behavior. Also opportunistically remove an unnecessary calculation with "size_data" in stats_test. Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-6-bgardon@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 4 +++ .../testing/selftests/kvm/kvm_binary_stats_test.c | 9 ++---- tools/testing/selftests/kvm/lib/kvm_util.c | 35 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 8bce6df5fd40..cec3af45e18d 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -341,6 +341,10 @@ static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc return (void *)stats + index * get_stats_descriptor_size(header); } +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements); + void vm_create_irqchip(struct kvm_vm *vm); void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index a904873b2b86..b01e8b0851e7 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -147,15 +147,10 @@ static void stats_test(int stats_fd) ret = pread(stats_fd, stats_data, size_data, header.data_offset); TEST_ASSERT(ret == size_data, "Read KVM stats data"); /* Read kvm stats data one by one */ - size_data = 0; for (i = 0; i < header.num_desc; ++i) { pdesc = get_stats_descriptor(stats_desc, i, &header); - ret = pread(stats_fd, stats_data, - pdesc->size * sizeof(*stats_data), - header.data_offset + size_data); - TEST_ASSERT(ret == pdesc->size * sizeof(*stats_data), - "Read data of KVM stats: %s", pdesc->name); - size_data += pdesc->size * sizeof(*stats_data); + read_stat_data(stats_fd, &header, pdesc, stats_data, + pdesc->size); } free(stats_data); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 64ef108cc270..e44eb510fcc1 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1883,3 +1883,38 @@ struct kvm_stats_desc *read_stats_descriptors(int stats_fd, return stats_desc; } + +/* + * Read stat data 
for a particular stat + * + * Input Args: + * stats_fd - the file descriptor for the binary stats file from which to read + * header - the binary stats metadata header corresponding to the given FD + * desc - the binary stat metadata for the particular stat to be read + * max_elements - the maximum number of 8-byte values to read into data + * + * Output Args: + * data - the buffer into which stat data should be read + * + * Read the data values of a specified stat from the binary stats interface. + */ +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements) +{ + size_t nr_elements = min_t(ssize_t, desc->size, max_elements); + size_t size = nr_elements * sizeof(*data); + ssize_t ret; + + TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name); + TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name); + + ret = pread(stats_fd, data, size, + header->data_offset + desc->offset); + + TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)", + desc->name, errno, strerror(errno)); + TEST_ASSERT(ret == size, + "pread() on stat '%s' read %ld bytes, wanted %lu bytes", + desc->name, ret, size); +} -- cgit v1.2.3 From 8448ec5993beee031376e36f77969cc0a07d8c6b Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:19 +0000 Subject: KVM: selftests: Add NX huge pages test There's currently no test coverage of NX hugepages in KVM selftests, so add a basic test to ensure that the feature works as intended. The test creates a VM with a data slot backed with huge pages. The memory in the data slot is filled with op-codes for the return instruction. The guest then executes a series of accesses on the memory, some reads, some instruction fetches. After each operation, the guest exits and the test performs some checks on the backing page counts to ensure that NX page splitting and reclaim work as expected. 
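For orientation, the page-count checks poll named VM stats ("pages_2m", "nx_lpage_splits") through the binary-stats helpers added earlier in this series. A condensed caller-side sketch follows; it is illustrative only (the test itself goes through the vm_get_stat() wrapper introduced below, read_one_stat() is a hypothetical name, and the usual selftest headers are assumed):

/*
 * Illustrative sketch, not part of the patch: look up one named stat
 * using read_stats_header(), read_stats_descriptors(),
 * get_stats_descriptor() and read_stat_data() from this series.
 */
static uint64_t read_one_stat(int stats_fd, const char *name)
{
	struct kvm_stats_header header;
	struct kvm_stats_desc *descs, *desc;
	uint64_t value = 0;
	int i;

	read_stats_header(stats_fd, &header);
	descs = read_stats_descriptors(stats_fd, &header);

	for (i = 0; i < header.num_desc; i++) {
		desc = get_stats_descriptor(descs, i, &header);
		if (!strcmp(desc->name, name)) {
			/* Read a single 8-byte value for this stat. */
			read_stat_data(stats_fd, &header, desc, &value, 1);
			break;
		}
	}

	free(descs);
	return value;
}

This is essentially the shape of the __vm_get_stat() helper this patch adds, minus the VM plumbing.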
Reviewed-by: David Matlack Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-7-bgardon@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 10 + .../testing/selftests/kvm/include/kvm_util_base.h | 11 + tools/testing/selftests/kvm/lib/kvm_util.c | 46 +++++ .../selftests/kvm/x86_64/nx_huge_pages_test.c | 229 +++++++++++++++++++++ .../selftests/kvm/x86_64/nx_huge_pages_test.sh | 40 ++++ 6 files changed, 337 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c create mode 100755 tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index f3c2f074b948..c478b4fefc57 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -29,6 +29,7 @@ /x86_64/mmio_warning_test /x86_64/mmu_role_test /x86_64/monitor_mwait_test +/x86_64/nx_huge_pages_test /x86_64/platform_info_test /x86_64/pmu_event_filter_test /x86_64/set_boot_cpu_id diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ad1634e5659f..fa3e0687e9d5 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -70,6 +70,10 @@ LIBKVM_s390x += lib/s390x/ucall.c LIBKVM_riscv += lib/riscv/processor.c LIBKVM_riscv += lib/riscv/ucall.c +# Non-compiled test targets +TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh + +# Compiled test targets TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features @@ -135,6 +139,9 @@ TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test TEST_GEN_PROGS_x86_64 += system_counter_offset_test +# Compiled outputs used by test targets +TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test + TEST_GEN_PROGS_aarch64 += aarch64/arch_timer TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list @@ -174,7 +181,9 @@ TEST_GEN_PROGS_riscv += kvm_page_table_test TEST_GEN_PROGS_riscv += set_memory_region_test TEST_GEN_PROGS_riscv += kvm_binary_stats_test +TEST_PROGS += $(TEST_PROGS_$(UNAME_M)) TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) +TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) INSTALL_HDR_PATH = $(top_srcdir)/usr @@ -221,6 +230,7 @@ $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) $(TEST_GEN_PROGS): $(LIBKVM_OBJS) +$(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS) cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. 
cscope: diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index cec3af45e18d..c6c1d66136d6 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -345,6 +345,17 @@ void read_stat_data(int stats_fd, struct kvm_stats_header *header, struct kvm_stats_desc *desc, uint64_t *data, size_t max_elements); +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements); + +static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) +{ + uint64_t data; + + __vm_get_stat(vm, stat_name, &data, 1); + return data; +} + void vm_create_irqchip(struct kvm_vm *vm); void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e44eb510fcc1..5a0fd368503f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1918,3 +1918,49 @@ void read_stat_data(int stats_fd, struct kvm_stats_header *header, "pread() on stat '%s' read %ld bytes, wanted %lu bytes", desc->name, size, ret); } + +/* + * Read the data of the named stat + * + * Input Args: + * vm - the VM for which the stat should be read + * stat_name - the name of the stat to read + * max_elements - the maximum number of 8-byte values to read into data + * + * Output Args: + * data - the buffer into which stat data should be read + * + * Read the data values of a specified stat from the binary stats interface. + */ +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements) +{ + struct kvm_stats_desc *stats_desc; + struct kvm_stats_header header; + struct kvm_stats_desc *desc; + size_t size_desc; + int stats_fd; + int i; + + stats_fd = vm_get_stats_fd(vm); + + read_stats_header(stats_fd, &header); + + stats_desc = read_stats_descriptors(stats_fd, &header); + + size_desc = get_stats_descriptor_size(&header); + + for (i = 0; i < header.num_desc; ++i) { + desc = (void *)stats_desc + (i * size_desc); + + if (strcmp(desc->name, stat_name)) + continue; + + read_stat_data(stats_fd, &header, desc, data, max_elements); + + break; + } + + free(stats_desc); + close(stats_fd); +} diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c new file mode 100644 index 000000000000..5fa61d225787 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/nx_huge_page_test.c + * + * Usage: to be run via nx_huge_page_test.sh, which does the necessary + * environment setup and teardown + * + * Copyright (C) 2022, Google LLC. + */ + +#define _GNU_SOURCE + +#include +#include +#include + +#include +#include "kvm_util.h" +#include "processor.h" + +#define HPAGE_SLOT 10 +#define HPAGE_GPA (4UL << 30) /* 4G prevents collision w/ slot 0 */ +#define HPAGE_GVA HPAGE_GPA /* GVA is arbitrary, so use GPA. */ +#define PAGES_PER_2MB_HUGE_PAGE 512 +#define HPAGE_SLOT_NPAGES (3 * PAGES_PER_2MB_HUGE_PAGE) + +/* + * Passed by nx_huge_pages_test.sh to provide an easy warning if this test is + * being run without it. + */ +#define MAGIC_TOKEN 887563923 + +/* + * x86 opcode for the return instruction. Used to call into, and then + * immediately return from, memory backed with hugepages. 
+ */ +#define RETURN_OPCODE 0xC3 + +/* Call the specified memory address. */ +static void guest_do_CALL(uint64_t target) +{ + ((void (*)(void)) target)(); +} + +/* + * Exit the VM after each memory access so that the userspace component of the + * test can make assertions about the pages backing the VM. + * + * See the below for an explanation of how each access should affect the + * backing mappings. + */ +void guest_code(void) +{ + uint64_t hpage_1 = HPAGE_GVA; + uint64_t hpage_2 = hpage_1 + (PAGE_SIZE * 512); + uint64_t hpage_3 = hpage_2 + (PAGE_SIZE * 512); + + READ_ONCE(*(uint64_t *)hpage_1); + GUEST_SYNC(1); + + READ_ONCE(*(uint64_t *)hpage_2); + GUEST_SYNC(2); + + guest_do_CALL(hpage_1); + GUEST_SYNC(3); + + guest_do_CALL(hpage_3); + GUEST_SYNC(4); + + READ_ONCE(*(uint64_t *)hpage_1); + GUEST_SYNC(5); + + READ_ONCE(*(uint64_t *)hpage_3); + GUEST_SYNC(6); +} + +static void check_2m_page_count(struct kvm_vm *vm, int expected_pages_2m) +{ + int actual_pages_2m; + + actual_pages_2m = vm_get_stat(vm, "pages_2m"); + + TEST_ASSERT(actual_pages_2m == expected_pages_2m, + "Unexpected 2m page count. Expected %d, got %d", + expected_pages_2m, actual_pages_2m); +} + +static void check_split_count(struct kvm_vm *vm, int expected_splits) +{ + int actual_splits; + + actual_splits = vm_get_stat(vm, "nx_lpage_splits"); + + TEST_ASSERT(actual_splits == expected_splits, + "Unexpected NX huge page split count. Expected %d, got %d", + expected_splits, actual_splits); +} + +static void wait_for_reclaim(int reclaim_period_ms) +{ + long reclaim_wait_ms; + struct timespec ts; + + reclaim_wait_ms = reclaim_period_ms * 5; + ts.tv_sec = reclaim_wait_ms / 1000; + ts.tv_nsec = (reclaim_wait_ms - (ts.tv_sec * 1000)) * 1000000; + nanosleep(&ts, NULL); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-p period_ms] [-t token]\n", name); + puts(""); + printf(" -p: The NX reclaim period in miliseconds.\n"); + printf(" -t: The magic token to indicate environment setup is done.\n"); + puts(""); + exit(0); +} + +int main(int argc, char **argv) +{ + int reclaim_period_ms = 0, token = 0, opt; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + void *hva; + + while ((opt = getopt(argc, argv, "hp:t:")) != -1) { + switch (opt) { + case 'p': + reclaim_period_ms = atoi(optarg); + break; + case 't': + token = atoi(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + if (token != MAGIC_TOKEN) { + print_skip("This test must be run with the magic token %d.\n" + "This is done by nx_huge_pages_test.sh, which\n" + "also handles environment setup for the test.", + MAGIC_TOKEN); + exit(KSFT_SKIP); + } + + if (!reclaim_period_ms) { + print_skip("The NX reclaim period must be specified and non-zero"); + exit(KSFT_SKIP); + } + + vm = vm_create(1); + vcpu = vm_vcpu_add(vm, 0, guest_code); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_HUGETLB, + HPAGE_GPA, HPAGE_SLOT, + HPAGE_SLOT_NPAGES, 0); + + virt_map(vm, HPAGE_GVA, HPAGE_GPA, HPAGE_SLOT_NPAGES); + + hva = addr_gpa2hva(vm, HPAGE_GPA); + memset(hva, RETURN_OPCODE, HPAGE_SLOT_NPAGES * PAGE_SIZE); + + check_2m_page_count(vm, 0); + check_split_count(vm, 0); + + /* + * The guest code will first read from the first hugepage, resulting + * in a huge page mapping being created. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 1); + check_split_count(vm, 0); + + /* + * Then the guest code will read from the second hugepage, resulting + * in another huge page mapping being created. 
+ */ + vcpu_run(vcpu); + check_2m_page_count(vm, 2); + check_split_count(vm, 0); + + /* + * Next, the guest will execute from the first huge page, causing it + * to be remapped at 4k. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 1); + check_split_count(vm, 1); + + /* + * Executing from the third huge page (previously unaccessed) will + * cause part to be mapped at 4k. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 1); + check_split_count(vm, 2); + + /* Reading from the first huge page again should have no effect. */ + vcpu_run(vcpu); + check_2m_page_count(vm, 1); + check_split_count(vm, 2); + + /* Give recovery thread time to run. */ + wait_for_reclaim(reclaim_period_ms); + + /* + * Now that the reclaimer has run, all the split pages should be gone. + */ + check_2m_page_count(vm, 1); + check_split_count(vm, 0); + + /* + * The 4k mapping on hpage 3 should have been removed, so check that + * reading from it causes a huge page mapping to be installed. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 2); + check_split_count(vm, 0); + + kvm_vm_free(vm); + + return 0; +} + diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh new file mode 100755 index 000000000000..4e090a84f5f3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only */ +# +# Wrapper script which performs setup and cleanup for nx_huge_pages_test. +# Makes use of root privileges to set up huge pages and KVM module parameters. +# +# tools/testing/selftests/kvm/nx_huge_page_test.sh +# Copyright (C) 2022, Google LLC. + +set -e + +NX_HUGE_PAGES=$(cat /sys/module/kvm/parameters/nx_huge_pages) +NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio) +NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms) +HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) + +set +e + +function sudo_echo () { + echo "$1" | sudo tee -a "$2" > /dev/null +} + +( + set -e + + sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages + sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio + sudo_echo 100 /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms + sudo_echo "$(( $HUGE_PAGES + 3 ))" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + + "$(dirname $0)"/nx_huge_pages_test -t 887563923 -p 100 +) +RET=$? + +sudo_echo "$NX_HUGE_PAGES" /sys/module/kvm/parameters/nx_huge_pages +sudo_echo "$NX_HUGE_PAGES_RECOVERY_RATIO" /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio +sudo_echo "$NX_HUGE_PAGES_RECOVERY_PERIOD" /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms +sudo_echo "$HUGE_PAGES" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + +exit $RET -- cgit v1.2.3 From b774da3f2e5761cb85881ce62eb6dc97d15396c4 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:22 +0000 Subject: KVM: selftests: Test disabling NX hugepages on a VM Add an argument to the NX huge pages test to test disabling the feature on a VM using the new capability. Reviewed-by: David Matlack Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-10-bgardon@google.com> [Handle failure of sudo or setcap more gracefully. 
- Paolo] Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 6 + .../selftests/kvm/x86_64/nx_huge_pages_test.c | 134 +++++++++++++-------- .../selftests/kvm/x86_64/nx_huge_pages_test.sh | 21 +++- 3 files changed, 113 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index c6c1d66136d6..fd2b52850b29 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -809,4 +809,10 @@ static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_arch_dump(stream, vm, indent); } + +static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) +{ + return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); +} + #endif /* SELFTEST_KVM_UTIL_BASE_H */ diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 5fa61d225787..cc6421716400 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -107,53 +107,34 @@ static void wait_for_reclaim(int reclaim_period_ms) nanosleep(&ts, NULL); } -static void help(char *name) -{ - puts(""); - printf("usage: %s [-h] [-p period_ms] [-t token]\n", name); - puts(""); - printf(" -p: The NX reclaim period in miliseconds.\n"); - printf(" -t: The magic token to indicate environment setup is done.\n"); - puts(""); - exit(0); -} - -int main(int argc, char **argv) +void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, + bool reboot_permissions) { - int reclaim_period_ms = 0, token = 0, opt; struct kvm_vcpu *vcpu; struct kvm_vm *vm; void *hva; + int r; - while ((opt = getopt(argc, argv, "hp:t:")) != -1) { - switch (opt) { - case 'p': - reclaim_period_ms = atoi(optarg); - break; - case 't': - token = atoi(optarg); - break; - case 'h': - default: - help(argv[0]); - break; - } - } - - if (token != MAGIC_TOKEN) { - print_skip("This test must be run with the magic token %d.\n" - "This is done by nx_huge_pages_test.sh, which\n" - "also handles environment setup for the test.", - MAGIC_TOKEN); - exit(KSFT_SKIP); - } + vm = vm_create(1); - if (!reclaim_period_ms) { - print_skip("The NX reclaim period must be specified and non-zero"); - exit(KSFT_SKIP); + if (disable_nx_huge_pages) { + /* + * Cannot run the test without NX huge pages if the kernel + * does not support it. + */ + if (!kvm_check_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)) + return; + + r = __vm_disable_nx_huge_pages(vm); + if (reboot_permissions) { + TEST_ASSERT(!r, "Disabling NX huge pages should succeed if process has reboot permissions"); + } else { + TEST_ASSERT(r == -1 && errno == EPERM, + "This process should not have permission to disable NX huge pages"); + return; + } } - vm = vm_create(1); vcpu = vm_vcpu_add(vm, 0, guest_code); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_HUGETLB, @@ -187,31 +168,38 @@ int main(int argc, char **argv) /* * Next, the guest will execute from the first huge page, causing it * to be remapped at 4k. + * + * If NX huge pages are disabled, this should have no effect. */ vcpu_run(vcpu); - check_2m_page_count(vm, 1); - check_split_count(vm, 1); + check_2m_page_count(vm, disable_nx_huge_pages ? 2 : 1); + check_split_count(vm, disable_nx_huge_pages ? 0 : 1); /* * Executing from the third huge page (previously unaccessed) will * cause part to be mapped at 4k. 
+ * + * If NX huge pages are disabled, it should be mapped at 2M. */ vcpu_run(vcpu); - check_2m_page_count(vm, 1); - check_split_count(vm, 2); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, disable_nx_huge_pages ? 0 : 2); /* Reading from the first huge page again should have no effect. */ vcpu_run(vcpu); - check_2m_page_count(vm, 1); - check_split_count(vm, 2); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, disable_nx_huge_pages ? 0 : 2); /* Give recovery thread time to run. */ wait_for_reclaim(reclaim_period_ms); /* * Now that the reclaimer has run, all the split pages should be gone. + * + * If NX huge pages are disabled, the relaimer will not run, so + * nothing should change from here on. */ - check_2m_page_count(vm, 1); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); check_split_count(vm, 0); /* @@ -219,10 +207,62 @@ int main(int argc, char **argv) * reading from it causes a huge page mapping to be installed. */ vcpu_run(vcpu); - check_2m_page_count(vm, 2); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 2); check_split_count(vm, 0); kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-p period_ms] [-t token]\n", name); + puts(""); + printf(" -p: The NX reclaim period in miliseconds.\n"); + printf(" -t: The magic token to indicate environment setup is done.\n"); + printf(" -r: The test has reboot permissions and can disable NX huge pages.\n"); + puts(""); + exit(0); +} + +int main(int argc, char **argv) +{ + int reclaim_period_ms = 0, token = 0, opt; + bool reboot_permissions = false; + + while ((opt = getopt(argc, argv, "hp:t:r")) != -1) { + switch (opt) { + case 'p': + reclaim_period_ms = atoi(optarg); + break; + case 't': + token = atoi(optarg); + break; + case 'r': + reboot_permissions = true; + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + if (token != MAGIC_TOKEN) { + print_skip("This test must be run with the magic token %d.\n" + "This is done by nx_huge_pages_test.sh, which\n" + "also handles environment setup for the test.", + MAGIC_TOKEN); + exit(KSFT_SKIP); + } + + if (!reclaim_period_ms) { + print_skip("The NX reclaim period must be specified and non-zero"); + exit(KSFT_SKIP); + } + + run_test(reclaim_period_ms, false, reboot_permissions); + run_test(reclaim_period_ms, true, reboot_permissions); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh index 4e090a84f5f3..0560149e66ed 100755 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh @@ -20,6 +20,10 @@ function sudo_echo () { echo "$1" | sudo tee -a "$2" > /dev/null } +NXECUTABLE="$(dirname $0)/nx_huge_pages_test" + +sudo_echo test /dev/null || exit 4 # KSFT_SKIP=4 + ( set -e @@ -28,7 +32,22 @@ function sudo_echo () { sudo_echo 100 /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms sudo_echo "$(( $HUGE_PAGES + 3 ))" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages - "$(dirname $0)"/nx_huge_pages_test -t 887563923 -p 100 + # Test with reboot permissions + if [ $(whoami) == "root" ] || sudo setcap cap_sys_boot+ep $NXECUTABLE 2> /dev/null; then + echo Running test with CAP_SYS_BOOT enabled + $NXECUTABLE -t 887563923 -p 100 -r + test $(whoami) == "root" || sudo setcap cap_sys_boot-ep $NXECUTABLE + else + echo setcap failed, skipping nx_huge_pages_test with CAP_SYS_BOOT enabled + fi + + # Test without 
reboot permissions + if [ $(whoami) != "root" ] ; then + echo Running test with CAP_SYS_BOOT disabled + $NXECUTABLE -t 887563923 -p 100 + else + echo Running as root, skipping nx_huge_pages_test with CAP_SYS_BOOT disabled + fi ) RET=$? -- cgit v1.2.3 From 83f6e109f562063ab7a1f54d99bcab2858b09ead Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:23 +0000 Subject: KVM: selftests: Cache binary stats metadata for duration of test In order to improve performance across multiple reads of VM stats, cache the stats metadata in the VM struct. Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-11-bgardon@google.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/kvm_util_base.h | 5 ++++ tools/testing/selftests/kvm/lib/kvm_util.c | 32 ++++++++++++---------- 2 files changed, 22 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index fd2b52850b29..b78e3c7a2566 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -84,6 +84,11 @@ struct kvm_vm { vm_vaddr_t idt; vm_vaddr_t handlers; uint32_t dirty_ring_size; + + /* Cache of information for binary stats interface */ + int stats_fd; + struct kvm_stats_header stats_header; + struct kvm_stats_desc *stats_desc; }; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 5a0fd368503f..768f3bce0161 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -547,6 +547,12 @@ void kvm_vm_free(struct kvm_vm *vmp) if (vmp == NULL) return; + /* Free cached stats metadata and close FD */ + if (vmp->stats_fd) { + free(vmp->stats_desc); + close(vmp->stats_fd); + } + /* Free userspace_mem_regions. */ hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) __vm_mem_region_delete(vmp, region, false); @@ -1935,32 +1941,28 @@ void read_stat_data(int stats_fd, struct kvm_stats_header *header, void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, size_t max_elements) { - struct kvm_stats_desc *stats_desc; - struct kvm_stats_header header; struct kvm_stats_desc *desc; size_t size_desc; - int stats_fd; int i; - stats_fd = vm_get_stats_fd(vm); - - read_stats_header(stats_fd, &header); - - stats_desc = read_stats_descriptors(stats_fd, &header); + if (!vm->stats_fd) { + vm->stats_fd = vm_get_stats_fd(vm); + read_stats_header(vm->stats_fd, &vm->stats_header); + vm->stats_desc = read_stats_descriptors(vm->stats_fd, + &vm->stats_header); + } - size_desc = get_stats_descriptor_size(&header); + size_desc = get_stats_descriptor_size(&vm->stats_header); - for (i = 0; i < header.num_desc; ++i) { - desc = (void *)stats_desc + (i * size_desc); + for (i = 0; i < vm->stats_header.num_desc; ++i) { + desc = (void *)vm->stats_desc + (i * size_desc); if (strcmp(desc->name, stat_name)) continue; - read_stat_data(stats_fd, &header, desc, data, max_elements); + read_stat_data(vm->stats_fd, &vm->stats_header, desc, + data, max_elements); break; } - - free(stats_desc); - close(stats_fd); } -- cgit v1.2.3 From eede2065cacce2e04110bc6e45e9dc8e843c571b Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Fri, 10 Jun 2022 10:11:34 -0700 Subject: KVM: selftests: Add a self test for CMCI and UCNA emulations. This patch adds a self test that verifies user space can inject UnCorrectable No Action required (UCNA) memory errors to the guest. 
It also verifies that incorrectly configured MSRs for Corrected Machine Check Interrupt (CMCI) emulation will result in #GP. Signed-off-by: Jue Wang Signed-off-by: Paolo Bonzini Message-Id: <20220610171134.772566-9-juew@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/include/x86_64/apic.h | 1 + tools/testing/selftests/kvm/include/x86_64/mce.h | 25 ++ .../selftests/kvm/x86_64/ucna_injection_test.c | 316 +++++++++++++++++++++ 5 files changed, 344 insertions(+) create mode 100644 tools/testing/selftests/kvm/include/x86_64/mce.h create mode 100644 tools/testing/selftests/kvm/x86_64/ucna_injection_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index c478b4fefc57..f44ebf401310 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -43,6 +43,7 @@ /x86_64/sync_regs_test /x86_64/tsc_msrs_test /x86_64/tsc_scaling_sync +/x86_64/ucna_injection_test /x86_64/userspace_io_test /x86_64/userspace_msr_exit_test /x86_64/vmx_apic_access_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index fa3e0687e9d5..4d6753aadfa0 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -101,6 +101,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_soft_inject_test TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test +TEST_GEN_PROGS_x86_64 += x86_64/ucna_injection_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h index ac88557dcc9a..bed316fdecd5 100644 --- a/tools/testing/selftests/kvm/include/x86_64/apic.h +++ b/tools/testing/selftests/kvm/include/x86_64/apic.h @@ -35,6 +35,7 @@ #define APIC_SPIV_APIC_ENABLED (1 << 8) #define APIC_IRR 0x200 #define APIC_ICR 0x300 +#define APIC_LVTCMCI 0x2f0 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 #define APIC_DEST_ALLBUT 0xC0000 diff --git a/tools/testing/selftests/kvm/include/x86_64/mce.h b/tools/testing/selftests/kvm/include/x86_64/mce.h new file mode 100644 index 000000000000..6119321f3f5d --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/mce.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/mce.h + * + * Copyright (C) 2022, Google LLC. + */ + +#ifndef SELFTEST_KVM_MCE_H +#define SELFTEST_KVM_MCE_H + +#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ +#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */ +#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */ +#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define KVM_MAX_MCE_BANKS 32 +#define MCG_CAP_BANKS_MASK 0xff /* Bit 0-7 of the MCG_CAP register are #banks */ +#define MCI_STATUS_VAL (1ULL << 63) /* valid error */ +#define MCI_STATUS_UC (1ULL << 61) /* uncorrected error */ +#define MCI_STATUS_EN (1ULL << 60) /* error enabled */ +#define MCI_STATUS_MISCV (1ULL << 59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1ULL << 58) /* addr reg. 
valid */ +#define MCM_ADDR_PHYS 2 /* physical address */ +#define MCI_CTL2_CMCI_EN BIT_ULL(30) + +#endif /* SELFTEST_KVM_MCE_H */ diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c new file mode 100644 index 000000000000..a897c7fd8abe --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucna_injection_test + * + * Copyright (C) 2022, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test that user space can inject UnCorrectable No Action required (UCNA) + * memory errors to the guest. + * + * The test starts one vCPU with the MCG_CMCI_P enabled. It verifies that + * proper UCNA errors can be injected to a vCPU with MCG_CMCI_P and + * corresponding per-bank control register (MCI_CTL2) bit enabled. + * The test also checks that the UCNA errors get recorded in the + * Machine Check bank registers no matter the error signal interrupts get + * delivered into the guest or not. + * + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include + +#include "kvm_util_base.h" +#include "kvm_util.h" +#include "mce.h" +#include "processor.h" +#include "test_util.h" +#include "apic.h" + +#define SYNC_FIRST_UCNA 9 +#define SYNC_SECOND_UCNA 10 +#define SYNC_GP 11 +#define FIRST_UCNA_ADDR 0xdeadbeef +#define SECOND_UCNA_ADDR 0xcafeb0ba + +/* + * Vector for the CMCI interrupt. + * Value is arbitrary. Any value in 0x20-0xFF should work: + * https://wiki.osdev.org/Interrupt_Vector_Table + */ +#define CMCI_VECTOR 0xa9 + +#define UCNA_BANK 0x7 // IMC0 bank + +#define MCI_CTL2_RESERVED_BIT BIT_ULL(29) + +static uint64_t supported_mcg_caps; + +/* + * Record states about the injected UCNA. + * The variables started with the 'i_' prefixes are recorded in interrupt + * handler. Variables without the 'i_' prefixes are recorded in guest main + * execution thread. + */ +static volatile uint64_t i_ucna_rcvd; +static volatile uint64_t i_ucna_addr; +static volatile uint64_t ucna_addr; +static volatile uint64_t ucna_addr2; + +struct thread_params { + struct kvm_vcpu *vcpu; + uint64_t *p_i_ucna_rcvd; + uint64_t *p_i_ucna_addr; + uint64_t *p_ucna_addr; + uint64_t *p_ucna_addr2; +}; + +static void verify_apic_base_addr(void) +{ + uint64_t msr = rdmsr(MSR_IA32_APICBASE); + uint64_t base = GET_APIC_BASE(msr); + + GUEST_ASSERT(base == APIC_DEFAULT_GPA); +} + +static void ucna_injection_guest_code(void) +{ + uint64_t ctl2; + verify_apic_base_addr(); + xapic_enable(); + + /* Sets up the interrupt vector and enables per-bank CMCI sigaling. */ + xapic_write_reg(APIC_LVTCMCI, CMCI_VECTOR | APIC_DM_FIXED); + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); + + /* Enables interrupt in guest. */ + asm volatile("sti"); + + /* Let user space inject the first UCNA */ + GUEST_SYNC(SYNC_FIRST_UCNA); + + ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + + /* Disables the per-bank CMCI signaling. 
*/ + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 & ~MCI_CTL2_CMCI_EN); + + /* Let the user space inject the second UCNA */ + GUEST_SYNC(SYNC_SECOND_UCNA); + + ucna_addr2 = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + GUEST_DONE(); +} + +static void cmci_disabled_guest_code(void) +{ + uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); + + GUEST_DONE(); +} + +static void cmci_enabled_guest_code(void) +{ + uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_RESERVED_BIT); + + GUEST_DONE(); +} + +static void guest_cmci_handler(struct ex_regs *regs) +{ + i_ucna_rcvd++; + i_ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + xapic_write_reg(APIC_EOI, 0); +} + +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(SYNC_GP); +} + +static void run_vcpu_expect_gp(struct kvm_vcpu *vcpu) +{ + unsigned int exit_reason; + struct ucall uc; + + vcpu_run(vcpu); + + exit_reason = vcpu->run->exit_reason; + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", + exit_reason, exit_reason_str(exit_reason)); + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC\n"); + TEST_ASSERT(uc.args[1] == SYNC_GP, "#GP is expected."); + printf("vCPU received GP in guest.\n"); +} + +static void inject_ucna(struct kvm_vcpu *vcpu, uint64_t addr) { + /* + * A UCNA error is indicated with VAL=1, UC=1, PCC=0, S=0 and AR=0 in + * the IA32_MCi_STATUS register. + * MSCOD=1 (BIT[16] - MscodDataRdErr). + * MCACOD=0x0090 (Memory controller error format, channel 0) + */ + uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_MISCV | MCI_STATUS_ADDRV | 0x10090; + struct kvm_x86_mce mce = {}; + mce.status = status; + mce.mcg_status = 0; + /* + * MCM_ADDR_PHYS indicates the reported address is a physical address. + * Lowest 6 bits is the recoverable address LSB, i.e., the injected MCE + * is at 4KB granularity. 
+ */ + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; + mce.addr = addr; + mce.bank = UCNA_BANK; + + vcpu_ioctl(vcpu, KVM_X86_SET_MCE, &mce); +} + +static void *run_ucna_injection(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + struct ucall uc; + int old; + int r; + unsigned int exit_reason; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(r == 0, + "pthread_setcanceltype failed with errno=%d", + r); + + vcpu_run(params->vcpu); + + exit_reason = params->vcpu->run->exit_reason; + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "unexpected exit reason %u-%s, expected KVM_EXIT_IO", + exit_reason, exit_reason_str(exit_reason)); + TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC\n"); + TEST_ASSERT(uc.args[1] == SYNC_FIRST_UCNA, "Injecting first UCNA."); + + printf("Injecting first UCNA at %#x.\n", FIRST_UCNA_ADDR); + + inject_ucna(params->vcpu, FIRST_UCNA_ADDR); + vcpu_run(params->vcpu); + + exit_reason = params->vcpu->run->exit_reason; + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "unexpected exit reason %u-%s, expected KVM_EXIT_IO", + exit_reason, exit_reason_str(exit_reason)); + TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC\n"); + TEST_ASSERT(uc.args[1] == SYNC_SECOND_UCNA, "Injecting second UCNA."); + + printf("Injecting second UCNA at %#x.\n", SECOND_UCNA_ADDR); + + inject_ucna(params->vcpu, SECOND_UCNA_ADDR); + vcpu_run(params->vcpu); + + exit_reason = params->vcpu->run->exit_reason; + TEST_ASSERT(exit_reason == KVM_EXIT_IO, + "unexpected exit reason %u-%s, expected KVM_EXIT_IO", + exit_reason, exit_reason_str(exit_reason)); + if (get_ucall(params->vcpu, &uc) == UCALL_ABORT) { + TEST_ASSERT(false, "vCPU assertion failure: %s.\n", + (const char *)uc.args[0]); + } + + return NULL; +} + +static void test_ucna_injection(struct kvm_vcpu *vcpu, struct thread_params *params) +{ + struct kvm_vm *vm = vcpu->vm; + params->vcpu = vcpu; + params->p_i_ucna_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_rcvd); + params->p_i_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_addr); + params->p_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr); + params->p_ucna_addr2 = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr2); + + run_ucna_injection(params); + + TEST_ASSERT(*params->p_i_ucna_rcvd == 1, "Only first UCNA get signaled."); + TEST_ASSERT(*params->p_i_ucna_addr == FIRST_UCNA_ADDR, + "Only first UCNA reported addr get recorded via interrupt."); + TEST_ASSERT(*params->p_ucna_addr == FIRST_UCNA_ADDR, + "First injected UCNAs should get exposed via registers."); + TEST_ASSERT(*params->p_ucna_addr2 == SECOND_UCNA_ADDR, + "Second injected UCNAs should get exposed via registers."); + + printf("Test successful.\n" + "UCNA CMCI interrupts received: %ld\n" + "Last UCNA address received via CMCI: %lx\n" + "First UCNA address in vCPU thread: %lx\n" + "Second UCNA address in vCPU thread: %lx\n", + *params->p_i_ucna_rcvd, *params->p_i_ucna_addr, + *params->p_ucna_addr, *params->p_ucna_addr2); +} + +static void setup_mce_cap(struct kvm_vcpu *vcpu, bool enable_cmci_p) +{ + uint64_t mcg_caps = MCG_CTL_P | MCG_SER_P | MCG_LMCE_P | KVM_MAX_MCE_BANKS; + if (enable_cmci_p) + mcg_caps |= MCG_CMCI_P; + + mcg_caps &= supported_mcg_caps | MCG_CAP_BANKS_MASK; + vcpu_ioctl(vcpu, KVM_X86_SETUP_MCE, &mcg_caps); +} + +static struct kvm_vcpu *create_vcpu_with_mce_cap(struct kvm_vm *vm, uint32_t vcpuid, + bool enable_cmci_p, void *guest_code) +{ + struct kvm_vcpu *vcpu = vm_vcpu_add(vm, vcpuid, 
guest_code); + setup_mce_cap(vcpu, enable_cmci_p); + return vcpu; +} + +int main(int argc, char *argv[]) +{ + struct thread_params params; + struct kvm_vm *vm; + struct kvm_vcpu *ucna_vcpu; + struct kvm_vcpu *cmcidis_vcpu; + struct kvm_vcpu *cmci_vcpu; + + kvm_check_cap(KVM_CAP_MCE); + + vm = __vm_create(VM_MODE_DEFAULT, 3, 0); + + kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, + &supported_mcg_caps); + + if (!(supported_mcg_caps & MCG_CMCI_P)) { + print_skip("MCG_CMCI_P is not supported"); + exit(KSFT_SKIP); + } + + ucna_vcpu = create_vcpu_with_mce_cap(vm, 0, true, ucna_injection_guest_code); + cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code); + cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(ucna_vcpu); + vcpu_init_descriptor_tables(cmcidis_vcpu); + vcpu_init_descriptor_tables(cmci_vcpu); + vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + test_ucna_injection(ucna_vcpu, ¶ms); + run_vcpu_expect_gp(cmcidis_vcpu); + run_vcpu_expect_gp(cmci_vcpu); + + kvm_vm_free(vm); +} -- cgit v1.2.3 From 4b88b1a518b337de1252b8180519ca4c00015c9e Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Thu, 23 Jun 2022 17:45:11 +0800 Subject: KVM: selftests: Enhance handling WRMSR ICR register in x2APIC mode Hardware would directly write x2APIC ICR register instead of software emulation in some circumstances, e.g when Intel IPI virtualization is enabled. This behavior requires normal reserved bits checking to ensure them input as zero, otherwise it will cause #GP. So we need mask out those reserved bits from the data written to vICR register. Remove Delivery Status bit emulation in test case as this flag is invalid and not needed in x2APIC mode. KVM may ignore clearing it during interrupt dispatch which will lead to fake test failure. Opportunistically correct vector number for test sending IPI to non-existent vCPUs. Signed-off-by: Zeng Guang Message-Id: <20220623094511.26066-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/x86_64/xapic_state_test.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 5c5dc7bbb4e2..87531623064f 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -71,13 +71,27 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val) vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; - if (!x->is_x2apic) + if (!x->is_x2apic) { val &= (-1u | (0xffull << (32 + 24))); - ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); + ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); + } else { + ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); + } } +#define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ + GENMASK_ULL(17,16) | \ + GENMASK_ULL(13,13)) + static void __test_icr(struct xapic_vcpu *x, uint64_t val) { + if (x->is_x2apic) { + /* Hardware writing vICR register requires reserved bits 31:20, + * 17:16 and 13 kept as zero to avoid #GP exception. Data value + * written to vICR should mask out those bits above. 
+ */ + val &= ~X2APIC_RSVED_BITS_MASK; + } ____test_icr(x, val | APIC_ICR_BUSY); ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); } @@ -102,7 +116,7 @@ static void test_icr(struct xapic_vcpu *x) icr = APIC_INT_ASSERT | 0xff; for (i = vcpu->id + 1; i < 0xff; i++) { for (j = 0; j < 8; j++) - __test_icr(x, i << (32 + 24) | APIC_INT_ASSERT | (j << 8)); + __test_icr(x, i << (32 + 24) | icr | (j << 8)); } /* And again with a shorthand destination for all types of IPIs. */ -- cgit v1.2.3 From 0a2ff7cc8ad48a86939a91bd3457f38e59e741a1 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 21 Jun 2022 15:49:19 +0800 Subject: Bonding: add per-port priority for failover re-selection Add per port priority support for bonding active slave re-selection during failover. A higher number means higher priority in selection. The primary slave still has the highest priority. This option also follows the primary_reselect rules. This option could only be configured via netlink. Signed-off-by: Hangbin Liu Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 11 +++++++++++ drivers/net/bonding/bond_main.c | 27 +++++++++++++++++++++++++++ drivers/net/bonding/bond_netlink.c | 15 +++++++++++++++ drivers/net/bonding/bond_options.c | 33 +++++++++++++++++++++++++++++++++ include/net/bond_options.h | 1 + include/net/bonding.h | 1 + include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 8 files changed, 90 insertions(+) (limited to 'tools') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 43be3782e5df..53a18ff7cf23 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -780,6 +780,17 @@ peer_notif_delay value is 0 which means to match the value of the link monitor interval. +prio + Slave priority. A higher number means higher priority. + The primary slave has the highest priority. This option also + follows the primary_reselect rules. + + This option could only be configured via netlink, and is only valid + for active-backup(1), balance-tlb (5) and balance-alb (6) mode. + The valid value range is a signed 32 bit integer. + + The default value is 0. + primary A string (eth0, eth2, etc) specifying which slave is the diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index dc3e9a06e1aa..e75acb14d066 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1026,12 +1026,38 @@ out: } +/** + * bond_choose_primary_or_current - select the primary or high priority slave + * @bond: our bonding struct + * + * - Check if there is a primary link. If the primary link was set and is up, + * go on and do link reselection. + * + * - If primary link is not set or down, find the highest priority link. + * If the highest priority link is not current slave, set it as primary + * link and do link reselection. 
+ */ static struct slave *bond_choose_primary_or_current(struct bonding *bond) { struct slave *prim = rtnl_dereference(bond->primary_slave); struct slave *curr = rtnl_dereference(bond->curr_active_slave); + struct slave *slave, *hprio = NULL; + struct list_head *iter; if (!prim || prim->link != BOND_LINK_UP) { + bond_for_each_slave(bond, slave, iter) { + if (slave->link == BOND_LINK_UP) { + hprio = hprio ?: slave; + if (slave->prio > hprio->prio) + hprio = slave; + } + } + + if (hprio && hprio != curr) { + prim = hprio; + goto link_reselect; + } + if (!curr || curr->link != BOND_LINK_UP) return NULL; return curr; @@ -1042,6 +1068,7 @@ static struct slave *bond_choose_primary_or_current(struct bonding *bond) return prim; } +link_reselect: if (!curr || curr->link != BOND_LINK_UP) return prim; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 5a6f44455b95..c2d080fc4fc4 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -27,6 +27,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev, nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_AGGREGATOR_ID */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ + nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ 0; } @@ -53,6 +54,9 @@ static int bond_fill_slave_info(struct sk_buff *skb, if (nla_put_u16(skb, IFLA_BOND_SLAVE_QUEUE_ID, slave->queue_id)) goto nla_put_failure; + if (nla_put_s32(skb, IFLA_BOND_SLAVE_PRIO, slave->prio)) + goto nla_put_failure; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { const struct aggregator *agg; const struct port *ad_port; @@ -117,6 +121,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, + [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], @@ -157,6 +162,16 @@ static int bond_slave_changelink(struct net_device *bond_dev, return err; } + if (data[IFLA_BOND_SLAVE_PRIO]) { + int prio = nla_get_s32(data[IFLA_BOND_SLAVE_PRIO]); + + bond_opt_slave_initval(&newval, &slave_dev, prio); + err = __bond_opt_set(bond, BOND_OPT_PRIO, &newval, + data[IFLA_BOND_SLAVE_PRIO], extack); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 96eef19cffc4..3498db1c1b3c 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -40,6 +40,8 @@ static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_all_targets_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_prio_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_primary_reselect_set(struct bonding *bond, @@ -365,6 +367,16 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_intmax_tbl, .set = bond_option_miimon_set }, + [BOND_OPT_PRIO] = { + .id = BOND_OPT_PRIO, + .name = "prio", + .desc = "Link priority for failover re-selection", + .flags = BOND_OPTFLAG_RAWVAL, + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | + BIT(BOND_MODE_TLB) | + BIT(BOND_MODE_ALB)), + .set = 
bond_option_prio_set + }, [BOND_OPT_PRIMARY] = { .id = BOND_OPT_PRIMARY, .name = "primary", @@ -1306,6 +1318,27 @@ static int bond_option_missed_max_set(struct bonding *bond, return 0; } +static int bond_option_prio_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + struct slave *slave; + + slave = bond_slave_get_rtnl(newval->slave_dev); + if (!slave) { + netdev_dbg(newval->slave_dev, "%s called on NULL slave\n", __func__); + return -ENODEV; + } + slave->prio = newval->value; + + if (rtnl_dereference(bond->primary_slave)) + slave_warn(bond->dev, slave->dev, + "prio updated, but will not affect failover re-selection as primary slave have been set\n"); + else + bond_select_active_slave(bond); + + return 0; +} + static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/include/net/bond_options.h b/include/net/bond_options.h index eade8236a4df..d2aea5cf1e41 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -67,6 +67,7 @@ enum { BOND_OPT_LACP_ACTIVE, BOND_OPT_MISSED_MAX, BOND_OPT_NS_TARGETS, + BOND_OPT_PRIO, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index cb904d356e31..6e78d657aa05 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -178,6 +178,7 @@ struct slave { u32 speed; u16 queue_id; u8 perm_hwaddr[MAX_ADDR_LEN]; + int prio; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5f58dcfe2787..e36d9d2c65a7 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -963,6 +963,7 @@ enum { IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, __IFLA_BOND_SLAVE_MAX, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index b339bf2196ca..0242f31e339c 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -890,6 +890,7 @@ enum { IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From 41188e9e9defa1678abbf860ad7f6dd1ba48ad1c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Jun 2022 05:06:13 +0300 Subject: selftest/bpf: Test for use-after-free bug fix in inline_bpf_loop This test verifies that bpf_loop() inlining works as expected when address of `env->prog` is updated. This address is updated upon BPF program reallocation. Reallocation is handled by bpf_prog_realloc(), which reuses old memory if page boundary is not crossed. The value of `len` in the test is chosen to cross this boundary on bpf_loop() patching. Verify that the use-after-free bug in inline_bpf_loop() reported by Dan Carpenter is fixed. 
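The reallocation condition the test aims at can be pictured with a small userspace sketch; would_move() below is a hypothetical helper for illustration, not the kernel's bpf_prog_realloc():

	#include <stdbool.h>
	#include <stddef.h>
	#include <unistd.h>

	/* A BPF instruction is 8 bytes wide. A program only has to move
	 * to new memory when patching grows it past the pages that
	 * already back it, which is the situation this test provokes. */
	static bool would_move(size_t old_insn_cnt, size_t new_insn_cnt)
	{
		size_t page = (size_t)sysconf(_SC_PAGESIZE);
		size_t old_pages = (old_insn_cnt * 8 + page - 1) / page;
		size_t new_pages = (new_insn_cnt * 8 + page - 1) / page;

		return new_pages > old_pages;
	}

With `len` picked just short of that limit, patching the bpf_loop() call grows the program across a page boundary, the allocation moves, and any stale `env->prog` pointer becomes a use-after-free.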
Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220624020613.548108-3-eddyz87@gmail.com --- tools/testing/selftests/bpf/test_verifier.c | 39 ++++++++++++++++++++++ .../selftests/bpf/verifier/bpf_loop_inline.c | 11 ++++++ 2 files changed, 50 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 7fe897c66d81..f9d553fbf68a 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -425,6 +425,45 @@ static void bpf_fill_torturous_jumps(struct bpf_test *self) } } +static void bpf_fill_big_prog_with_loop_1(struct bpf_test *self) +{ + struct bpf_insn *insn = self->fill_insns; + /* This test was added to catch a specific use after free + * error, which happened upon BPF program reallocation. + * Reallocation is handled by core.c:bpf_prog_realloc, which + * reuses old memory if page boundary is not crossed. The + * value of `len` is chosen to cross this boundary on bpf_loop + * patching. + */ + const int len = getpagesize() - 25; + int callback_load_idx; + int callback_idx; + int i = 0; + + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1); + callback_load_idx = i; + insn[i++] = BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, + BPF_REG_2, BPF_PSEUDO_FUNC, 0, + 777 /* filled below */); + insn[i++] = BPF_RAW_INSN(0, 0, 0, 0, 0); + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0); + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop); + + while (i < len - 3) + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0); + insn[i++] = BPF_EXIT_INSN(); + + callback_idx = i; + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0); + insn[i++] = BPF_EXIT_INSN(); + + insn[callback_load_idx].imm = callback_idx - callback_load_idx - 1; + self->func_info[1].insn_off = callback_idx; + self->prog_len = i; + assert(i == len); +} + /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */ #define BPF_SK_LOOKUP(func) \ /* struct bpf_sock_tuple tuple = {} */ \ diff --git a/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c index 232da07c93b5..2d0023659d88 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c +++ b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c @@ -244,6 +244,17 @@ .func_info_cnt = 3, BTF_TYPES }, +{ + "inline bpf_loop call in a big program", + .insns = {}, + .fill_helper = bpf_fill_big_prog_with_loop_1, + .expected_insns = { PSEUDO_CALL_INSN() }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .result = ACCEPT, + .func_info = { { 0, MAIN_TYPE }, { 16, CALLBACK_TYPE } }, + .func_info_cnt = 2, + BTF_TYPES +}, #undef HELPER_CALL_INSN #undef PSEUDO_CALL_INSN -- cgit v1.2.3 From 9ab95b0b15a092abb7c3309f3580f572a041f9ab Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:12 +0530 Subject: perf record ibs: Warn about sampling period skew Samples without an L3 miss are discarded and counter is reset with random value (between 1-15 for fetch PMU and 1-127 for op PMU) when IBS L3 miss filtering is enabled. This causes a sampling period skew but there is no way to reconstruct aggregated sampling period. So print a warning at perf record if user sets l3missonly=1. Ex: # perf record -c 10000 -C 0 -e ibs_op/l3missonly=1/ WARNING: Hw internally resets sampling period when L3 Miss Filtering is enabled and tagged operation does not cause L3 Miss. 
This causes sampling period skew. Signed-off-by: Ravi Bangoria Acked-by: Ian Rogers Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: http://lore.kernel.org/lkml/20220604044519.594-2-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/evsel.c | 52 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/evsel.c | 7 ++++++ tools/perf/util/evsel.h | 1 + 3 files changed, 60 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c index 3501399cef35..882c1a8c1ded 100644 --- a/tools/perf/arch/x86/util/evsel.c +++ b/tools/perf/arch/x86/util/evsel.c @@ -6,6 +6,10 @@ #include "util/pmu.h" #include "linux/string.h" #include "evsel.h" +#include "util/debug.h" + +#define IBS_FETCH_L3MISSONLY (1ULL << 59) +#define IBS_OP_L3MISSONLY (1ULL << 16) void arch_evsel__set_sample_weight(struct evsel *evsel) { @@ -61,3 +65,51 @@ bool arch_evsel__must_be_in_group(const struct evsel *evsel) (strcasestr(evsel->name, "slots") || strcasestr(evsel->name, "topdown")); } + +static void ibs_l3miss_warn(void) +{ + pr_warning( +"WARNING: Hw internally resets sampling period when L3 Miss Filtering is enabled\n" +"and tagged operation does not cause L3 Miss. This causes sampling period skew.\n"); +} + +void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr) +{ + struct perf_pmu *evsel_pmu, *ibs_fetch_pmu, *ibs_op_pmu; + static int warned_once; + /* 0: Uninitialized, 1: Yes, -1: No */ + static int is_amd; + + if (warned_once || is_amd == -1) + return; + + if (!is_amd) { + struct perf_env *env = evsel__env(evsel); + + if (!perf_env__cpuid(env) || !env->cpuid || + !strstarts(env->cpuid, "AuthenticAMD")) { + is_amd = -1; + return; + } + is_amd = 1; + } + + evsel_pmu = evsel__find_pmu(evsel); + if (!evsel_pmu) + return; + + ibs_fetch_pmu = perf_pmu__find("ibs_fetch"); + ibs_op_pmu = perf_pmu__find("ibs_op"); + + if (ibs_fetch_pmu && ibs_fetch_pmu->type == evsel_pmu->type) { + if (attr->config & IBS_FETCH_L3MISSONLY) { + ibs_l3miss_warn(); + warned_once = 1; + } + } else if (ibs_op_pmu && ibs_op_pmu->type == evsel_pmu->type) { + if (attr->config & IBS_OP_L3MISSONLY) { + ibs_l3miss_warn(); + warned_once = 1; + } + } +} diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ce499c5da8d7..8fea51a9cd90 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1091,6 +1091,11 @@ void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_un { } +void __weak arch__post_evsel_config(struct evsel *evsel __maybe_unused, + struct perf_event_attr *attr __maybe_unused) +{ +} + static void evsel__set_default_freq_period(struct record_opts *opts, struct perf_event_attr *attr) { @@ -1366,6 +1371,8 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts, */ if (evsel__is_dummy_event(evsel)) evsel__reset_sample_bit(evsel, BRANCH_STACK); + + arch__post_evsel_config(evsel, attr); } int evsel__set_filter(struct evsel *evsel, const char *filter) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 73ea48e94079..92bed8e2f7d8 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -297,6 +297,7 @@ void evsel__set_sample_id(struct evsel *evsel, bool 
use_sample_identifier); void arch_evsel__set_sample_weight(struct evsel *evsel); void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr); +void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr); int evsel__set_filter(struct evsel *evsel, const char *filter); int evsel__append_tp_filter(struct evsel *evsel, const char *filter); -- cgit v1.2.3 From 3339ec44be7f9963c82d5d21e163f16f96e5ca58 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:13 +0530 Subject: perf pmu: Parse pmu caps sysfs only once In addition to returning nr_caps, cache it locally in struct perf_pmu. Similarly, cache status of whether caps sysfs has already been parsed or not. These will help to avoid parsing sysfs every time the function gets called. Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-3-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 15 +++++++++++---- tools/perf/util/pmu.h | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 9a1c7e63e663..0112e1c36418 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1890,7 +1890,11 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) const char *sysfs = sysfs__mountpoint(); DIR *caps_dir; struct dirent *evt_ent; - int nr_caps = 0; + + if (pmu->caps_initialized) + return pmu->nr_caps; + + pmu->nr_caps = 0; if (!sysfs) return -1; @@ -1898,8 +1902,10 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) snprintf(caps_path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH "%s/caps", sysfs, pmu->name); - if (stat(caps_path, &st) < 0) + if (stat(caps_path, &st) < 0) { + pmu->caps_initialized = true; return 0; /* no error if caps does not exist */ + } caps_dir = opendir(caps_path); if (!caps_dir) @@ -1926,13 +1932,14 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) continue; } - nr_caps++; + pmu->nr_caps++; fclose(file); } closedir(caps_dir); - return nr_caps; + pmu->caps_initialized = true; + return pmu->nr_caps; } void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 541889fa9f9c..4b45fd8da5a3 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -46,6 +46,8 @@ struct perf_pmu { struct perf_cpu_map *cpus; struct list_head format; /* HEAD struct perf_pmu_format -> list */ struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */ + bool caps_initialized; + u32 nr_caps; struct list_head caps; /* HEAD struct perf_pmu_caps -> list */ struct list_head list; /* ELEM */ struct list_head hybrid_list; -- cgit v1.2.3 From 2a12bef413bb278db202ecb6adbd9c18dec4b260 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:14 +0530 Subject: perf header: Pass "cpu" pmu name while printing caps Avoid unnecessary conditional code to check if pmu name is NULL or not by passing "cpu" pmu name to the printing function. 
Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-4-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index de5b7a023e9e..8c0b85e3851c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2058,17 +2058,11 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, char *str, buf[128]; if (!nr_caps) { - if (!pmu_name) - fprintf(fp, "# cpu pmu capabilities: not available\n"); - else - fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); + fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); return; } - if (!pmu_name) - scnprintf(buf, sizeof(buf), "# cpu pmu capabilities: "); - else - scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); + scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); delimiter = buf; @@ -2085,7 +2079,7 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) { print_per_cpu_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, - ff->ph->env.cpu_pmu_caps, NULL); + ff->ph->env.cpu_pmu_caps, (char *)"cpu"); } static void print_hybrid_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) -- cgit v1.2.3 From ff34eaa820231dfaecca9c048637bd86ba294bc7 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:15 +0530 Subject: perf header: Store PMU caps in an array of strings Currently all capabilities are stored in a single string separated by NULL character. Instead, store them in an array which makes searching of capability easier. 
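The layout change boils down to the following toy example (the capability strings are made up for illustration):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* old: one buffer, "name=value" entries separated by '\0' */
		const char packed[] = "branches=32\0max_precise=3";
		const char *p = packed;
		int nr = 2;

		while (nr--) {		/* lookup must walk every entry */
			printf("%s\n", p);
			p += strlen(p) + 1;
		}

		/* new: an array of strings, directly indexable */
		const char *caps[] = { "branches=32", "max_precise=3" };

		for (int i = 0; i < 2; i++)
			printf("%s\n", caps[i]);

		return 0;
	}

Finding one capability in the packed form means stepping over all preceding strings; with the array it is a simple bounded loop, which the lookup helper added later in this series relies on.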
Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-5-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/env.c | 6 ++++- tools/perf/util/env.h | 4 +-- tools/perf/util/header.c | 63 ++++++++++++++++++++++++++---------------------- 3 files changed, 41 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 579e44c59914..7d3aeb2e4622 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -179,7 +179,7 @@ static void perf_env__purge_bpf(struct perf_env *env __maybe_unused) void perf_env__exit(struct perf_env *env) { - int i; + int i, j; perf_env__purge_bpf(env); perf_env__purge_cgroups(env); @@ -196,6 +196,8 @@ void perf_env__exit(struct perf_env *env) zfree(&env->sibling_threads); zfree(&env->pmu_mappings); zfree(&env->cpu); + for (i = 0; i < env->nr_cpu_pmu_caps; i++) + zfree(&env->cpu_pmu_caps[i]); zfree(&env->cpu_pmu_caps); zfree(&env->numa_map); @@ -218,6 +220,8 @@ void perf_env__exit(struct perf_env *env) zfree(&env->hybrid_nodes); for (i = 0; i < env->nr_hybrid_cpc_nodes; i++) { + for (j = 0; j < env->hybrid_cpc_nodes[i].nr_cpu_pmu_caps; j++) + zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps[j]); zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps); zfree(&env->hybrid_cpc_nodes[i].pmu_name); } diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index a3541f98e1fc..43aab59f7322 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -46,7 +46,7 @@ struct hybrid_node { struct hybrid_cpc_node { int nr_cpu_pmu_caps; unsigned int max_branches; - char *cpu_pmu_caps; + char **cpu_pmu_caps; char *pmu_name; }; @@ -81,7 +81,7 @@ struct perf_env { char *sibling_dies; char *sibling_threads; char *pmu_mappings; - char *cpu_pmu_caps; + char **cpu_pmu_caps; struct cpu_topology_map *cpu; struct cpu_cache_level *caches; int caches_cnt; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 8c0b85e3851c..c9d1f764aa68 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2051,26 +2051,21 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } -static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, +static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, char *pmu_name) { - const char *delimiter; - char *str, buf[128]; + const char *delimiter = ""; + int i; if (!nr_caps) { fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); return; } - scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); - - delimiter = buf; - - str = cpu_pmu_caps; - while (nr_caps--) { - fprintf(fp, "%s%s", delimiter, str); + fprintf(fp, "# %s pmu capabilities: ", pmu_name); + for (i = 0; i < nr_caps; i++) { + fprintf(fp, "%s%s", delimiter, cpu_pmu_caps[i]); delimiter = ", "; - str += strlen(str) + 1; } fprintf(fp, "\n"); @@ -3202,27 +3197,26 @@ static int process_compressed(struct feat_fd *ff, } static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, - char **cpu_pmu_caps, + char ***cpu_pmu_caps, unsigned int *max_branches) { - char *name, *value; - struct 
strbuf sb; - u32 nr_caps; + char *name, *value, *ptr; + u32 nr_caps, i; + + *nr_cpu_pmu_caps = 0; + *cpu_pmu_caps = NULL; if (do_read_u32(ff, &nr_caps)) return -1; - if (!nr_caps) { - pr_debug("cpu pmu capabilities not available\n"); + if (!nr_caps) return 0; - } - - *nr_cpu_pmu_caps = nr_caps; - if (strbuf_init(&sb, 128) < 0) + *cpu_pmu_caps = zalloc(sizeof(char *) * nr_caps); + if (!*cpu_pmu_caps) return -1; - while (nr_caps--) { + for (i = 0; i < nr_caps; i++) { name = do_read_string(ff); if (!name) goto error; @@ -3231,12 +3225,10 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, if (!value) goto free_name; - if (strbuf_addf(&sb, "%s=%s", name, value) < 0) + if (asprintf(&ptr, "%s=%s", name, value) < 0) goto free_value; - /* include a NULL character at the end */ - if (strbuf_add(&sb, "", 1) < 0) - goto free_value; + (*cpu_pmu_caps)[i] = ptr; if (!strcmp(name, "branches")) *max_branches = atoi(value); @@ -3244,7 +3236,7 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, free(value); free(name); } - *cpu_pmu_caps = strbuf_detach(&sb, NULL); + *nr_cpu_pmu_caps = nr_caps; return 0; free_value: @@ -3252,16 +3244,24 @@ free_value: free_name: free(name); error: - strbuf_release(&sb); + for (; i > 0; i--) + free((*cpu_pmu_caps)[i - 1]); + free(*cpu_pmu_caps); + *cpu_pmu_caps = NULL; + *nr_cpu_pmu_caps = 0; return -1; } static int process_cpu_pmu_caps(struct feat_fd *ff, void *data __maybe_unused) { - return process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, + int ret = process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, &ff->ph->env.cpu_pmu_caps, &ff->ph->env.max_branches); + + if (!ret && !ff->ph->env.cpu_pmu_caps) + pr_debug("cpu pmu capabilities not available\n"); + return ret; } static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, @@ -3270,6 +3270,7 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, struct hybrid_cpc_node *nodes; u32 nr_pmu, i; int ret; + int j; if (do_read_u32(ff, &nr_pmu)) return -1; @@ -3297,6 +3298,8 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, ret = -1; goto err; } + if (!n->nr_cpu_pmu_caps) + pr_debug("%s pmu capabilities not available\n", n->pmu_name); } ff->ph->env.nr_hybrid_cpc_nodes = nr_pmu; @@ -3305,6 +3308,8 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, err: for (i = 0; i < nr_pmu; i++) { + for (j = 0; j < nodes[i].nr_cpu_pmu_caps; j++) + free(nodes[i].cpu_pmu_caps[j]); free(nodes[i].cpu_pmu_caps); free(nodes[i].pmu_name); } -- cgit v1.2.3 From 2139f7424819818afc25fbfd0effda4babe235f6 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:16 +0530 Subject: perf header: Record non-CPU PMU capabilities PMUs advertise their capabilities via sysfs attribute files but the perf tool currently parses only core(CPU) or hybrid core PMU capabilities. Add support of recording non-core PMU capabilities int perf.data header. Note that a newly proposed HEADER_PMU_CAPS is replacing existing HEADER_HYBRID_CPU_PMU_CAPS. Special care is taken for hybrid core PMUs by writing their capabilities first in the perf.data header to make sure new perf.data file being read by old perf tool does not break. 
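Once recorded, tool-side code can query a capability of an arbitrary PMU from the parsed header. A minimal usage sketch of the perf_env__find_pmu_cap() helper added below (the PMU and capability names are only examples, and `env` is assumed to point at a perf_env populated from a perf.data file):

	const char *val = perf_env__find_pmu_cap(env, "ibs_op", "zen4_ibs_extensions");

	if (val)
		pr_debug("ibs_op advertises zen4_ibs_extensions=%s\n", val);

The AMD IBS raw-dump patch later in this series performs exactly this lookup to detect Zen 4 extensions.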
Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-6-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf.data-file-format.txt | 10 +- tools/perf/builtin-inject.c | 2 +- tools/perf/util/env.c | 60 +++++++- tools/perf/util/env.h | 12 +- tools/perf/util/header.c | 160 ++++++++++++--------- tools/perf/util/header.h | 2 +- 6 files changed, 158 insertions(+), 88 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index bc9f0aa113d8..635ba043fd7d 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -419,18 +419,20 @@ Example: cpu_core cpu list : 0-15 cpu_atom cpu list : 16-23 - HEADER_HYBRID_CPU_PMU_CAPS = 31, + HEADER_PMU_CAPS = 31, - A list of hybrid CPU PMU capabilities. + List of pmu capabilities (except cpu pmu which is already + covered by HEADER_CPU_PMU_CAPS). Note that hybrid cpu pmu + capabilities are also stored here. struct { u32 nr_pmu; struct { - u32 nr_cpu_pmu_caps; + u32 nr_caps; { char name[]; char value[]; - } [nr_cpu_pmu_caps]; + } [nr_caps]; char pmu_name[]; } [nr_pmu]; }; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 42e2918fd1cc..b9022d408def 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -809,7 +809,7 @@ static bool keep_feat(int feat) case HEADER_CPU_PMU_CAPS: case HEADER_CLOCK_DATA: case HEADER_HYBRID_TOPOLOGY: - case HEADER_HYBRID_CPU_PMU_CAPS: + case HEADER_PMU_CAPS: return true; /* Information that can be updated */ case HEADER_BUILD_ID: diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 7d3aeb2e4622..5b8cf6a421a4 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -219,13 +219,13 @@ void perf_env__exit(struct perf_env *env) } zfree(&env->hybrid_nodes); - for (i = 0; i < env->nr_hybrid_cpc_nodes; i++) { - for (j = 0; j < env->hybrid_cpc_nodes[i].nr_cpu_pmu_caps; j++) - zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps[j]); - zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps); - zfree(&env->hybrid_cpc_nodes[i].pmu_name); + for (i = 0; i < env->nr_pmus_with_caps; i++) { + for (j = 0; j < env->pmu_caps[i].nr_caps; j++) + zfree(&env->pmu_caps[i].caps[j]); + zfree(&env->pmu_caps[i].caps); + zfree(&env->pmu_caps[i].pmu_name); } - zfree(&env->hybrid_cpc_nodes); + zfree(&env->pmu_caps); } void perf_env__init(struct perf_env *env) @@ -531,3 +531,51 @@ int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu) return cpu.cpu >= 0 && cpu.cpu < env->nr_numa_map ? 
env->numa_map[cpu.cpu] : -1; } + +char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name, + const char *cap) +{ + char *cap_eq; + int cap_size; + char **ptr; + int i, j; + + if (!pmu_name || !cap) + return NULL; + + cap_size = strlen(cap); + cap_eq = zalloc(cap_size + 2); + if (!cap_eq) + return NULL; + + memcpy(cap_eq, cap, cap_size); + cap_eq[cap_size] = '='; + + if (!strcmp(pmu_name, "cpu")) { + for (i = 0; i < env->nr_cpu_pmu_caps; i++) { + if (!strncmp(env->cpu_pmu_caps[i], cap_eq, cap_size + 1)) { + free(cap_eq); + return &env->cpu_pmu_caps[i][cap_size + 1]; + } + } + goto out; + } + + for (i = 0; i < env->nr_pmus_with_caps; i++) { + if (strcmp(env->pmu_caps[i].pmu_name, pmu_name)) + continue; + + ptr = env->pmu_caps[i].caps; + + for (j = 0; j < env->pmu_caps[i].nr_caps; j++) { + if (!strncmp(ptr[j], cap_eq, cap_size + 1)) { + free(cap_eq); + return &ptr[j][cap_size + 1]; + } + } + } + +out: + free(cap_eq); + return NULL; +} diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 43aab59f7322..4566c51f2fd9 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -43,10 +43,10 @@ struct hybrid_node { char *cpus; }; -struct hybrid_cpc_node { - int nr_cpu_pmu_caps; +struct pmu_caps { + int nr_caps; unsigned int max_branches; - char **cpu_pmu_caps; + char **caps; char *pmu_name; }; @@ -74,7 +74,7 @@ struct perf_env { int nr_groups; int nr_cpu_pmu_caps; int nr_hybrid_nodes; - int nr_hybrid_cpc_nodes; + int nr_pmus_with_caps; char *cmdline; const char **cmdline_argv; char *sibling_cores; @@ -94,7 +94,7 @@ struct perf_env { struct memory_node *memory_nodes; unsigned long long memory_bsize; struct hybrid_node *hybrid_nodes; - struct hybrid_cpc_node *hybrid_cpc_nodes; + struct pmu_caps *pmu_caps; #ifdef HAVE_LIBBPF_SUPPORT /* * bpf_info_lock protects bpf rbtrees. 
This is needed because the @@ -172,4 +172,6 @@ bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id); int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu); +char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name, + const char *cap); #endif /* __PERF_ENV_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index c9d1f764aa68..511478429463 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1512,18 +1512,13 @@ static int write_compressed(struct feat_fd *ff __maybe_unused, return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len)); } -static int write_per_cpu_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu, - bool write_pmu) +static int __write_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu, + bool write_pmu) { struct perf_pmu_caps *caps = NULL; - int nr_caps; int ret; - nr_caps = perf_pmu__caps_parse(pmu); - if (nr_caps < 0) - return nr_caps; - - ret = do_write(ff, &nr_caps, sizeof(nr_caps)); + ret = do_write(ff, &pmu->nr_caps, sizeof(pmu->nr_caps)); if (ret < 0) return ret; @@ -1550,33 +1545,60 @@ static int write_cpu_pmu_caps(struct feat_fd *ff, struct evlist *evlist __maybe_unused) { struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); + int ret; if (!cpu_pmu) return -ENOENT; - return write_per_cpu_pmu_caps(ff, cpu_pmu, false); + ret = perf_pmu__caps_parse(cpu_pmu); + if (ret < 0) + return ret; + + return __write_pmu_caps(ff, cpu_pmu, false); } -static int write_hybrid_cpu_pmu_caps(struct feat_fd *ff, - struct evlist *evlist __maybe_unused) +static int write_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) { - struct perf_pmu *pmu; - u32 nr_pmu = perf_pmu__hybrid_pmu_num(); + struct perf_pmu *pmu = NULL; + int nr_pmu = 0; int ret; - if (nr_pmu == 0) - return -ENOENT; + while ((pmu = perf_pmu__scan(pmu))) { + if (!pmu->name || !strcmp(pmu->name, "cpu") || + perf_pmu__caps_parse(pmu) <= 0) + continue; + nr_pmu++; + } ret = do_write(ff, &nr_pmu, sizeof(nr_pmu)); if (ret < 0) return ret; + if (!nr_pmu) + return 0; + + /* + * Write hybrid pmu caps first to maintain compatibility with + * older perf tool. 
+ */ + pmu = NULL; perf_pmu__for_each_hybrid_pmu(pmu) { - ret = write_per_cpu_pmu_caps(ff, pmu, true); + ret = __write_pmu_caps(ff, pmu, true); if (ret < 0) return ret; } + pmu = NULL; + while ((pmu = perf_pmu__scan(pmu))) { + if (!pmu->name || !strcmp(pmu->name, "cpu") || + !pmu->nr_caps || perf_pmu__is_hybrid(pmu->name)) + continue; + + ret = __write_pmu_caps(ff, pmu, true); + if (ret < 0) + return ret; + } return 0; } @@ -2051,8 +2073,7 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } -static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, - char *pmu_name) +static void __print_pmu_caps(FILE *fp, int nr_caps, char **caps, char *pmu_name) { const char *delimiter = ""; int i; @@ -2064,7 +2085,7 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, fprintf(fp, "# %s pmu capabilities: ", pmu_name); for (i = 0; i < nr_caps; i++) { - fprintf(fp, "%s%s", delimiter, cpu_pmu_caps[i]); + fprintf(fp, "%s%s", delimiter, caps[i]); delimiter = ", "; } @@ -2073,19 +2094,18 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) { - print_per_cpu_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, - ff->ph->env.cpu_pmu_caps, (char *)"cpu"); + __print_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, + ff->ph->env.cpu_pmu_caps, (char *)"cpu"); } -static void print_hybrid_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +static void print_pmu_caps(struct feat_fd *ff, FILE *fp) { - struct hybrid_cpc_node *n; + struct pmu_caps *pmu_caps; - for (int i = 0; i < ff->ph->env.nr_hybrid_cpc_nodes; i++) { - n = &ff->ph->env.hybrid_cpc_nodes[i]; - print_per_cpu_pmu_caps(fp, n->nr_cpu_pmu_caps, - n->cpu_pmu_caps, - n->pmu_name); + for (int i = 0; i < ff->ph->env.nr_pmus_with_caps; i++) { + pmu_caps = &ff->ph->env.pmu_caps[i]; + __print_pmu_caps(fp, pmu_caps->nr_caps, pmu_caps->caps, + pmu_caps->pmu_name); } } @@ -3196,27 +3216,26 @@ static int process_compressed(struct feat_fd *ff, return 0; } -static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, - char ***cpu_pmu_caps, - unsigned int *max_branches) +static int __process_pmu_caps(struct feat_fd *ff, int *nr_caps, + char ***caps, unsigned int *max_branches) { char *name, *value, *ptr; - u32 nr_caps, i; + u32 nr_pmu_caps, i; - *nr_cpu_pmu_caps = 0; - *cpu_pmu_caps = NULL; + *nr_caps = 0; + *caps = NULL; - if (do_read_u32(ff, &nr_caps)) + if (do_read_u32(ff, &nr_pmu_caps)) return -1; - if (!nr_caps) + if (!nr_pmu_caps) return 0; - *cpu_pmu_caps = zalloc(sizeof(char *) * nr_caps); - if (!*cpu_pmu_caps) + *caps = zalloc(sizeof(char *) * nr_pmu_caps); + if (!*caps) return -1; - for (i = 0; i < nr_caps; i++) { + for (i = 0; i < nr_pmu_caps; i++) { name = do_read_string(ff); if (!name) goto error; @@ -3228,7 +3247,7 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, if (asprintf(&ptr, "%s=%s", name, value) < 0) goto free_value; - (*cpu_pmu_caps)[i] = ptr; + (*caps)[i] = ptr; if (!strcmp(name, "branches")) *max_branches = atoi(value); @@ -3236,7 +3255,7 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, free(value); free(name); } - *nr_cpu_pmu_caps = nr_caps; + *nr_caps = nr_pmu_caps; return 0; free_value: @@ -3245,29 +3264,28 @@ free_name: free(name); error: for (; i > 0; i--) - free((*cpu_pmu_caps)[i - 1]); - free(*cpu_pmu_caps); - *cpu_pmu_caps = NULL; - *nr_cpu_pmu_caps = 0; + free((*caps)[i - 1]); + 
free(*caps); + *caps = NULL; + *nr_caps = 0; return -1; } static int process_cpu_pmu_caps(struct feat_fd *ff, void *data __maybe_unused) { - int ret = process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, - &ff->ph->env.cpu_pmu_caps, - &ff->ph->env.max_branches); + int ret = __process_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, + &ff->ph->env.cpu_pmu_caps, + &ff->ph->env.max_branches); if (!ret && !ff->ph->env.cpu_pmu_caps) pr_debug("cpu pmu capabilities not available\n"); return ret; } -static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, - void *data __maybe_unused) +static int process_pmu_caps(struct feat_fd *ff, void *data __maybe_unused) { - struct hybrid_cpc_node *nodes; + struct pmu_caps *pmu_caps; u32 nr_pmu, i; int ret; int j; @@ -3276,45 +3294,45 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, return -1; if (!nr_pmu) { - pr_debug("hybrid cpu pmu capabilities not available\n"); + pr_debug("pmu capabilities not available\n"); return 0; } - nodes = zalloc(sizeof(*nodes) * nr_pmu); - if (!nodes) + pmu_caps = zalloc(sizeof(*pmu_caps) * nr_pmu); + if (!pmu_caps) return -ENOMEM; for (i = 0; i < nr_pmu; i++) { - struct hybrid_cpc_node *n = &nodes[i]; - - ret = process_per_cpu_pmu_caps(ff, &n->nr_cpu_pmu_caps, - &n->cpu_pmu_caps, - &n->max_branches); + ret = __process_pmu_caps(ff, &pmu_caps[i].nr_caps, + &pmu_caps[i].caps, + &pmu_caps[i].max_branches); if (ret) goto err; - n->pmu_name = do_read_string(ff); - if (!n->pmu_name) { + pmu_caps[i].pmu_name = do_read_string(ff); + if (!pmu_caps[i].pmu_name) { ret = -1; goto err; } - if (!n->nr_cpu_pmu_caps) - pr_debug("%s pmu capabilities not available\n", n->pmu_name); + if (!pmu_caps[i].nr_caps) { + pr_debug("%s pmu capabilities not available\n", + pmu_caps[i].pmu_name); + } } - ff->ph->env.nr_hybrid_cpc_nodes = nr_pmu; - ff->ph->env.hybrid_cpc_nodes = nodes; + ff->ph->env.nr_pmus_with_caps = nr_pmu; + ff->ph->env.pmu_caps = pmu_caps; return 0; err: for (i = 0; i < nr_pmu; i++) { - for (j = 0; j < nodes[i].nr_cpu_pmu_caps; j++) - free(nodes[i].cpu_pmu_caps[j]); - free(nodes[i].cpu_pmu_caps); - free(nodes[i].pmu_name); + for (j = 0; j < pmu_caps[i].nr_caps; j++) + free(pmu_caps[i].caps[j]); + free(pmu_caps[i].caps); + free(pmu_caps[i].pmu_name); } - free(nodes); + free(pmu_caps); return ret; } @@ -3380,7 +3398,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), FEAT_OPR(CLOCK_DATA, clock_data, false), FEAT_OPN(HYBRID_TOPOLOGY, hybrid_topology, true), - FEAT_OPR(HYBRID_CPU_PMU_CAPS, hybrid_cpu_pmu_caps, false), + FEAT_OPR(PMU_CAPS, pmu_caps, false), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 08563c1f1bff..5790bf556b90 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -46,7 +46,7 @@ enum { HEADER_CPU_PMU_CAPS, HEADER_CLOCK_DATA, HEADER_HYBRID_TOPOLOGY, - HEADER_HYBRID_CPU_PMU_CAPS, + HEADER_PMU_CAPS, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; -- cgit v1.2.3 From c1f4f92b7d5d3deeaae758a1a7ed263b1381dd1c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:18 +0530 Subject: perf tool ibs: Sync AMD IBS header file IBS support has been enhanced with two new features in an upcoming uarch: 1. DataSrc extension 2. L3 miss filtering. An additional set of bits has been introduced in IBS registers to exploit these features. The new bits are already defined in the arch/x86/ header. Sync it with the tools header file. 
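Note that the extended data source no longer fits the original 3-bit field; decoders stitch the two halves back together, as the raw-dump change at the end of this series does:

	/* recombining the split IBS data source, as in amd-sample-raw.c */
	int data_src = (reg.data_src_hi << 3) | reg.data_src_lo;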
Also rename existing ibs_op_data field 'data_src' to 'data_src_lo'. Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-8-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/amd-ibs.h | 16 ++++++++++------ tools/perf/util/amd-sample-raw.c | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 765e9e752d03..9a3312e12e2e 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -29,7 +29,10 @@ union ibs_fetch_ctl { rand_en:1, /* 57: random tagging enable */ fetch_l2_miss:1,/* 58: L2 miss for sampled fetch * (needs IbsFetchComp) */ - reserved:5; /* 59-63: reserved */ + l3_miss_only:1, /* 59: Collect L3 miss samples only */ + fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */ + fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */ + reserved:2; /* 62-63: reserved */ }; }; @@ -38,14 +41,14 @@ union ibs_op_ctl { __u64 val; struct { __u64 opmaxcnt:16, /* 0-15: periodic op max. count */ - reserved0:1, /* 16: reserved */ + l3_miss_only:1, /* 16: Collect L3 miss samples only */ op_en:1, /* 17: op sampling enable */ op_val:1, /* 18: op sample valid */ cnt_ctl:1, /* 19: periodic op counter control */ opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ - reserved1:5, /* 27-31: reserved */ + reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved2:5; /* 59-63: reserved */ + reserved1:5; /* 59-63: reserved */ }; }; @@ -71,11 +74,12 @@ union ibs_op_data { union ibs_op_data2 { __u64 val; struct { - __u64 data_src:3, /* 0-2: data source */ + __u64 data_src_lo:3, /* 0-2: data source low */ reserved0:1, /* 3: reserved */ rmt_node:1, /* 4: destination node */ cache_hit_st:1, /* 5: cache hit state */ - reserved1:57; /* 5-63: reserved */ + data_src_hi:2, /* 6-7: data source high */ + reserved1:56; /* 8-63: reserved */ }; }; diff --git a/tools/perf/util/amd-sample-raw.c b/tools/perf/util/amd-sample-raw.c index d19d765195c5..3b623ea6ee7e 100644 --- a/tools/perf/util/amd-sample-raw.c +++ b/tools/perf/util/amd-sample-raw.c @@ -98,9 +98,9 @@ static void pr_ibs_op_data2(union ibs_op_data2 reg) }; printf("ibs_op_data2:\t%016llx %sRmtNode %d%s\n", reg.val, - reg.data_src == 2 ? (reg.cache_hit_st ? "CacheHitSt 1=O-State " + reg.data_src_lo == 2 ? (reg.cache_hit_st ? "CacheHitSt 1=O-State " : "CacheHitSt 0=M-state ") : "", - reg.rmt_node, data_src_str[reg.data_src]); + reg.rmt_node, data_src_str[reg.data_src_lo]); } static void pr_ibs_op_data3(union ibs_op_data3 reg) -- cgit v1.2.3 From 0429796e45ec17eee26d7a59de92271c275d7666 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:19 +0530 Subject: perf script ibs: Support new IBS bits in raw trace dump Interpret Additional set of IBS register bits while doing perf report/script raw dump. IBS op PMU ex: $ sudo ./perf record -c 130 -a -e ibs_op/l3missonly=1/ --raw-samples $ sudo ./perf report -D ... 
ibs_op_ctl: 0000004500070008 MaxCnt 128 L3MissOnly 1 En 1 Val 1 CntCtl 0=cycles CurCnt 69 ibs_op_data: 0000000000710002 CompToRetCtr 2 TagToRetCtr 113 BrnRet 0 RipInvalid 0 BrnFuse 0 Microcode 0 ibs_op_data2: 0000000000000002 CacheHitSt 0=M-state RmtNode 0 DataSrc 2=A peer cache in a near CCX ibs_op_data3: 000000681d1700a1 LdOp 1 StOp 0 DcL1TlbMiss 0 DcL2TlbMiss 0 DcL1TlbHit2M 0 DcL1TlbHit1G 1 DcL2TlbHit2M 0 DcMiss 1 DcMisAcc 0 DcWcMemAcc 0 DcUcMemAcc 0 DcLockedOp 0 DcMissNoMabAlloc 1 DcLinAddrValid 1 DcPhyAddrValid 1 DcL2TlbHit1G 0 L2Miss 1 SwPf 0 OpMemWidth 8 bytes OpDcMissOpenMemReqs 7 DcMissLat 104 TlbRefillLat 0 IBS Fetch PMU ex: $ sudo ./perf record -c 130 -a -e ibs_fetch/l3missonly=1/ --raw-samples $ sudo ./perf report -D ... ibs_fetch_ctl: 3c1f00c700080008 MaxCnt 128 Cnt 128 Lat 199 En 1 Val 1 Comp 1 IcMiss 1 PhyAddrValid 1 L1TlbPgSz 4KB L1TlbMiss 0 L2TlbMiss 0 RandEn 0 L2Miss 1 L3MissOnly 1 FetchOcMiss 1 FetchL3Miss 1 With the DataSrc extensions, the source of data can be decoded among: - Local L3 or other L1/L2 in CCX. - A peer cache in a near CCX. - Data returned from DRAM. - A peer cache in a far CCX. - DRAM address map with "long latency" bit set. - Data returned from MMIO/Config/PCI/APIC. - Extension Memory (S-Link, GenZ, etc - identified by the CS target and/or address map at DF's choice). - Peer Agent Memory. Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-9-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/amd-sample-raw.c | 64 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/amd-sample-raw.c b/tools/perf/util/amd-sample-raw.c index 3b623ea6ee7e..238305868644 100644 --- a/tools/perf/util/amd-sample-raw.c +++ b/tools/perf/util/amd-sample-raw.c @@ -18,6 +18,7 @@ #include "pmu-events/pmu-events.h" static u32 cpu_family, cpu_model, ibs_fetch_type, ibs_op_type; +static bool zen4_ibs_extensions; static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg) { @@ -39,6 +40,7 @@ static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg) }; const char *ic_miss_str = NULL; const char *l1tlb_pgsz_str = NULL; + char l3_miss_str[sizeof(" L3MissOnly _ FetchOcMiss _ FetchL3Miss _")] = ""; if (cpu_family == 0x19 && cpu_model < 0x10) { /* @@ -53,12 +55,19 @@ static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg) ic_miss_str = ic_miss_strs[reg.ic_miss]; } + if (zen4_ibs_extensions) { + snprintf(l3_miss_str, sizeof(l3_miss_str), + " L3MissOnly %d FetchOcMiss %d FetchL3Miss %d", + reg.l3_miss_only, reg.fetch_oc_miss, reg.fetch_l3_miss); + } + printf("ibs_fetch_ctl:\t%016llx MaxCnt %7d Cnt %7d Lat %5d En %d Val %d Comp %d%s " - "PhyAddrValid %d%s L1TlbMiss %d L2TlbMiss %d RandEn %d%s\n", + "PhyAddrValid %d%s L1TlbMiss %d L2TlbMiss %d RandEn %d%s%s\n", reg.val, reg.fetch_maxcnt << 4, reg.fetch_cnt << 4, reg.fetch_lat, reg.fetch_en, reg.fetch_val, reg.fetch_comp, ic_miss_str ? : "", reg.phy_addr_valid, l1tlb_pgsz_str ? : "", reg.l1tlb_miss, reg.l2tlb_miss, - reg.rand_en, reg.fetch_comp ? (reg.fetch_l2_miss ? " L2Miss 1" : " L2Miss 0") : ""); + reg.rand_en, reg.fetch_comp ? (reg.fetch_l2_miss ? 
" L2Miss 1" : " L2Miss 0") : "", + l3_miss_str); } static void pr_ic_ibs_extd_ctl(union ic_ibs_extd_ctl reg) @@ -68,9 +77,15 @@ static void pr_ic_ibs_extd_ctl(union ic_ibs_extd_ctl reg) static void pr_ibs_op_ctl(union ibs_op_ctl reg) { - printf("ibs_op_ctl:\t%016llx MaxCnt %9d En %d Val %d CntCtl %d=%s CurCnt %9d\n", - reg.val, ((reg.opmaxcnt_ext << 16) | reg.opmaxcnt) << 4, reg.op_en, reg.op_val, - reg.cnt_ctl, reg.cnt_ctl ? "uOps" : "cycles", reg.opcurcnt); + char l3_miss_only[sizeof(" L3MissOnly _")] = ""; + + if (zen4_ibs_extensions) + snprintf(l3_miss_only, sizeof(l3_miss_only), " L3MissOnly %d", reg.l3_miss_only); + + printf("ibs_op_ctl:\t%016llx MaxCnt %9d%s En %d Val %d CntCtl %d=%s CurCnt %9d\n", + reg.val, ((reg.opmaxcnt_ext << 16) | reg.opmaxcnt) << 4, l3_miss_only, + reg.op_en, reg.op_val, reg.cnt_ctl, + reg.cnt_ctl ? "uOps" : "cycles", reg.opcurcnt); } static void pr_ibs_op_data(union ibs_op_data reg) @@ -84,7 +99,34 @@ static void pr_ibs_op_data(union ibs_op_data reg) reg.op_brn_ret, reg.op_rip_invalid, reg.op_brn_fuse, reg.op_microcode); } -static void pr_ibs_op_data2(union ibs_op_data2 reg) +static void pr_ibs_op_data2_extended(union ibs_op_data2 reg) +{ + static const char * const data_src_str[] = { + "", + " DataSrc 1=Local L3 or other L1/L2 in CCX", + " DataSrc 2=A peer cache in a near CCX", + " DataSrc 3=Data returned from DRAM", + " DataSrc 4=(reserved)", + " DataSrc 5=A peer cache in a far CCX", + " DataSrc 6=DRAM address map with \"long latency\" bit set", + " DataSrc 7=Data returned from MMIO/Config/PCI/APIC", + " DataSrc 8=Extension Memory (S-Link, GenZ, etc)", + " DataSrc 9=(reserved)", + " DataSrc 10=(reserved)", + " DataSrc 11=(reserved)", + " DataSrc 12=Peer Agent Memory", + /* 13 to 31 are reserved. Avoid printing them. */ + }; + int data_src = (reg.data_src_hi << 3) | reg.data_src_lo; + + printf("ibs_op_data2:\t%016llx %sRmtNode %d%s\n", reg.val, + (data_src == 1 || data_src == 2 || data_src == 5) ? + (reg.cache_hit_st ? "CacheHitSt 1=O-State " : "CacheHitSt 0=M-state ") : "", + reg.rmt_node, + data_src < (int)ARRAY_SIZE(data_src_str) ? data_src_str[data_src] : ""); +} + +static void pr_ibs_op_data2_default(union ibs_op_data2 reg) { static const char * const data_src_str[] = { "", @@ -103,6 +145,13 @@ static void pr_ibs_op_data2(union ibs_op_data2 reg) reg.rmt_node, data_src_str[reg.data_src_lo]); } +static void pr_ibs_op_data2(union ibs_op_data2 reg) +{ + if (zen4_ibs_extensions) + return pr_ibs_op_data2_extended(reg); + pr_ibs_op_data2_default(reg); +} + static void pr_ibs_op_data3(union ibs_op_data3 reg) { char l2_miss_str[sizeof(" L2Miss _")] = ""; @@ -279,6 +328,9 @@ bool evlist__has_amd_ibs(struct evlist *evlist) pmu_mapping += strlen(pmu_mapping) + 1 /* '\0' */; } + if (perf_env__find_pmu_cap(env, "ibs_op", "zen4_ibs_extensions")) + zen4_ibs_extensions = 1; + if (ibs_fetch_type || ibs_op_type) { if (!cpu_family) parse_cpuid(env); -- cgit v1.2.3 From b168852eb8eff57b63e72e3bc584bda3d2cb9577 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 16 Jun 2022 22:22:14 +0200 Subject: perf tools: Rework prologue generation code Some functions we use for bpf prologue generation are going to be deprecated. This change reworks current code not to use them. We need to replace following functions/struct: bpf_program__set_prep bpf_program__nth_fd struct bpf_prog_prep_result Currently we use bpf_program__set_prep to hook perf callback before program is loaded and provide new instructions with the prologue. 
We replace this function/ality by taking instructions for specific program, attaching prologue to them and load such new ebpf programs with prologue using separate bpf_prog_load calls (outside libbpf load machinery). Before we can take and use program instructions, we need libbpf to actually load it. This way we get the final shape of its instructions with all relocations and verifier adjustments). There's one glitch though.. perf kprobe program already assumes generated prologue code with proper values in argument registers, so loading such program directly will fail in the verifier. That's where the fallback pre-load handler fits in and prepends the initialization code to the program. Once such program is loaded we take its instructions, cut off the initialization code and prepend the prologue. I know.. sorry ;-) To have access to the program when loading this patch adds support to register 'fallback' section handler to take care of perf kprobe programs. The fallback means that it handles any section definition besides the ones that libbpf handles. The handler serves two purposes: - allows perf programs to have special arguments in section name - allows perf to use pre-load callback where we can attach init code (zeroing all argument registers) to each perf program Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Tested-by: Arnaldo Carvalho de Melo Acked-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/bpf/20220616202214.70359-2-jolsa@kernel.org --- tools/perf/util/bpf-loader.c | 204 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 175 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index f8ad581ea247..6bd7c288e820 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ struct bpf_prog_priv { struct bpf_insn *insns_buf; int nr_types; int *type_mapping; + int *prologue_fds; }; struct bpf_perf_object { @@ -56,6 +58,11 @@ struct bpf_perf_object { struct bpf_object *obj; }; +struct bpf_preproc_result { + struct bpf_insn *new_insn_ptr; + int new_insn_cnt; +}; + static LIST_HEAD(bpf_objects_list); static struct hashmap *bpf_program_hash; static struct hashmap *bpf_map_hash; @@ -86,6 +93,7 @@ bpf_perf_object__next(struct bpf_perf_object *prev) (perf_obj) = (tmp), (tmp) = bpf_perf_object__next(tmp)) static bool libbpf_initialized; +static int libbpf_sec_handler; static int bpf_perf_object__add(struct bpf_object *obj) { @@ -99,12 +107,76 @@ static int bpf_perf_object__add(struct bpf_object *obj) return perf_obj ? 
0 : -ENOMEM; } +static void *program_priv(const struct bpf_program *prog) +{ + void *priv; + + if (IS_ERR_OR_NULL(bpf_program_hash)) + return NULL; + if (!hashmap__find(bpf_program_hash, prog, &priv)) + return NULL; + return priv; +} + +static struct bpf_insn prologue_init_insn[] = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), +}; + +static int libbpf_prog_prepare_load_fn(struct bpf_program *prog, + struct bpf_prog_load_opts *opts __maybe_unused, + long cookie __maybe_unused) +{ + size_t init_size_cnt = ARRAY_SIZE(prologue_init_insn); + size_t orig_insn_cnt, insn_cnt, init_size, orig_size; + struct bpf_prog_priv *priv = program_priv(prog); + const struct bpf_insn *orig_insn; + struct bpf_insn *insn; + + if (IS_ERR_OR_NULL(priv)) { + pr_debug("bpf: failed to get private field\n"); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + if (!priv->need_prologue) + return 0; + + /* prepend initialization code to program instructions */ + orig_insn = bpf_program__insns(prog); + orig_insn_cnt = bpf_program__insn_cnt(prog); + init_size = init_size_cnt * sizeof(*insn); + orig_size = orig_insn_cnt * sizeof(*insn); + + insn_cnt = orig_insn_cnt + init_size_cnt; + insn = malloc(insn_cnt * sizeof(*insn)); + if (!insn) + return -ENOMEM; + + memcpy(insn, prologue_init_insn, init_size); + memcpy((char *) insn + init_size, orig_insn, orig_size); + bpf_program__set_insns(prog, insn, insn_cnt); + return 0; +} + static int libbpf_init(void) { + LIBBPF_OPTS(libbpf_prog_handler_opts, handler_opts, + .prog_prepare_load_fn = libbpf_prog_prepare_load_fn, + ); + if (libbpf_initialized) return 0; libbpf_set_print(libbpf_perf_print); + libbpf_sec_handler = libbpf_register_prog_handler(NULL, BPF_PROG_TYPE_KPROBE, + 0, &handler_opts); + if (libbpf_sec_handler < 0) { + pr_debug("bpf: failed to register libbpf section handler: %d\n", + libbpf_sec_handler); + return -BPF_LOADER_ERRNO__INTERNAL; + } libbpf_initialized = true; return 0; } @@ -188,14 +260,31 @@ struct bpf_object *bpf__prepare_load(const char *filename, bool source) return obj; } +static void close_prologue_programs(struct bpf_prog_priv *priv) +{ + struct perf_probe_event *pev; + int i, fd; + + if (!priv->need_prologue) + return; + pev = &priv->pev; + for (i = 0; i < pev->ntevs; i++) { + fd = priv->prologue_fds[i]; + if (fd != -1) + close(fd); + } +} + static void clear_prog_priv(const struct bpf_program *prog __maybe_unused, void *_priv) { struct bpf_prog_priv *priv = _priv; + close_prologue_programs(priv); cleanup_perf_probe_events(&priv->pev, 1); zfree(&priv->insns_buf); + zfree(&priv->prologue_fds); zfree(&priv->type_mapping); zfree(&priv->sys_name); zfree(&priv->evt_name); @@ -243,17 +332,6 @@ static bool ptr_equal(const void *key1, const void *key2, return key1 == key2; } -static void *program_priv(const struct bpf_program *prog) -{ - void *priv; - - if (IS_ERR_OR_NULL(bpf_program_hash)) - return NULL; - if (!hashmap__find(bpf_program_hash, prog, &priv)) - return NULL; - return priv; -} - static int program_set_priv(struct bpf_program *prog, void *priv) { void *old_priv; @@ -558,8 +636,8 @@ static int bpf__prepare_probe(void) static int preproc_gen_prologue(struct bpf_program *prog, int n, - struct bpf_insn *orig_insns, int orig_insns_cnt, - struct bpf_prog_prep_result *res) + const struct bpf_insn *orig_insns, int orig_insns_cnt, + struct bpf_preproc_result *res) { struct bpf_prog_priv *priv = program_priv(prog); struct probe_trace_event *tev; @@ -607,7 +685,6 @@ 
preproc_gen_prologue(struct bpf_program *prog, int n, res->new_insn_ptr = buf; res->new_insn_cnt = prologue_cnt + orig_insns_cnt; - res->pfd = NULL; return 0; errout: @@ -715,7 +792,7 @@ static int hook_load_preprocessor(struct bpf_program *prog) struct bpf_prog_priv *priv = program_priv(prog); struct perf_probe_event *pev; bool need_prologue = false; - int err, i; + int i; if (IS_ERR_OR_NULL(priv)) { pr_debug("Internal error when hook preprocessor\n"); @@ -753,6 +830,13 @@ static int hook_load_preprocessor(struct bpf_program *prog) return -ENOMEM; } + priv->prologue_fds = malloc(sizeof(int) * pev->ntevs); + if (!priv->prologue_fds) { + pr_debug("Not enough memory: alloc prologue fds failed\n"); + return -ENOMEM; + } + memset(priv->prologue_fds, -1, sizeof(int) * pev->ntevs); + priv->type_mapping = malloc(sizeof(int) * pev->ntevs); if (!priv->type_mapping) { pr_debug("Not enough memory: alloc type_mapping failed\n"); @@ -761,13 +845,7 @@ static int hook_load_preprocessor(struct bpf_program *prog) memset(priv->type_mapping, -1, sizeof(int) * pev->ntevs); - err = map_prologue(pev, priv->type_mapping, &priv->nr_types); - if (err) - return err; - - err = bpf_program__set_prep(prog, priv->nr_types, - preproc_gen_prologue); - return err; + return map_prologue(pev, priv->type_mapping, &priv->nr_types); } int bpf__probe(struct bpf_object *obj) @@ -874,6 +952,77 @@ int bpf__unprobe(struct bpf_object *obj) return ret; } +static int bpf_object__load_prologue(struct bpf_object *obj) +{ + int init_cnt = ARRAY_SIZE(prologue_init_insn); + const struct bpf_insn *orig_insns; + struct bpf_preproc_result res; + struct perf_probe_event *pev; + struct bpf_program *prog; + int orig_insns_cnt; + + bpf_object__for_each_program(prog, obj) { + struct bpf_prog_priv *priv = program_priv(prog); + int err, i, fd; + + if (IS_ERR_OR_NULL(priv)) { + pr_debug("bpf: failed to get private field\n"); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + if (!priv->need_prologue) + continue; + + /* + * For each program that needs prologue we do following: + * + * - take its current instructions and use them + * to generate the new code with prologue + * - load new instructions with bpf_prog_load + * and keep the fd in prologue_fds + * - new fd will be used in bpf__foreach_event + * to connect this program with perf evsel + */ + orig_insns = bpf_program__insns(prog); + orig_insns_cnt = bpf_program__insn_cnt(prog); + + pev = &priv->pev; + for (i = 0; i < pev->ntevs; i++) { + /* + * Skipping artificall prologue_init_insn instructions + * (init_cnt), so the prologue can be generated instead + * of them. + */ + err = preproc_gen_prologue(prog, i, + orig_insns + init_cnt, + orig_insns_cnt - init_cnt, + &res); + if (err) + return err; + + fd = bpf_prog_load(bpf_program__get_type(prog), + bpf_program__name(prog), "GPL", + res.new_insn_ptr, + res.new_insn_cnt, NULL); + if (fd < 0) { + char bf[128]; + + libbpf_strerror(-errno, bf, sizeof(bf)); + pr_debug("bpf: load objects with prologue failed: err=%d: (%s)\n", + -errno, bf); + return -errno; + } + priv->prologue_fds[i] = fd; + } + /* + * We no longer need the original program, + * we can unload it. 
+ */ + bpf_program__unload(prog); + } + return 0; +} + int bpf__load(struct bpf_object *obj) { int err; @@ -885,7 +1034,7 @@ int bpf__load(struct bpf_object *obj) pr_debug("bpf: load objects failed: err=%d: (%s)\n", err, bf); return err; } - return 0; + return bpf_object__load_prologue(obj); } int bpf__foreach_event(struct bpf_object *obj, @@ -920,13 +1069,10 @@ int bpf__foreach_event(struct bpf_object *obj, for (i = 0; i < pev->ntevs; i++) { tev = &pev->tevs[i]; - if (priv->need_prologue) { - int type = priv->type_mapping[i]; - - fd = bpf_program__nth_fd(prog, type); - } else { + if (priv->need_prologue) + fd = priv->prologue_fds[i]; + else fd = bpf_program__fd(prog); - } if (fd < 0) { pr_debug("bpf: failed to get file descriptor\n"); -- cgit v1.2.3 From fd75733da2f376c0c8c6513c3cb2ac227082ec5c Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Thu, 23 Jun 2022 18:29:34 +0000 Subject: bpf: Merge "types_are_compat" logic into relo_core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF type compatibility checks (bpf_core_types_are_compat()) are currently duplicated between kernel and user space. That's a historical artifact more than intentional doing and can lead to subtle bugs where one implementation is adjusted but another is forgotten. That happened with the enum64 work, for example, where the libbpf side was changed (commit 23b2a3a8f63a ("libbpf: Add enum64 relocation support")) to use the btf_kind_core_compat() helper function but the kernel side was not (commit 6089fb325cf7 ("bpf: Add btf enum64 support")). This patch addresses both the duplication issue, by merging both implementations and moving them into relo_core.c, and fixes the alluded to kind check (by giving preference to libbpf's already adjusted logic). 
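After the merge, both implementations funnel into the same helper and differ only in the recursion budget they grant it; the two call sites, taken from the hunks below:

	/* kernel/bpf/btf.c */
	return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id,
					   MAX_TYPES_ARE_COMPAT_DEPTH);

	/* tools/lib/bpf/libbpf.c */
	return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32);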
For discussion of the topic, please refer to: https://lore.kernel.org/bpf/CAADnVQKbWR7oarBdewgOBZUPzryhRYvEbkhyPJQHHuxq=0K1gw@mail.gmail.com/T/#mcc99f4a33ad9a322afaf1b9276fb1f0b7add9665 Changelog: v1 -> v2: - limited libbpf recursion limit to 32 - changed name to __bpf_core_types_are_compat - included warning previously present in libbpf version - merged kernel and user space changes into a single patch Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220623182934.2582827-1-deso@posteo.net --- kernel/bpf/btf.c | 84 +---------------------------------------------- tools/lib/bpf/libbpf.c | 72 +--------------------------------------- tools/lib/bpf/relo_core.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/relo_core.h | 2 ++ 4 files changed, 84 insertions(+), 154 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f08037c31dd7..2e2066d6af94 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7416,87 +7416,6 @@ EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); #define MAX_TYPES_ARE_COMPAT_DEPTH 2 -static -int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id, - int level) -{ - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf_type_by_id(local_btf, local_id); - targ_type = btf_type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - -recur: - depth--; - if (depth < 0) - return -EINVAL; - - local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); - targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_FWD: - case BTF_KIND_ENUM64: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - if (level <= 0) - return -EINVAL; - - btf_type_skip_modifiers(local_btf, local_p->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); - err = __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, - level - 1); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - btf_type_skip_modifiers(local_btf, local_type->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - return 0; - } -} - /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. 
This function assumes that root types were already @@ -7519,8 +7438,7 @@ recur: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 49e359cd34df..335467ece75f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5732,77 +5732,7 @@ err_out: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf__type_by_id(local_btf, local_id); - targ_type = btf__type_by_id(targ_btf, targ_id); - if (!btf_kind_core_compat(local_type, targ_type)) - return 0; - -recur: - depth--; - if (depth < 0) - return -EINVAL; - - local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); - targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (!btf_kind_core_compat(local_type, targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_ENUM64: - case BTF_KIND_FWD: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - skip_mods_and_typedefs(local_btf, local_p->type, &local_id); - skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); - err = bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - skip_mods_and_typedefs(local_btf, local_type->type, &local_id); - skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", - btf_kind_str(local_type), local_id, targ_id); - return 0; - } + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); } static size_t bpf_core_hash_fn(const void *key, void *ctx) diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index 6ad3c3891a9a..e070123332cd 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -141,6 +141,86 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) } } +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match 
(ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + skip_mods_and_typedefs(local_btf, local_p->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + skip_mods_and_typedefs(local_btf, local_type->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_type), local_id, targ_id); + return 0; + } +} + /* * Turn bpf_core_relo into a low- and high-level spec representation, * validating correctness along the way, as well as calculating resulting diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 7df0da082f2c..3fd3842d4230 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -68,6 +68,8 @@ struct bpf_core_relo_res { __u32 new_type_id; }; +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level); int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); -- cgit v1.2.3 From 1da9e27415bfc54db25c8374331aaf5321185a1d Mon Sep 17 00:00:00 2001 From: liujing Date: Wed, 22 Jun 2022 08:12:37 -0400 Subject: tc-testing: gitignore, delete plugins directory When modifying the kernel and committing it to our build environment,
we hit the error: "tools/testing/selftests/tc-testing/plugins" failed: No such file or directory. The plugins directory is ignored in "tools/testing/selftests/tc-testing/.gitignore", but it is needed by "tools/testing/selftests/tc-testing/Makefile". Signed-off-by: liujing Link: https://lore.kernel.org/r/20220622121237.5832-1-liujing@cmss.chinamobile.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/.gitignore | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/.gitignore b/tools/testing/selftests/tc-testing/.gitignore index d52f65de23b4..9fe1cef72728 100644 --- a/tools/testing/selftests/tc-testing/.gitignore +++ b/tools/testing/selftests/tc-testing/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only __pycache__/ *.pyc -plugins/ *.xml *.tap tdc_config_local.py -- cgit v1.2.3 From 4228a996b072d36f3baafb4afdc2d2d66d2cbadf Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 20 Jun 2022 09:31:03 +1000 Subject: selftests/powerpc: Skip energy_scale_info test on older firmware Older machines don't have the firmware feature that enables the code this test is testing. Skip the test if the sysfs directory doesn't exist. Also use the FAIL_IF() macro to provide more verbose error reporting if an error is encountered. Fixes: 57201d657eb7 ("selftest/powerpc: Add PAPR sysfs attributes sniff test") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220619233103.2666171-1-mpe@ellerman.id.au --- .../selftests/powerpc/papr_attributes/attr_test.c | 30 +++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/papr_attributes/attr_test.c b/tools/testing/selftests/powerpc/papr_attributes/attr_test.c index bab0dc06e90b..9b655be641c9 100644 --- a/tools/testing/selftests/powerpc/papr_attributes/attr_test.c +++ b/tools/testing/selftests/powerpc/papr_attributes/attr_test.c @@ -7,6 +7,7 @@ * Copyright 2022, Pratik Rajesh Sampat, IBM Corp.
*/ +#include #include #include #include @@ -32,7 +33,7 @@ enum type { NUM_VAL }; -int value_type(int id) +static int value_type(int id) { int val_type; @@ -54,15 +55,21 @@ int value_type(int id) return val_type; } -int verify_energy_info(void) +static int verify_energy_info(void) { const char *path = "/sys/firmware/papr/energy_scale_info"; struct dirent *entry; struct stat s; DIR *dirp; - if (stat(path, &s) || !S_ISDIR(s.st_mode)) - return -1; + errno = 0; + if (stat(path, &s)) { + SKIP_IF(errno == ENOENT); + FAIL_IF(errno); + } + + FAIL_IF(!S_ISDIR(s.st_mode)); + dirp = opendir(path); while ((entry = readdir(dirp)) != NULL) { @@ -76,25 +83,24 @@ int verify_energy_info(void) id = atoi(entry->d_name); attr_type = value_type(id); - if (attr_type == INVALID) - return -1; + FAIL_IF(attr_type == INVALID); /* Check if the files exist and have data in them */ sprintf(file_name, "%s/%d/desc", path, id); f = fopen(file_name, "r"); - if (!f || fgetc(f) == EOF) - return -1; + FAIL_IF(!f); + FAIL_IF(fgetc(f) == EOF); sprintf(file_name, "%s/%d/value", path, id); f = fopen(file_name, "r"); - if (!f || fgetc(f) == EOF) - return -1; + FAIL_IF(!f); + FAIL_IF(fgetc(f) == EOF); if (attr_type == STR_VAL) { sprintf(file_name, "%s/%d/value_desc", path, id); f = fopen(file_name, "r"); - if (!f || fgetc(f) == EOF) - return -1; + FAIL_IF(!f); + FAIL_IF(fgetc(f) == EOF); } } -- cgit v1.2.3 From d9e9d2300681d68a775c28de6aa6e5290ae17796 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jun 2022 23:15:38 +0200 Subject: x86,objtool: Create .return_sites Find all the return-thunk sites and record them in a .return_sites section such that the kernel can undo this. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- tools/objtool/arch/x86/decode.c | 5 +++ tools/objtool/check.c | 74 +++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/arch.h | 1 + tools/objtool/include/objtool/elf.h | 1 + tools/objtool/include/objtool/objtool.h | 1 + tools/objtool/objtool.c | 1 + 6 files changed, 83 insertions(+) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 8b990a52aada..c260006106be 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -787,3 +787,8 @@ bool arch_is_retpoline(struct symbol *sym) { return !strncmp(sym->name, "__x86_indirect_", 15); } + +bool arch_is_rethunk(struct symbol *sym) +{ + return !strcmp(sym->name, "__x86_return_thunk"); +} diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 864bb9dd3584..f6d4ffa82432 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -749,6 +749,52 @@ static int create_retpoline_sites_sections(struct objtool_file *file) return 0; } +static int create_return_sites_sections(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + int idx; + + sec = find_section_by_name(file->elf, ".return_sites"); + if (sec) { + WARN("file already has .return_sites, skipping"); + return 0; + } + + idx = 0; + list_for_each_entry(insn, &file->return_thunk_list, call_node) + idx++; + + if (!idx) + return 0; + + sec = elf_create_section(file->elf, ".return_sites", 0, + sizeof(int), idx); + if (!sec) { + WARN("elf_create_section: .return_sites"); + return -1; + } + + idx = 0; + list_for_each_entry(insn, &file->return_thunk_list, call_node) { + + int *site = (int *)sec->data->d_buf + idx; + *site = 0; + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(int), + 
R_X86_64_PC32, + insn->sec, insn->offset)) { + WARN("elf_add_reloc_to_insn: .return_sites"); + return -1; + } + + idx++; + } + + return 0; +} + static int create_ibt_endbr_seal_sections(struct objtool_file *file) { struct instruction *insn; @@ -1083,6 +1129,11 @@ __weak bool arch_is_retpoline(struct symbol *sym) return false; } +__weak bool arch_is_rethunk(struct symbol *sym) +{ + return false; +} + #define NEGATIVE_RELOC ((void *)-1L) static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *insn) @@ -1250,6 +1301,18 @@ static void add_retpoline_call(struct objtool_file *file, struct instruction *in annotate_call_site(file, insn, false); } +static void add_return_call(struct objtool_file *file, struct instruction *insn) +{ + /* + * Return thunk tail calls are really just returns in disguise, + * so convert them accordingly. + */ + insn->type = INSN_RETURN; + insn->retpoline_safe = true; + + list_add_tail(&insn->call_node, &file->return_thunk_list); +} + static bool same_function(struct instruction *insn1, struct instruction *insn2) { return insn1->func->pfunc == insn2->func->pfunc; @@ -1302,6 +1365,9 @@ static int add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->retpoline_thunk) { add_retpoline_call(file, insn); continue; + } else if (reloc->sym->return_thunk) { + add_return_call(file, insn); + continue; } else if (insn->func) { /* * External sibling call or internal sibling call with @@ -2184,6 +2250,9 @@ static int classify_symbols(struct objtool_file *file) if (arch_is_retpoline(func)) func->retpoline_thunk = true; + if (arch_is_rethunk(func)) + func->return_thunk = true; + if (!strcmp(func->name, "__fentry__")) func->fentry = true; @@ -3972,6 +4041,11 @@ int check(struct objtool_file *file) if (ret < 0) goto out; warnings += ret; + + ret = create_return_sites_sections(file); + if (ret < 0) + goto out; + warnings += ret; } if (opts.mcount) { diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 9b19cc304195..beb2f3aa94ff 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -89,6 +89,7 @@ const char *arch_ret_insn(int len); int arch_decode_hint_reg(u8 sp_reg, int *base); bool arch_is_retpoline(struct symbol *sym); +bool arch_is_rethunk(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index adebfbc2b518..16f4067b82ae 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -57,6 +57,7 @@ struct symbol { u8 uaccess_safe : 1; u8 static_call_tramp : 1; u8 retpoline_thunk : 1; + u8 return_thunk : 1; u8 fentry : 1; u8 profiling_func : 1; struct list_head pv_target; diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index a6e72d916807..7f2d1b095333 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -24,6 +24,7 @@ struct objtool_file { struct list_head insn_list; DECLARE_HASHTABLE(insn_hash, 20); struct list_head retpoline_call_list; + struct list_head return_thunk_list; struct list_head static_call_list; struct list_head mcount_loc_list; struct list_head endbr_list; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 512669ce064c..a7ecc32e3512 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -102,6 +102,7 @@ struct objtool_file *objtool_open_read(const char *_objname) 
INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); INIT_LIST_HEAD(&file.retpoline_call_list); + INIT_LIST_HEAD(&file.return_thunk_list); INIT_LIST_HEAD(&file.static_call_list); INIT_LIST_HEAD(&file.mcount_loc_list); INIT_LIST_HEAD(&file.endbr_list); -- cgit v1.2.3 From 951ddecf435659553ed15a9214e153a3af43a9a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jun 2022 23:15:47 +0200 Subject: objtool: Treat .text.__x86.* as noinstr Needed because zen_untrain_ret() will be called from noinstr code. Also makes sense since the thunks MUST NOT contain instrumentation nor be poked with dynamic instrumentation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- tools/objtool/check.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f6d4ffa82432..b98fd68013c3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -376,7 +376,8 @@ static int decode_instructions(struct objtool_file *file) sec->text = true; if (!strcmp(sec->name, ".noinstr.text") || - !strcmp(sec->name, ".entry.text")) + !strcmp(sec->name, ".entry.text") || + !strncmp(sec->name, ".text.__x86.", 12)) sec->noinstr = true; for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { -- cgit v1.2.3 From a149180fbcf336e97ce4eb2cdc13672727feb94d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jun 2022 23:15:48 +0200 Subject: x86: Add magic AMD return-thunk Note: needs to be in a section distinct from Retpolines such that the Retpoline RET substitution cannot possibly use immediate jumps. ORC unwinding for zen_untrain_ret() and __x86_return_thunk() is a little tricky but works due to the fact that zen_untrain_ret() doesn't have any stack ops and as such will emit a single ORC entry at the start (+0x3f). Meanwhile, unwinding an IP, including the __x86_return_thunk() one (+0x40) will search for the largest ORC entry smaller or equal to the IP, these will find the one ORC entry (+0x3f) and all works. [ Alexandre: SVM part. ] [ bp: Build fix, massages. ] Suggested-by: Andrew Cooper Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/entry/entry_64.S | 6 +++ arch/x86/entry/entry_64_compat.S | 4 ++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 3 +- arch/x86/include/asm/nospec-branch.h | 17 +++++++++ arch/x86/kernel/vmlinux.lds.S | 2 +- arch/x86/kvm/svm/vmenter.S | 18 +++++++++ arch/x86/lib/retpoline.S | 64 ++++++++++++++++++++++++++++++-- tools/objtool/check.c | 22 +++++++++-- 9 files changed, 128 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9f5d6a4f293e..1f4b18c8909b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -96,6 +96,7 @@ SYM_CODE_START(entry_SYSCALL_64) SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR + UNTRAIN_RET /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ @@ -717,6 +718,7 @@ native_irq_return_ldt: pushq %rdi /* Stash user RDI */ swapgs /* to kernel GS */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ + UNTRAIN_RET movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ @@ -911,6 +913,7 @@ SYM_CODE_START_LOCAL(paranoid_entry) * be retrieved from a kernel internal table. 
*/ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + UNTRAIN_RET /* * Handling GSBASE depends on the availability of FSGSBASE. @@ -1020,6 +1023,7 @@ SYM_CODE_START_LOCAL(error_entry) FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + UNTRAIN_RET leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ .Lerror_entry_from_usermode_after_swapgs: @@ -1072,6 +1076,7 @@ SYM_CODE_START_LOCAL(error_entry) swapgs FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + UNTRAIN_RET /* * Pretend that the exception came from user mode: set up pt_regs @@ -1167,6 +1172,7 @@ SYM_CODE_START(asm_exc_nmi) movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 + UNTRAIN_RET pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index d1052742ad0c..03d74c5153fb 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -72,6 +73,7 @@ SYM_CODE_START(entry_SYSENTER_compat) pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) + UNTRAIN_RET /* * User tracing code (ptrace or signal handlers) might assume that @@ -190,6 +192,7 @@ SYM_CODE_START(entry_SYSCALL_compat) SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR + UNTRAIN_RET /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ @@ -332,6 +335,7 @@ SYM_CODE_START(entry_INT80_compat) pushq 0*8(%rax) /* regs->orig_ax */ .Lint80_keep_stack: + UNTRAIN_RET PUSH_AND_CLEAR_REGS rax=$-ENOSYS UNWIND_HINT_REGS diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 295e69090fb8..fa3d0db1470e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -301,6 +301,7 @@ #define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 641c479cca17..db75da511a36 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -55,7 +55,8 @@ #else # define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31)) | \ - (1 << (X86_FEATURE_RETHUNK & 31))) + (1 << (X86_FEATURE_RETHUNK & 31)) | \ + (1 << (X86_FEATURE_UNRET & 31))) #endif #ifdef CONFIG_INTEL_IOMMU_SVM diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 829c9f827a96..5ca60ae0d14f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -112,6 +112,22 @@ #endif .endm +/* + * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the + * return thunk isn't mapped into the userspace tables (then again, AMD + * typically has NO_MELTDOWN). 
+ * + * Doesn't clobber any registers but does require a stable stack. + * + * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point + * where we have a stack but before any RET instruction. + */ +.macro UNTRAIN_RET +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "", "call zen_untrain_ret", X86_FEATURE_UNRET +#endif +.endm + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -124,6 +140,7 @@ typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern void __x86_return_thunk(void); +extern void zen_untrain_ret(void); #ifdef CONFIG_RETPOLINE diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ada7eb738113..c375420036fb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -141,7 +141,7 @@ SECTIONS #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; - *(.text.__x86.indirect_thunk) + *(.text.__x86.*) __indirect_thunk_end = .; #endif } :text =0xcccc diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index dfaeb47fcf2a..723f8534986c 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -110,6 +110,15 @@ SYM_FUNC_START(__svm_vcpu_run) mov %r15, VCPU_R15(%_ASM_AX) #endif + /* + * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be + * untrained as soon as we exit the VM and are back to the + * kernel. This should be done before re-enabling interrupts + * because interrupt handlers won't sanitize 'ret' if the return is + * from the kernel. + */ + UNTRAIN_RET + /* * Clear all general purpose registers except RSP and RAX to prevent * speculative use of the guest's values, even those that are reloaded @@ -190,6 +199,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif + /* + * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be + * untrained as soon as we exit the VM and are back to the + * kernel. This should be done before re-enabling interrupts + * because interrupt handlers won't sanitize RET if the return is + * from the kernel. + */ + UNTRAIN_RET + pop %_ASM_BX #ifdef CONFIG_X86_64 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 4467c21215f4..fdd16163b996 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -72,11 +72,67 @@ SYM_CODE_END(__x86_indirect_thunk_array) * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. */ -SYM_CODE_START(__x86_return_thunk) - UNWIND_HINT_EMPTY - ANNOTATE_NOENDBR + .section .text.__x86.return_thunk + +/* + * Safety details here pertain to the AMD Zen{1,2} microarchitecture: + * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for + * alignment within the BTB. + * 2) The instruction at zen_untrain_ret must contain, and not + * end with, the 0xc3 byte of the RET. + * 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread + * from re-poisioning the BTB prediction. + */ + .align 64 + .skip 63, 0xcc +SYM_FUNC_START_NOALIGN(zen_untrain_ret); + + /* + * As executed from zen_untrain_ret, this is: + * + * TEST $0xcc, %bl + * LFENCE + * JMP __x86_return_thunk + * + * Executing the TEST instruction has a side effect of evicting any BTB + * prediction (potentially attacker controlled) attached to the RET, as + * __x86_return_thunk + 1 isn't an instruction boundary at the moment. + */ + .byte 0xf6 + + /* + * As executed from __x86_return_thunk, this is a plain RET. 
+ * + * As part of the TEST above, RET is the ModRM byte, and INT3 the imm8. + * + * We subsequently jump backwards and architecturally execute the RET. + * This creates a correct BTB prediction (type=ret), but in the + * meantime we suffer Straight Line Speculation (because the type was + * no branch) which is halted by the INT3. + * + * With SMT enabled and STIBP active, a sibling thread cannot poison + * RET's prediction to a type of its choice, but can evict the + * prediction due to competitive sharing. If the prediction is + * evicted, __x86_return_thunk will suffer Straight Line Speculation + * which will be contained safely by the INT3. + */ +SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL) ret int3 SYM_CODE_END(__x86_return_thunk) -__EXPORT_THUNK(__x86_return_thunk) + /* + * Ensure the TEST decoding / BTB invalidation is complete. + */ + lfence + + /* + * Jump back and execute the RET in the middle of the TEST instruction. + * INT3 is for SLS protection. + */ + jmp __x86_return_thunk + int3 +SYM_FUNC_END(zen_untrain_ret) +__EXPORT_THUNK(zen_untrain_ret) + +EXPORT_SYMBOL(__x86_return_thunk) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b98fd68013c3..4252cd05dfc4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1302,7 +1302,7 @@ static void add_retpoline_call(struct objtool_file *file, struct instruction *in annotate_call_site(file, insn, false); } -static void add_return_call(struct objtool_file *file, struct instruction *insn) +static void add_return_call(struct objtool_file *file, struct instruction *insn, bool add) { /* * Return thunk tail calls are really just returns in disguise, @@ -1311,7 +1311,8 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn) insn->type = INSN_RETURN; insn->retpoline_safe = true; - list_add_tail(&insn->call_node, &file->return_thunk_list); + if (add) + list_add_tail(&insn->call_node, &file->return_thunk_list); } static bool same_function(struct instruction *insn1, struct instruction *insn2) @@ -1367,7 +1368,7 @@ static int add_jump_destinations(struct objtool_file *file) add_retpoline_call(file, insn); continue; } else if (reloc->sym->return_thunk) { - add_return_call(file, insn); + add_return_call(file, insn, true); continue; } else if (insn->func) { /* @@ -1387,6 +1388,21 @@ static int add_jump_destinations(struct objtool_file *file) jump_dest = find_insn(file, dest_sec, dest_off); if (!jump_dest) { + struct symbol *sym = find_symbol_by_offset(dest_sec, dest_off); + + /* + * This is a special case for zen_untrain_ret(). + * It jumps to __x86_return_thunk(), but objtool + * can't find the thunk's starting RET + * instruction, because the RET is also in the + * middle of another instruction. Objtool only + * knows about the outer instruction. + */ + if (sym && sym->return_thunk) { + add_return_call(file, insn, false); + continue; + } + WARN_FUNC("can't find jump dest instruction at %s+0x%lx", insn->sec, insn->offset, dest_sec->name, dest_off); -- cgit v1.2.3 From 9bb2ec608a209018080ca262f771e6a9ff203b6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jun 2022 23:15:59 +0200 Subject: objtool: Update Retpoline validation Update retpoline validation with the new CONFIG_RETPOLINE requirement of not having bare naked RET instructions. 
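
As a rough model of what the extended validate_retpoline() loop in the diff below does, here is a self-contained C sketch (the struct insn and enum here are simplified stand-ins for objtool's real data structures, not its API):

    #include <stdio.h>

    enum insn_type { INSN_CALL_DYNAMIC, INSN_JUMP_DYNAMIC, INSN_RETURN, INSN_OTHER };

    struct insn {
            enum insn_type type;
            int retpoline_safe;   /* set via ANNOTATE_RETPOLINE_SAFE / ANNOTATE_UNRET_SAFE */
            const char *where;
    };

    static int validate_retpoline(const struct insn *insns, int n)
    {
            int warnings = 0;

            for (int i = 0; i < n; i++) {
                    const struct insn *insn = &insns[i];

                    /* a RET is now screened exactly like an indirect jump/call */
                    if (insn->type != INSN_JUMP_DYNAMIC &&
                        insn->type != INSN_CALL_DYNAMIC &&
                        insn->type != INSN_RETURN)
                            continue;

                    if (insn->retpoline_safe)
                            continue;

                    if (insn->type == INSN_RETURN)
                            printf("'naked' return found in RETPOLINE build: %s\n",
                                   insn->where);
                    else
                            printf("indirect jump/call found in RETPOLINE build: %s\n",
                                   insn->where);
                    warnings++;
            }
            return warnings;
    }

    int main(void)
    {
            const struct insn code[] = {
                    { INSN_RETURN, 1, "sme_encrypt_execute" }, /* annotated, accepted */
                    { INSN_RETURN, 0, "some_function" },       /* bare RET, reported */
            };

            return validate_retpoline(code, 2) != 1;
    }
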
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/include/asm/nospec-branch.h | 6 ++++++ arch/x86/mm/mem_encrypt_boot.S | 2 ++ arch/x86/xen/xen-head.S | 1 + tools/objtool/check.c | 19 +++++++++++++------ 4 files changed, 22 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ce1acb557162..455d79c6c2f3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -75,6 +75,12 @@ .popsection .endm +/* + * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions + * vs RETBleed validation. + */ +#define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE + /* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index d94dea450fa6..9de3d900bc92 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -66,6 +66,7 @@ SYM_FUNC_START(sme_encrypt_execute) pop %rbp /* Offset to __x86_return_thunk would be wrong here */ + ANNOTATE_UNRET_SAFE ret int3 SYM_FUNC_END(sme_encrypt_execute) @@ -154,6 +155,7 @@ SYM_FUNC_START(__enc_copy) pop %r15 /* Offset to __x86_return_thunk would be wrong here */ + ANNOTATE_UNRET_SAFE ret int3 .L__enc_copy_end: diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 3a2cd93bf059..fa884fc73e07 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -26,6 +26,7 @@ SYM_CODE_START(hypercall_page) .rept (PAGE_SIZE / 32) UNWIND_HINT_FUNC ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE ret /* * Xen will write the hypercall page, and sort out ENDBR. diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4252cd05dfc4..7dc378156a63 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2115,8 +2115,9 @@ static int read_retpoline_hints(struct objtool_file *file) } if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC) { - WARN_FUNC("retpoline_safe hint not an indirect jump/call", + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN) { + WARN_FUNC("retpoline_safe hint not an indirect jump/call/ret", insn->sec, insn->offset); return -1; } @@ -3526,7 +3527,8 @@ static int validate_retpoline(struct objtool_file *file) for_each_insn(file, insn) { if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC) + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN) continue; if (insn->retpoline_safe) @@ -3541,9 +3543,14 @@ static int validate_retpoline(struct objtool_file *file) if (!strcmp(insn->sec->name, ".init.text") && !opts.module) continue; - WARN_FUNC("indirect %s found in RETPOLINE build", - insn->sec, insn->offset, - insn->type == INSN_JUMP_DYNAMIC ? "jump" : "call"); + if (insn->type == INSN_RETURN) { + WARN_FUNC("'naked' return found in RETPOLINE build", + insn->sec, insn->offset); + } else { + WARN_FUNC("indirect %s found in RETPOLINE build", + insn->sec, insn->offset, + insn->type == INSN_JUMP_DYNAMIC ? "jump" : "call"); + } warnings++; } -- cgit v1.2.3 From a09a6e2399ba0595c3042b3164f3ca68a3cff33e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jun 2022 23:16:03 +0200 Subject: objtool: Add entry UNRET validation Since entry asm is tricky, add a validation pass that ensures the retbleed mitigation has been done before the first actual RET instruction. 
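
The walk itself is easiest to picture as a small graph traversal. A rough, self-contained C model follows (instruction types and fields are invented for illustration; the real pass in the diff below also handles alternatives, calls and conditional branches):

    #include <stdio.h>

    enum itype { I_NOP, I_JMP, I_RET, I_UNRET_END };

    struct insn { enum itype type; int jump_dest; int visited; };

    /* follow execution from an entry point until UNRET_END (ok) or RET (fail) */
    static int validate_entry(struct insn *prog, int idx)
    {
            for (;;) {
                    struct insn *insn = &prog[idx];

                    if (insn->visited)
                            return 0;               /* already proven safe */
                    insn->visited = 1;

                    switch (insn->type) {
                    case I_UNRET_END:
                            return 0;               /* mitigation reached first: ok */
                    case I_RET:
                            printf("RET before UNTRAIN at insn %d\n", idx);
                            return 1;               /* fail: bare RET on entry path */
                    case I_JMP:
                            idx = insn->jump_dest;  /* follow the branch */
                            continue;
                    default:
                            idx++;                  /* fall through to next insn */
                    }
            }
    }

    int main(void)
    {
            struct insn ok[]  = { {I_NOP}, {I_UNRET_END}, {I_RET} }; /* passes */
            struct insn bad[] = { {I_NOP}, {I_RET} };                /* reported */

            return validate_entry(ok, 0) + validate_entry(bad, 0);
    }
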
Entry points are those that either have UNWIND_HINT_ENTRY, which acts as UNWIND_HINT_EMPTY but marks the instruction as an entry point, or those that have UNWIND_HINT_IRET_REGS at +0. This is basically a variant of validate_branch() that is intra-function and it will simply follow all branches from marked entry points and ensure that all paths lead to ANNOTATE_UNRET_END. If a path hits RET or an indirection, the path is a fail and will be reported. There are 3 ANNOTATE_UNRET_END instances: - UNTRAIN_RET itself - exception from-kernel; this path doesn't need UNTRAIN_RET - all early exceptions; these also don't need UNTRAIN_RET Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/entry/entry_64.S | 3 +- arch/x86/entry/entry_64_compat.S | 6 +- arch/x86/include/asm/nospec-branch.h | 12 +++ arch/x86/include/asm/unwind_hints.h | 4 + arch/x86/kernel/head_64.S | 5 + arch/x86/xen/xen-asm.S | 10 +- include/linux/objtool.h | 3 + scripts/Makefile.vmlinux_o | 2 +- tools/include/linux/objtool.h | 3 + tools/objtool/builtin-check.c | 6 ++ tools/objtool/check.c | 177 ++++++++++++++++++++++++++++++-- tools/objtool/include/objtool/builtin.h | 1 + tools/objtool/include/objtool/check.h | 11 +- 13 files changed, 222 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0c88802e1155..65e3b8b7cbe5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -85,7 +85,7 @@ */ SYM_CODE_START(entry_SYSCALL_64) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR swapgs @@ -1095,6 +1095,7 @@ SYM_CODE_START_LOCAL(error_entry) .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY leaq 8(%rsp), %rax /* return pt_regs pointer */ + ANNOTATE_UNRET_END RET .Lbstep_iret: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bcb89d23ac0e..682338e7e2a3 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -49,7 +49,7 @@ * 0(%ebp) arg6 */ SYM_CODE_START(entry_SYSENTER_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* Interrupts are off on entry. */ swapgs @@ -179,7 +179,7 @@ SYM_CODE_END(entry_SYSENTER_compat) * 0(%esp) arg6 */ SYM_CODE_START(entry_SYSCALL_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* Interrupts are off on entry. */ swapgs @@ -305,7 +305,7 @@ SYM_CODE_END(entry_SYSCALL_compat) * ebp arg6 */ SYM_CODE_START(entry_INT80_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* * Interrupts are off on entry. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 05dd75478d7b..bba42bd78edf 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -81,6 +81,17 @@ */ #define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE +/* + * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
+ */ +.macro ANNOTATE_UNRET_END +#ifdef CONFIG_DEBUG_ENTRY + ANNOTATE_RETPOLINE_SAFE + nop +#endif +.endm + /* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 @@ -131,6 +142,7 @@ */ .macro UNTRAIN_RET #ifdef CONFIG_RETPOLINE + ANNOTATE_UNRET_END ALTERNATIVE_2 "", \ "call zen_untrain_ret", X86_FEATURE_UNRET, \ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 8b33674288ea..6f70fe4c93f2 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -11,6 +11,10 @@ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 .endm +.macro UNWIND_HINT_ENTRY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_ENTRY end=1 +.endm + .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 .if \base == %rsp .if \indirect diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 92c4afa2b729..d860d437631b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -389,6 +389,8 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR + ANNOTATE_UNRET_END + /* Build pt_regs */ PUSH_AND_CLEAR_REGS @@ -448,6 +450,7 @@ SYM_CODE_END(early_idt_handler_array) SYM_CODE_START_LOCAL(early_idt_handler_common) UNWIND_HINT_IRET_REGS offset=16 + ANNOTATE_UNRET_END /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -497,6 +500,8 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR + ANNOTATE_UNRET_END + /* Build pt_regs */ PUSH_AND_CLEAR_REGS diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 6bf9d45b9178..6b4fdf6b9542 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -121,7 +121,7 @@ SYM_FUNC_END(xen_read_cr2_direct); .macro xen_pv_trap name SYM_CODE_START(xen_\name) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR pop %rcx pop %r11 @@ -235,7 +235,7 @@ SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode) /* Normal 64-bit system call target */ SYM_CODE_START(xen_entry_SYSCALL_64) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR popq %rcx popq %r11 @@ -255,7 +255,7 @@ SYM_CODE_END(xen_entry_SYSCALL_64) /* 32-bit compat syscall target */ SYM_CODE_START(xen_entry_SYSCALL_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR popq %rcx popq %r11 @@ -273,7 +273,7 @@ SYM_CODE_END(xen_entry_SYSCALL_compat) /* 32-bit compat sysenter target */ SYM_CODE_START(xen_entry_SYSENTER_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* * NB: Xen is polite and clears TF from EFLAGS for us. This means @@ -297,7 +297,7 @@ SYM_CODE_END(xen_entry_SYSENTER_compat) SYM_CODE_START(xen_entry_SYSCALL_compat) SYM_CODE_START(xen_entry_SYSENTER_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 15b940ec1eac..b026f1ae39c6 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -32,11 +32,14 @@ struct unwind_hint { * * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function. * Useful for code which doesn't have an ELF function annotation. + * + * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc. 
*/ #define UNWIND_HINT_TYPE_CALL 0 #define UNWIND_HINT_TYPE_REGS 1 #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 +#define UNWIND_HINT_TYPE_ENTRY 4 #ifdef CONFIG_OBJTOOL diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 3c97a1564947..bc67748044a6 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -44,7 +44,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION)) objtool_args := \ $(if $(delay-objtool),$(objtool_args)) \ - $(if $(CONFIG_NOINSTR_VALIDATION), --noinstr) \ + $(if $(CONFIG_NOINSTR_VALIDATION), --noinstr $(if $(CONFIG_RETPOLINE), --unret)) \ $(if $(CONFIG_GCOV_KERNEL), --no-unreachable) \ --link diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 15b940ec1eac..b026f1ae39c6 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -32,11 +32,14 @@ struct unwind_hint { * * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function. * Useful for code which doesn't have an ELF function annotation. + * + * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc. */ #define UNWIND_HINT_TYPE_CALL 0 #define UNWIND_HINT_TYPE_REGS 1 #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 +#define UNWIND_HINT_TYPE_ENTRY 4 #ifdef CONFIG_OBJTOOL diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index f4c3a5091737..c063e1ff96b2 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -68,6 +68,7 @@ const struct option check_options[] = { OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), OPT_BOOLEAN('o', "orc", &opts.orc, "generate ORC metadata"), OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), + OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), @@ -163,6 +164,11 @@ static bool link_opts_valid(struct objtool_file *file) return false; } + if (opts.unret) { + ERROR("--unret requires --link"); + return false; + } + return true; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7dc378156a63..822a490e6d87 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2032,16 +2032,24 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; - if (opts.ibt && hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { + if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset); - if (sym && sym->bind == STB_GLOBAL && - insn->type != INSN_ENDBR && !insn->noendbr) { - WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR", - insn->sec, insn->offset); + if (sym && sym->bind == STB_GLOBAL) { + if (opts.ibt && insn->type != INSN_ENDBR && !insn->noendbr) { + WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR", + insn->sec, insn->offset); + } + + insn->entry = 1; } } + if (hint->type == UNWIND_HINT_TYPE_ENTRY) { + hint->type = UNWIND_HINT_TYPE_CALL; + insn->entry = 1; + } + if (hint->type == UNWIND_HINT_TYPE_FUNC) { insn->cfi = &func_cfi; continue; @@ -2116,8 +2124,9 @@ static int read_retpoline_hints(struct objtool_file *file) if (insn->type != INSN_JUMP_DYNAMIC && insn->type != INSN_CALL_DYNAMIC && - insn->type != INSN_RETURN) { - WARN_FUNC("retpoline_safe hint not an indirect 
jump/call/ret", + insn->type != INSN_RETURN && + insn->type != INSN_NOP) { + WARN_FUNC("retpoline_safe hint not an indirect jump/call/ret/nop", insn->sec, insn->offset); return -1; } @@ -3305,8 +3314,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 1; } - visited = 1 << state.uaccess; - if (insn->visited) { + visited = VISITED_BRANCH << state.uaccess; + if (insn->visited & VISITED_BRANCH_MASK) { if (!insn->hint && !insn_cfi_match(insn, &state.cfi)) return 1; @@ -3520,6 +3529,145 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec) return warnings; } +/* + * Validate rethunk entry constraint: must untrain RET before the first RET. + * + * Follow every branch (intra-function) and ensure ANNOTATE_UNRET_END comes + * before an actual RET instruction. + */ +static int validate_entry(struct objtool_file *file, struct instruction *insn) +{ + struct instruction *next, *dest; + int ret, warnings = 0; + + for (;;) { + next = next_insn_to_validate(file, insn); + + if (insn->visited & VISITED_ENTRY) + return 0; + + insn->visited |= VISITED_ENTRY; + + if (!insn->ignore_alts && !list_empty(&insn->alts)) { + struct alternative *alt; + bool skip_orig = false; + + list_for_each_entry(alt, &insn->alts, list) { + if (alt->skip_orig) + skip_orig = true; + + ret = validate_entry(file, alt->insn); + if (ret) { + if (opts.backtrace) + BT_FUNC("(alt)", insn); + return ret; + } + } + + if (skip_orig) + return 0; + } + + switch (insn->type) { + + case INSN_CALL_DYNAMIC: + case INSN_JUMP_DYNAMIC: + case INSN_JUMP_DYNAMIC_CONDITIONAL: + WARN_FUNC("early indirect call", insn->sec, insn->offset); + return 1; + + case INSN_JUMP_UNCONDITIONAL: + case INSN_JUMP_CONDITIONAL: + if (!is_sibling_call(insn)) { + if (!insn->jump_dest) { + WARN_FUNC("unresolved jump target after linking?!?", + insn->sec, insn->offset); + return -1; + } + ret = validate_entry(file, insn->jump_dest); + if (ret) { + if (opts.backtrace) { + BT_FUNC("(branch%s)", insn, + insn->type == INSN_JUMP_CONDITIONAL ? "-cond" : ""); + } + return ret; + } + + if (insn->type == INSN_JUMP_UNCONDITIONAL) + return 0; + + break; + } + + /* fallthrough */ + case INSN_CALL: + dest = find_insn(file, insn->call_dest->sec, + insn->call_dest->offset); + if (!dest) { + WARN("Unresolved function after linking!?: %s", + insn->call_dest->name); + return -1; + } + + ret = validate_entry(file, dest); + if (ret) { + if (opts.backtrace) + BT_FUNC("(call)", insn); + return ret; + } + /* + * If a call returns without error, it must have seen UNTRAIN_RET. + * Therefore any non-error return is a success. + */ + return 0; + + case INSN_RETURN: + WARN_FUNC("RET before UNTRAIN", insn->sec, insn->offset); + return 1; + + case INSN_NOP: + if (insn->retpoline_safe) + return 0; + break; + + default: + break; + } + + if (!next) { + WARN_FUNC("teh end!", insn->sec, insn->offset); + return -1; + } + insn = next; + } + + return warnings; +} + +/* + * Validate that all branches starting at 'insn->entry' encounter UNRET_END + * before RET. 
+ */ +static int validate_unret(struct objtool_file *file) +{ + struct instruction *insn; + int ret, warnings = 0; + + for_each_insn(file, insn) { + if (!insn->entry) + continue; + + ret = validate_entry(file, insn); + if (ret < 0) { + WARN_FUNC("Failed UNRET validation", insn->sec, insn->offset); + return ret; + } + warnings += ret; + } + + return warnings; +} + static int validate_retpoline(struct objtool_file *file) { struct instruction *insn; @@ -4039,6 +4187,17 @@ int check(struct objtool_file *file) warnings += ret; } + if (opts.unret) { + /* + * Must be after validate_branch() and friends, it plays + * further games with insn->visited. + */ + ret = validate_unret(file); + if (ret < 0) + return ret; + warnings += ret; + } + if (opts.ibt) { ret = validate_ibt(file); if (ret < 0) diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 280ea18b7f2b..0c476b0b40a3 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -19,6 +19,7 @@ struct opts { bool noinstr; bool orc; bool retpoline; + bool unret; bool sls; bool stackval; bool static_call; diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index f10d7374f388..0eeedeacbefb 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -51,8 +51,10 @@ struct instruction { ignore_alts : 1, hint : 1, retpoline_safe : 1, - noendbr : 1; - /* 2 bit hole */ + noendbr : 1, + entry : 1; + /* 1 bit hole */ + s8 instr; u8 visited; /* u8 hole */ @@ -69,6 +71,11 @@ struct instruction { struct cfi_state *cfi; }; +#define VISITED_BRANCH 0x01 +#define VISITED_BRANCH_UACCESS 0x02 +#define VISITED_BRANCH_MASK 0x03 +#define VISITED_ENTRY 0x04 + static inline bool is_static_jump(struct instruction *insn) { return insn->type == INSN_JUMP_CONDITIONAL || -- cgit v1.2.3 From 8faea26e611189e933ea2281975ff4dc7c1106b6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 24 Jun 2022 12:52:40 +0200 Subject: objtool: Re-add UNWIND_HINT_{SAVE_RESTORE} Commit c536ed2fffd5 ("objtool: Remove SAVE/RESTORE hints") removed the save/restore unwind hints because they were no longer needed. Now they're going to be needed again so re-add them. 
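
The interesting part is how a RESTORE is resolved: the checker scans backwards from the RESTORE hint for the matching SAVE and reuses the CFI state recorded there, as the check.c hunk below does with sym_for_each_insn_continue_reverse(). A rough, self-contained C sketch of that lookup (invented structures, not objtool's API):

    #include <stdio.h>

    struct insn { int save, restore, cfi; };

    /* scan backwards for the matching SAVE and reuse its recorded CFI state */
    static int resolve_restore(struct insn *insns, int idx)
    {
            if (!insns[idx].restore)
                    return 0;                   /* nothing to do */

            for (int i = idx - 1; i >= 0; i--) {
                    if (insns[i].save) {
                            insns[idx].cfi = insns[i].cfi;
                            return 0;
                    }
            }
            printf("no corresponding CFI save for CFI restore\n");
            return -1;
    }

    int main(void)
    {
            struct insn code[4] = { {0} };

            code[1].save = 1;
            code[1].cfi  = 42;                  /* stack state recorded at the SAVE */
            code[3].restore = 1;

            if (resolve_restore(code, 3))
                    return 1;

            printf("restored cfi = %d\n", code[3].cfi);  /* prints 42 */
            return 0;
    }
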
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov --- arch/x86/include/asm/unwind_hints.h | 12 +++++++++-- include/linux/objtool.h | 6 ++++-- tools/include/linux/objtool.h | 6 ++++-- tools/objtool/check.c | 40 +++++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/check.h | 19 +++++++++-------- 5 files changed, 68 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 6f70fe4c93f2..f66fbe6537dd 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -8,11 +8,11 @@ #ifdef __ASSEMBLY__ .macro UNWIND_HINT_EMPTY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 + UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1 .endm .macro UNWIND_HINT_ENTRY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_ENTRY end=1 + UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1 .endm .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 @@ -56,6 +56,14 @@ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC .endm +.macro UNWIND_HINT_SAVE + UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE +.endm + +.macro UNWIND_HINT_RESTORE + UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +.endm + #else #define UNWIND_HINT_FUNC \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b026f1ae39c6..10bc88cc3bf6 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -40,6 +40,8 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 #define UNWIND_HINT_TYPE_ENTRY 4 +#define UNWIND_HINT_TYPE_SAVE 5 +#define UNWIND_HINT_TYPE_RESTORE 6 #ifdef CONFIG_OBJTOOL @@ -127,7 +129,7 @@ struct unwind_hint { * the debuginfo as necessary. It will also warn if it sees any * inconsistencies. */ -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .Lunwind_hint_ip_\@: .pushsection .discard.unwind_hints /* struct unwind_hint */ @@ -180,7 +182,7 @@ struct unwind_hint { #define ASM_REACHABLE #else #define ANNOTATE_INTRA_FUNCTION_CALL -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index b026f1ae39c6..10bc88cc3bf6 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -40,6 +40,8 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 #define UNWIND_HINT_TYPE_ENTRY 4 +#define UNWIND_HINT_TYPE_SAVE 5 +#define UNWIND_HINT_TYPE_RESTORE 6 #ifdef CONFIG_OBJTOOL @@ -127,7 +129,7 @@ struct unwind_hint { * the debuginfo as necessary. It will also warn if it sees any * inconsistencies. 
*/ -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .Lunwind_hint_ip_\@: .pushsection .discard.unwind_hints /* struct unwind_hint */ @@ -180,7 +182,7 @@ struct unwind_hint { #define ASM_REACHABLE #else #define ANNOTATE_INTRA_FUNCTION_CALL -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 822a490e6d87..ddfdd138cc2a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2032,6 +2032,17 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; + if (hint->type == UNWIND_HINT_TYPE_SAVE) { + insn->hint = false; + insn->save = true; + continue; + } + + if (hint->type == UNWIND_HINT_TYPE_RESTORE) { + insn->restore = true; + continue; + } + if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset); @@ -3329,6 +3340,35 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, state.instr += insn->instr; if (insn->hint) { + if (insn->restore) { + struct instruction *save_insn, *i; + + i = insn; + save_insn = NULL; + + sym_for_each_insn_continue_reverse(file, func, i) { + if (i->save) { + save_insn = i; + break; + } + } + + if (!save_insn) { + WARN_FUNC("no corresponding CFI save for CFI restore", + sec, insn->offset); + return 1; + } + + if (!save_insn->visited) { + WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", + sec, insn->offset); + return 1; + } + + insn->cfi = save_insn->cfi; + nr_cfi_reused++; + } + state.cfi = *insn->cfi; } else { /* XXX track if we actually changed state.cfi */ diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 0eeedeacbefb..036129cebeee 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -46,18 +46,19 @@ struct instruction { enum insn_type type; unsigned long immediate; - u8 dead_end : 1, - ignore : 1, - ignore_alts : 1, - hint : 1, - retpoline_safe : 1, - noendbr : 1, - entry : 1; - /* 1 bit hole */ + u16 dead_end : 1, + ignore : 1, + ignore_alts : 1, + hint : 1, + save : 1, + restore : 1, + retpoline_safe : 1, + noendbr : 1, + entry : 1; + /* 7 bit hole */ s8 instr; u8 visited; - /* u8 hole */ struct alt_group *alt_group; struct symbol *call_dest; -- cgit v1.2.3 From ee65728e103bb7dd99d8604bf6c7aa89c7d7e446 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Jun 2022 09:00:26 +0300 Subject: docs: rename Documentation/vm to Documentation/mm so it will be consistent with code mm directory and with Documentation/admin-guide/mm and won't be confused with virtual machines. 
Signed-off-by: Mike Rapoport Suggested-by: Matthew Wilcox Tested-by: Ira Weiny Acked-by: Jonathan Corbet Acked-by: Wu XiangCheng --- Documentation/ABI/testing/sysfs-kernel-mm-ksm | 2 +- Documentation/ABI/testing/sysfs-kernel-slab | 4 +- Documentation/admin-guide/kernel-parameters.txt | 10 +- Documentation/admin-guide/mm/concepts.rst | 2 +- Documentation/admin-guide/mm/damon/index.rst | 2 +- Documentation/admin-guide/mm/damon/reclaim.rst | 2 +- Documentation/admin-guide/mm/damon/usage.rst | 8 +- Documentation/admin-guide/sysctl/vm.rst | 2 +- Documentation/core-api/index.rst | 2 +- Documentation/filesystems/proc.rst | 2 +- Documentation/index.rst | 2 +- Documentation/mm/active_mm.rst | 91 ++++ Documentation/mm/arch_pgtable_helpers.rst | 260 +++++++++ Documentation/mm/balance.rst | 102 ++++ Documentation/mm/bootmem.rst | 5 + Documentation/mm/damon/api.rst | 20 + Documentation/mm/damon/design.rst | 176 ++++++ Documentation/mm/damon/faq.rst | 50 ++ Documentation/mm/damon/index.rst | 29 + Documentation/mm/free_page_reporting.rst | 40 ++ Documentation/mm/frontswap.rst | 266 +++++++++ Documentation/mm/highmem.rst | 167 ++++++ Documentation/mm/hmm.rst | 452 ++++++++++++++++ Documentation/mm/hugetlbfs_reserv.rst | 596 +++++++++++++++++++++ Documentation/mm/hwpoison.rst | 184 +++++++ Documentation/mm/index.rst | 68 +++ Documentation/mm/ksm.rst | 87 +++ Documentation/mm/memory-model.rst | 177 ++++++ Documentation/mm/mmu_notifier.rst | 99 ++++ Documentation/mm/numa.rst | 150 ++++++ Documentation/mm/oom.rst | 5 + Documentation/mm/overcommit-accounting.rst | 88 +++ Documentation/mm/page_allocation.rst | 5 + Documentation/mm/page_cache.rst | 5 + Documentation/mm/page_frags.rst | 45 ++ Documentation/mm/page_migration.rst | 288 ++++++++++ Documentation/mm/page_owner.rst | 196 +++++++ Documentation/mm/page_reclaim.rst | 5 + Documentation/mm/page_table_check.rst | 56 ++ Documentation/mm/page_tables.rst | 5 + Documentation/mm/physical_memory.rst | 5 + Documentation/mm/process_addrs.rst | 5 + Documentation/mm/remap_file_pages.rst | 33 ++ Documentation/mm/shmfs.rst | 5 + Documentation/mm/slab.rst | 5 + Documentation/mm/slub.rst | 452 ++++++++++++++++ Documentation/mm/split_page_table_lock.rst | 100 ++++ Documentation/mm/swap.rst | 5 + Documentation/mm/transhuge.rst | 187 +++++++ Documentation/mm/unevictable-lru.rst | 554 +++++++++++++++++++ Documentation/mm/vmalloc.rst | 5 + Documentation/mm/vmalloced-kernel-stacks.rst | 153 ++++++ Documentation/mm/vmemmap_dedup.rst | 223 ++++++++ Documentation/mm/z3fold.rst | 30 ++ Documentation/mm/zsmalloc.rst | 82 +++ .../zh_CN/admin-guide/mm/damon/index.rst | 2 +- .../zh_CN/admin-guide/mm/damon/reclaim.rst | 2 +- .../zh_CN/admin-guide/mm/damon/usage.rst | 8 +- .../translations/zh_CN/core-api/index.rst | 2 +- Documentation/translations/zh_CN/index.rst | 2 +- Documentation/translations/zh_CN/mm/active_mm.rst | 85 +++ Documentation/translations/zh_CN/mm/balance.rst | 81 +++ Documentation/translations/zh_CN/mm/damon/api.rst | 32 ++ .../translations/zh_CN/mm/damon/design.rst | 140 +++++ Documentation/translations/zh_CN/mm/damon/faq.rst | 48 ++ .../translations/zh_CN/mm/damon/index.rst | 32 ++ .../translations/zh_CN/mm/free_page_reporting.rst | 38 ++ Documentation/translations/zh_CN/mm/frontswap.rst | 196 +++++++ Documentation/translations/zh_CN/mm/highmem.rst | 128 +++++ Documentation/translations/zh_CN/mm/hmm.rst | 361 +++++++++++++ .../translations/zh_CN/mm/hugetlbfs_reserv.rst | 436 +++++++++++++++ Documentation/translations/zh_CN/mm/hwpoison.rst | 166 ++++++ 
Documentation/translations/zh_CN/mm/index.rst | 54 ++ Documentation/translations/zh_CN/mm/ksm.rst | 70 +++ .../translations/zh_CN/mm/memory-model.rst | 135 +++++ .../translations/zh_CN/mm/mmu_notifier.rst | 97 ++++ Documentation/translations/zh_CN/mm/numa.rst | 101 ++++ .../zh_CN/mm/overcommit-accounting.rst | 86 +++ Documentation/translations/zh_CN/mm/page_frags.rst | 38 ++ Documentation/translations/zh_CN/mm/page_owner.rst | 116 ++++ .../translations/zh_CN/mm/page_table_check.rst | 56 ++ .../translations/zh_CN/mm/remap_file_pages.rst | 32 ++ .../zh_CN/mm/split_page_table_lock.rst | 96 ++++ Documentation/translations/zh_CN/mm/z3fold.rst | 31 ++ Documentation/translations/zh_CN/mm/zsmalloc.rst | 78 +++ Documentation/translations/zh_CN/vm/active_mm.rst | 85 --- Documentation/translations/zh_CN/vm/balance.rst | 81 --- Documentation/translations/zh_CN/vm/damon/api.rst | 32 -- .../translations/zh_CN/vm/damon/design.rst | 140 ----- Documentation/translations/zh_CN/vm/damon/faq.rst | 48 -- .../translations/zh_CN/vm/damon/index.rst | 33 -- .../translations/zh_CN/vm/free_page_reporting.rst | 38 -- Documentation/translations/zh_CN/vm/frontswap.rst | 196 ------- Documentation/translations/zh_CN/vm/highmem.rst | 128 ----- Documentation/translations/zh_CN/vm/hmm.rst | 361 ------------- .../translations/zh_CN/vm/hugetlbfs_reserv.rst | 436 --------------- Documentation/translations/zh_CN/vm/hwpoison.rst | 166 ------ Documentation/translations/zh_CN/vm/index.rst | 54 -- Documentation/translations/zh_CN/vm/ksm.rst | 70 --- .../translations/zh_CN/vm/memory-model.rst | 135 ----- .../translations/zh_CN/vm/mmu_notifier.rst | 97 ---- Documentation/translations/zh_CN/vm/numa.rst | 101 ---- .../zh_CN/vm/overcommit-accounting.rst | 86 --- Documentation/translations/zh_CN/vm/page_frags.rst | 38 -- Documentation/translations/zh_CN/vm/page_owner.rst | 116 ---- .../translations/zh_CN/vm/page_table_check.rst | 56 -- .../translations/zh_CN/vm/remap_file_pages.rst | 32 -- .../zh_CN/vm/split_page_table_lock.rst | 96 ---- Documentation/translations/zh_CN/vm/z3fold.rst | 31 -- Documentation/translations/zh_CN/vm/zsmalloc.rst | 78 --- Documentation/translations/zh_TW/index.rst | 2 +- Documentation/vm/.gitignore | 3 - Documentation/vm/active_mm.rst | 91 ---- Documentation/vm/arch_pgtable_helpers.rst | 260 --------- Documentation/vm/balance.rst | 102 ---- Documentation/vm/bootmem.rst | 5 - Documentation/vm/damon/api.rst | 20 - Documentation/vm/damon/design.rst | 176 ------ Documentation/vm/damon/faq.rst | 50 -- Documentation/vm/damon/index.rst | 29 - Documentation/vm/free_page_reporting.rst | 40 -- Documentation/vm/frontswap.rst | 266 --------- Documentation/vm/highmem.rst | 167 ------ Documentation/vm/hmm.rst | 452 ---------------- Documentation/vm/hugetlbfs_reserv.rst | 596 --------------------- Documentation/vm/hwpoison.rst | 184 ------- Documentation/vm/index.rst | 68 --- Documentation/vm/ksm.rst | 87 --- Documentation/vm/memory-model.rst | 177 ------ Documentation/vm/mmu_notifier.rst | 99 ---- Documentation/vm/numa.rst | 150 ------ Documentation/vm/oom.rst | 5 - Documentation/vm/overcommit-accounting.rst | 88 --- Documentation/vm/page_allocation.rst | 5 - Documentation/vm/page_cache.rst | 5 - Documentation/vm/page_frags.rst | 45 -- Documentation/vm/page_migration.rst | 288 ---------- Documentation/vm/page_owner.rst | 196 ------- Documentation/vm/page_reclaim.rst | 5 - Documentation/vm/page_table_check.rst | 56 -- Documentation/vm/page_tables.rst | 5 - Documentation/vm/physical_memory.rst | 5 - 
Documentation/vm/process_addrs.rst | 5 - Documentation/vm/remap_file_pages.rst | 33 -- Documentation/vm/shmfs.rst | 5 - Documentation/vm/slab.rst | 5 - Documentation/vm/slub.rst | 452 ---------------- Documentation/vm/split_page_table_lock.rst | 100 ---- Documentation/vm/swap.rst | 5 - Documentation/vm/transhuge.rst | 187 ------- Documentation/vm/unevictable-lru.rst | 554 ------------------- Documentation/vm/vmalloc.rst | 5 - Documentation/vm/vmalloced-kernel-stacks.rst | 153 ------ Documentation/vm/vmemmap_dedup.rst | 223 -------- Documentation/vm/z3fold.rst | 30 -- Documentation/vm/zsmalloc.rst | 82 --- MAINTAINERS | 12 +- arch/loongarch/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- include/linux/hmm.h | 4 +- include/linux/memremap.h | 2 +- include/linux/mmu_notifier.h | 2 +- include/linux/sched/mm.h | 4 +- include/linux/swap.h | 2 +- mm/Kconfig | 2 +- mm/debug_vm_pgtable.c | 2 +- mm/frontswap.c | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 6 +- mm/hugetlb_vmemmap.c | 2 +- mm/ksm.c | 4 +- mm/mmap.c | 2 +- mm/rmap.c | 8 +- mm/sparse-vmemmap.c | 2 +- mm/util.c | 2 +- tools/vm/page_owner_sort.c | 2 +- 176 files changed, 8355 insertions(+), 8359 deletions(-) create mode 100644 Documentation/mm/active_mm.rst create mode 100644 Documentation/mm/arch_pgtable_helpers.rst create mode 100644 Documentation/mm/balance.rst create mode 100644 Documentation/mm/bootmem.rst create mode 100644 Documentation/mm/damon/api.rst create mode 100644 Documentation/mm/damon/design.rst create mode 100644 Documentation/mm/damon/faq.rst create mode 100644 Documentation/mm/damon/index.rst create mode 100644 Documentation/mm/free_page_reporting.rst create mode 100644 Documentation/mm/frontswap.rst create mode 100644 Documentation/mm/highmem.rst create mode 100644 Documentation/mm/hmm.rst create mode 100644 Documentation/mm/hugetlbfs_reserv.rst create mode 100644 Documentation/mm/hwpoison.rst create mode 100644 Documentation/mm/index.rst create mode 100644 Documentation/mm/ksm.rst create mode 100644 Documentation/mm/memory-model.rst create mode 100644 Documentation/mm/mmu_notifier.rst create mode 100644 Documentation/mm/numa.rst create mode 100644 Documentation/mm/oom.rst create mode 100644 Documentation/mm/overcommit-accounting.rst create mode 100644 Documentation/mm/page_allocation.rst create mode 100644 Documentation/mm/page_cache.rst create mode 100644 Documentation/mm/page_frags.rst create mode 100644 Documentation/mm/page_migration.rst create mode 100644 Documentation/mm/page_owner.rst create mode 100644 Documentation/mm/page_reclaim.rst create mode 100644 Documentation/mm/page_table_check.rst create mode 100644 Documentation/mm/page_tables.rst create mode 100644 Documentation/mm/physical_memory.rst create mode 100644 Documentation/mm/process_addrs.rst create mode 100644 Documentation/mm/remap_file_pages.rst create mode 100644 Documentation/mm/shmfs.rst create mode 100644 Documentation/mm/slab.rst create mode 100644 Documentation/mm/slub.rst create mode 100644 Documentation/mm/split_page_table_lock.rst create mode 100644 Documentation/mm/swap.rst create mode 100644 Documentation/mm/transhuge.rst create mode 100644 Documentation/mm/unevictable-lru.rst create mode 100644 Documentation/mm/vmalloc.rst create mode 100644 Documentation/mm/vmalloced-kernel-stacks.rst create mode 100644 Documentation/mm/vmemmap_dedup.rst create mode 100644 Documentation/mm/z3fold.rst create mode 100644 Documentation/mm/zsmalloc.rst create mode 100644 Documentation/translations/zh_CN/mm/active_mm.rst create mode 100644 
Documentation/translations/zh_CN/mm/balance.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/api.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/design.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/faq.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/index.rst create mode 100644 Documentation/translations/zh_CN/mm/free_page_reporting.rst create mode 100644 Documentation/translations/zh_CN/mm/frontswap.rst create mode 100644 Documentation/translations/zh_CN/mm/highmem.rst create mode 100644 Documentation/translations/zh_CN/mm/hmm.rst create mode 100644 Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst create mode 100644 Documentation/translations/zh_CN/mm/hwpoison.rst create mode 100644 Documentation/translations/zh_CN/mm/index.rst create mode 100644 Documentation/translations/zh_CN/mm/ksm.rst create mode 100644 Documentation/translations/zh_CN/mm/memory-model.rst create mode 100644 Documentation/translations/zh_CN/mm/mmu_notifier.rst create mode 100644 Documentation/translations/zh_CN/mm/numa.rst create mode 100644 Documentation/translations/zh_CN/mm/overcommit-accounting.rst create mode 100644 Documentation/translations/zh_CN/mm/page_frags.rst create mode 100644 Documentation/translations/zh_CN/mm/page_owner.rst create mode 100644 Documentation/translations/zh_CN/mm/page_table_check.rst create mode 100644 Documentation/translations/zh_CN/mm/remap_file_pages.rst create mode 100644 Documentation/translations/zh_CN/mm/split_page_table_lock.rst create mode 100644 Documentation/translations/zh_CN/mm/z3fold.rst create mode 100644 Documentation/translations/zh_CN/mm/zsmalloc.rst delete mode 100644 Documentation/translations/zh_CN/vm/active_mm.rst delete mode 100644 Documentation/translations/zh_CN/vm/balance.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/api.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/design.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/faq.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/index.rst delete mode 100644 Documentation/translations/zh_CN/vm/free_page_reporting.rst delete mode 100644 Documentation/translations/zh_CN/vm/frontswap.rst delete mode 100644 Documentation/translations/zh_CN/vm/highmem.rst delete mode 100644 Documentation/translations/zh_CN/vm/hmm.rst delete mode 100644 Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/translations/zh_CN/vm/hwpoison.rst delete mode 100644 Documentation/translations/zh_CN/vm/index.rst delete mode 100644 Documentation/translations/zh_CN/vm/ksm.rst delete mode 100644 Documentation/translations/zh_CN/vm/memory-model.rst delete mode 100644 Documentation/translations/zh_CN/vm/mmu_notifier.rst delete mode 100644 Documentation/translations/zh_CN/vm/numa.rst delete mode 100644 Documentation/translations/zh_CN/vm/overcommit-accounting.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_frags.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_owner.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_table_check.rst delete mode 100644 Documentation/translations/zh_CN/vm/remap_file_pages.rst delete mode 100644 Documentation/translations/zh_CN/vm/split_page_table_lock.rst delete mode 100644 Documentation/translations/zh_CN/vm/z3fold.rst delete mode 100644 Documentation/translations/zh_CN/vm/zsmalloc.rst delete mode 100644 Documentation/vm/.gitignore delete mode 100644 Documentation/vm/active_mm.rst delete mode 100644 
Documentation/vm/arch_pgtable_helpers.rst delete mode 100644 Documentation/vm/balance.rst delete mode 100644 Documentation/vm/bootmem.rst delete mode 100644 Documentation/vm/damon/api.rst delete mode 100644 Documentation/vm/damon/design.rst delete mode 100644 Documentation/vm/damon/faq.rst delete mode 100644 Documentation/vm/damon/index.rst delete mode 100644 Documentation/vm/free_page_reporting.rst delete mode 100644 Documentation/vm/frontswap.rst delete mode 100644 Documentation/vm/highmem.rst delete mode 100644 Documentation/vm/hmm.rst delete mode 100644 Documentation/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/vm/hwpoison.rst delete mode 100644 Documentation/vm/index.rst delete mode 100644 Documentation/vm/ksm.rst delete mode 100644 Documentation/vm/memory-model.rst delete mode 100644 Documentation/vm/mmu_notifier.rst delete mode 100644 Documentation/vm/numa.rst delete mode 100644 Documentation/vm/oom.rst delete mode 100644 Documentation/vm/overcommit-accounting.rst delete mode 100644 Documentation/vm/page_allocation.rst delete mode 100644 Documentation/vm/page_cache.rst delete mode 100644 Documentation/vm/page_frags.rst delete mode 100644 Documentation/vm/page_migration.rst delete mode 100644 Documentation/vm/page_owner.rst delete mode 100644 Documentation/vm/page_reclaim.rst delete mode 100644 Documentation/vm/page_table_check.rst delete mode 100644 Documentation/vm/page_tables.rst delete mode 100644 Documentation/vm/physical_memory.rst delete mode 100644 Documentation/vm/process_addrs.rst delete mode 100644 Documentation/vm/remap_file_pages.rst delete mode 100644 Documentation/vm/shmfs.rst delete mode 100644 Documentation/vm/slab.rst delete mode 100644 Documentation/vm/slub.rst delete mode 100644 Documentation/vm/split_page_table_lock.rst delete mode 100644 Documentation/vm/swap.rst delete mode 100644 Documentation/vm/transhuge.rst delete mode 100644 Documentation/vm/unevictable-lru.rst delete mode 100644 Documentation/vm/vmalloc.rst delete mode 100644 Documentation/vm/vmalloced-kernel-stacks.rst delete mode 100644 Documentation/vm/vmemmap_dedup.rst delete mode 100644 Documentation/vm/z3fold.rst delete mode 100644 Documentation/vm/zsmalloc.rst (limited to 'tools') diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm index 1c9bed5595f5..d244674a9480 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm @@ -41,7 +41,7 @@ Description: Kernel Samepage Merging daemon sysfs interface sleep_millisecs: how many milliseconds ksm should sleep between scans. - See Documentation/vm/ksm.rst for more information. + See Documentation/mm/ksm.rst for more information. What: /sys/kernel/mm/ksm/merge_across_nodes Date: January 2013 diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index c440f4946e12..cd5fb8fa3ddf 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -37,7 +37,7 @@ Description: The alloc_calls file is read-only and lists the kernel code locations from which allocations for this cache were performed. The alloc_calls file only contains information if debugging is - enabled for that cache (see Documentation/vm/slub.rst). + enabled for that cache (see Documentation/mm/slub.rst). 
What: /sys/kernel/slab//alloc_fastpath Date: February 2008 @@ -219,7 +219,7 @@ Contact: Pekka Enberg , Description: The free_calls file is read-only and lists the locations of object frees if slab debugging is enabled (see - Documentation/vm/slub.rst). + Documentation/mm/slub.rst). What: /sys/kernel/slab//free_fastpath Date: February 2008 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..8c0ea6b6c6a9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5442,7 +5442,7 @@ cache (risks via metadata attacks are mostly unchanged). Debug options disable merging on their own. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slab_max_order= [MM, SLAB] Determines the maximum allowed order for slabs. @@ -5456,13 +5456,13 @@ slub_debug can create guard zones around objects and may poison objects when not in use. Also tracks the last alloc / free. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory fragmentation. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. slub_min_objects= [MM, SLUB] The minimum number of objects per slab. SLUB will @@ -5471,12 +5471,12 @@ the number of objects indicated. The higher the number of objects the smaller the overhead of tracking slabs and the less frequently locks need to be acquired. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_min_order= [MM, SLUB] Determines the minimum page order for slabs. Must be lower than slub_max_order. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_merge [MM, SLUB] Same with slab_merge. diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst index b966fcff993b..c79f1e336222 100644 --- a/Documentation/admin-guide/mm/concepts.rst +++ b/Documentation/admin-guide/mm/concepts.rst @@ -125,7 +125,7 @@ processor. Each bank is referred to as a `node` and for each node Linux constructs an independent memory management subsystem. A node has its own set of zones, lists of free and used pages and various statistics counters. You can find more details about NUMA in -:ref:`Documentation/vm/numa.rst ` and in +:ref:`Documentation/mm/numa.rst ` and in :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `. Page cache diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 61aff88347f3..c4681fa69b9c 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -4,7 +4,7 @@ Monitoring Data Accesses ======================== -:doc:`DAMON ` allows light-weight data access monitoring. +:doc:`DAMON ` allows light-weight data access monitoring. Using DAMON, users can analyze the memory access patterns of their systems and optimize those. diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 46306f1f34b1..a8bd3bd29959 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -268,4 +268,4 @@ granularity reclamation. :: .. [1] https://research.google/pubs/pub48551/ .. 
[2] https://lwn.net/Articles/787611/ -.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html +.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1bb7b72414b2..5540a3a40fc9 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -30,11 +30,11 @@ DAMON provides below interfaces for different users. `. This will be removed after next LTS kernel is released, so users should move to the :ref:`sysfs interface `. - *Kernel Space Programming Interface.* - :doc:`This ` is for kernel space programmers. Using this, + :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space DAMON application programs for themselves. You can even extend DAMON for various address spaces. For detail, please refer to the interface - :doc:`document `. + :doc:`document `. .. _sysfs_interface: @@ -185,7 +185,7 @@ controls the monitoring overhead, exist. You can set and get the values by writing to and reading from the files. For more details about the intervals and monitoring regions range, please refer -to the Design document (:doc:`/vm/damon/design`). +to the Design document (:doc:`/mm/damon/design`). contexts//targets/ --------------------- @@ -402,7 +402,7 @@ Attributes Users can get and set the ``sampling interval``, ``aggregation interval``, ``update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. To know about the monitoring -attributes in detail, please refer to the :doc:`/vm/damon/design`. For +attributes in detail, please refer to the :doc:`/mm/damon/design`. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and 1000, and then check it again::
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5c9aa171a0d3..4a440a7cfeb0 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -760,7 +760,7 @@ and don't use much of it. The default value is 0. -See Documentation/vm/overcommit-accounting.rst and +See Documentation/mm/overcommit-accounting.rst and mm/util.c::__vm_enough_memory() for more information.
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index dedd4d853329..5b1188494bcd 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -87,7 +87,7 @@ Memory management ================= How to allocate and use memory in the kernel. Note that there is a lot -more memory-management documentation in Documentation/vm/index.rst. +more memory-management documentation in Documentation/mm/index.rst. .. toctree:: :maxdepth: 1
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 1bc91fb8c321..8543a59f288f 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1109,7 +1109,7 @@ CommitLimit yield a CommitLimit of 7.3G. For more details, see the memory overcommit documentation - in vm/overcommit-accounting. + in mm/overcommit-accounting. Committed_AS The amount of memory presently allocated on the system.
The committed memory is a sum of all of the memory which diff --git a/Documentation/index.rst b/Documentation/index.rst index 67036a05b771..4737c18c97ff 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -128,7 +128,7 @@ needed). sound/index crypto/index filesystems/index - vm/index + mm/index bpf/index usb/index PCI/index diff --git a/Documentation/mm/active_mm.rst b/Documentation/mm/active_mm.rst new file mode 100644 index 000000000000..6f8269c284ed --- /dev/null +++ b/Documentation/mm/active_mm.rst @@ -0,0 +1,91 @@ +.. _active_mm: + +========= +Active MM +========= + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + Cc'd to linux-kernel, because I don't write explanations all that often, + and when I do I feel better about more people reading them. + + On Fri, 30 Jul 1999, David Mosberger wrote: + > + > Is there a brief description someplace on how "mm" vs. "active_mm" in + > the task_struct are supposed to be used? (My apologies if this was + > discussed on the mailing lists---I just returned from vacation and + > wasn't able to follow linux-kernel for a while). + + Basically, the new setup is: + + - we have "real address spaces" and "anonymous address spaces". The + difference is that an anonymous address space doesn't care about the + user-level page tables at all, so when we do a context switch into an + anonymous address space we just leave the previous address space + active. + + The obvious use for a "anonymous address space" is any thread that + doesn't need any user mappings - all kernel threads basically fall into + this category, but even "real" threads can temporarily say that for + some amount of time they are not going to be interested in user space, + and that the scheduler might as well try to avoid wasting time on + switching the VM state around. Currently only the old-style bdflush + sync does that. + + - "tsk->mm" points to the "real address space". For an anonymous process, + tsk->mm will be NULL, for the logical reason that an anonymous process + really doesn't _have_ a real address space at all. + + - however, we obviously need to keep track of which address space we + "stole" for such an anonymous user. For that, we have "tsk->active_mm", + which shows what the currently active address space is. + + The rule is that for a process with a real address space (ie tsk->mm is + non-NULL) the active_mm obviously always has to be the same as the real + one. + + For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the + "borrowed" mm while the anonymous process is running. When the + anonymous process gets scheduled away, the borrowed address space is + returned and cleared. + + To support all that, the "struct mm_struct" now has two counters: a + "mm_users" counter that is how many "real address space users" there are, + and a "mm_count" counter that is the number of "lazy" users (ie anonymous + users) plus one if there are any real users. + + Usually there is at least one real user, but it could be that the real + user exited on another CPU while a lazy user was still active, so you do + actually get cases where you have a address space that is _only_ used by + lazy users. That is often a short-lived state, because once that thread + gets scheduled away in favour of a real thread, the "zombie" mm gets + released because "mm_count" becomes zero. + + Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any + more. 
"init_mm" should be considered just a "lazy context when no other + context is available", and in fact it is mainly used just at bootup when + no real VM has yet been created. So code that used to check + + if (current->mm == &init_mm) + + should generally just do + + if (!current->mm) + + instead (which makes more sense anyway - the test is basically one of "do + we have a user context", and is generally done by the page fault handler + and things like that). + + Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, + because it slightly changes the interfaces to accommodate the alpha (who + would have thought it, but the alpha actually ends up having one of the + ugliest context switch codes - unlike the other architectures where the MM + and register state is separate, the alpha PALcode joins the two, and you + need to switch both together). + + (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst new file mode 100644 index 000000000000..cbaee9e59241 --- /dev/null +++ b/Documentation/mm/arch_pgtable_helpers.rst @@ -0,0 +1,260 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _arch_page_table_helpers: + +=============================== +Architecture Page Table Helpers +=============================== + +Generic MM expects architectures (with MMU) to provide helpers to create, access +and modify page table entries at various level for different memory functions. +These page table helpers need to conform to a common semantics across platforms. +Following tables describe the expected semantics which can also be tested during +boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug +test need to be in sync. 
+ + +PTE Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pte_same | Tests whether both PTE entries are the same | ++---------------------------+--------------------------------------------------+ +| pte_bad | Tests a non-table mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_present | Tests a valid mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_young | Tests a young PTE | ++---------------------------+--------------------------------------------------+ +| pte_dirty | Tests a dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_write | Tests a writable PTE | ++---------------------------+--------------------------------------------------+ +| pte_special | Tests a special PTE | ++---------------------------+--------------------------------------------------+ +| pte_protnone | Tests a PROT_NONE PTE | ++---------------------------+--------------------------------------------------+ +| pte_devmap | Tests a ZONE_DEVICE mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_soft_dirty | Tests a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_soft_dirty | Tests a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkyoung | Creates a young PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkold | Creates an old PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkdirty | Creates a dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkclean | Creates a clean PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkwrite | Creates a writable PTE | ++---------------------------+--------------------------------------------------+ +| pte_wrprotect | Creates a write protected PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkspecial | Creates a special PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkdevmap | Creates a ZONE_DEVICE mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mksoft_dirty | Creates a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_clear_soft_dirty | Clears a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_mksoft_dirty | Creates a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_clear_soft_dirty | Clears a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mknotpresent | Invalidates a mapped PTE | ++---------------------------+--------------------------------------------------+ +| ptep_clear | Clears a PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear | Clears and returns PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | 
++---------------------------+--------------------------------------------------+ +| ptep_test_and_clear_young | Clears young from a PTE | ++---------------------------+--------------------------------------------------+ +| ptep_set_wrprotect | Converts into a write protected PTE | ++---------------------------+--------------------------------------------------+ +| ptep_set_access_flags | Converts into a more permissive PTE | ++---------------------------+--------------------------------------------------+ + + +PMD Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pmd_same | Tests whether both PMD entries are the same | ++---------------------------+--------------------------------------------------+ +| pmd_bad | Tests a non-table mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_leaf | Tests a leaf mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_huge | Tests a HugeTLB mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | ++---------------------------+--------------------------------------------------+ +| pmd_present | Tests a valid mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_young | Tests a young PMD | ++---------------------------+--------------------------------------------------+ +| pmd_dirty | Tests a dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_write | Tests a writable PMD | ++---------------------------+--------------------------------------------------+ +| pmd_special | Tests a special PMD | ++---------------------------+--------------------------------------------------+ +| pmd_protnone | Tests a PROT_NONE PMD | ++---------------------------+--------------------------------------------------+ +| pmd_devmap | Tests a ZONE_DEVICE mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_soft_dirty | Tests a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_soft_dirty | Tests a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkyoung | Creates a young PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkold | Creates an old PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkdirty | Creates a dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkclean | Creates a clean PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkwrite | Creates a writable PMD | ++---------------------------+--------------------------------------------------+ +| pmd_wrprotect | Creates a write protected PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkspecial | Creates a special PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkdevmap | Creates a ZONE_DEVICE mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mksoft_dirty | Creates a soft dirty PMD | 
++---------------------------+--------------------------------------------------+ +| pmd_clear_soft_dirty | Clears a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_mksoft_dirty | Creates a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkinvalid | Invalidates a mapped PMD [1] | ++---------------------------+--------------------------------------------------+ +| pmd_set_huge | Creates a PMD huge mapping | ++---------------------------+--------------------------------------------------+ +| pmd_clear_huge | Clears a PMD huge mapping | ++---------------------------+--------------------------------------------------+ +| pmdp_get_and_clear | Clears a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_get_and_clear_full | Clears a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_test_and_clear_young | Clears young from a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_set_wrprotect | Converts into a write protected PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_set_access_flags | Converts into a more permissive PMD | ++---------------------------+--------------------------------------------------+ + + +PUD Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pud_same | Tests whether both PUD entries are the same | ++---------------------------+--------------------------------------------------+ +| pud_bad | Tests a non-table mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_leaf | Tests a leaf mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_huge | Tests a HugeTLB mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_trans_huge | Tests a Transparent Huge Page (THP) at PUD | ++---------------------------+--------------------------------------------------+ +| pud_present | Tests a valid mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_young | Tests a young PUD | ++---------------------------+--------------------------------------------------+ +| pud_dirty | Tests a dirty PUD | ++---------------------------+--------------------------------------------------+ +| pud_write | Tests a writable PUD | ++---------------------------+--------------------------------------------------+ +| pud_devmap | Tests a ZONE_DEVICE mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkyoung | Creates a young PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkold | Creates an old PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkdirty | Creates a dirty PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkclean | Creates a clean PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkwrite | Creates a writable PUD | 
++---------------------------+--------------------------------------------------+ +| pud_wrprotect | Creates a write protected PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkinvalid | Invalidates a mapped PUD [1] | ++---------------------------+--------------------------------------------------+ +| pud_set_huge | Creates a PUD huge mapping | ++---------------------------+--------------------------------------------------+ +| pud_clear_huge | Clears a PUD huge mapping | ++---------------------------+--------------------------------------------------+ +| pudp_get_and_clear | Clears a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_get_and_clear_full | Clears a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_test_and_clear_young | Clears young from a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_set_wrprotect | Converts into a write protected PUD | ++---------------------------+--------------------------------------------------+ +| pudp_set_access_flags | Converts into a more permissive PUD | ++---------------------------+--------------------------------------------------+ + + +HugeTLB Page Table Helpers +========================== + ++---------------------------+--------------------------------------------------+ +| pte_huge | Tests a HugeTLB | ++---------------------------+--------------------------------------------------+ +| pte_mkhuge | Creates a HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_dirty | Tests a dirty HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_write | Tests a writable HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_mkdirty | Creates a dirty HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_mkwrite | Creates a writable HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_wrprotect | Creates a write protected HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_get_and_clear | Clears a HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_set_wrprotect | Converts into a write protected HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB | ++---------------------------+--------------------------------------------------+ + + +SWAP Page Table Helpers +======================== + ++---------------------------+--------------------------------------------------+ +| __pte_to_swp_entry | Creates a swapped entry (arch) from a mapped PTE | ++---------------------------+--------------------------------------------------+ +| __swp_to_pte_entry | Creates a mapped PTE from a swapped entry (arch) | ++---------------------------+--------------------------------------------------+ +| __pmd_to_swp_entry | Creates a swapped entry (arch) from a mapped PMD | ++---------------------------+--------------------------------------------------+ +| __swp_to_pmd_entry | Creates a mapped PMD from a 
swapped entry (arch) | ++---------------------------+--------------------------------------------------+ +| is_migration_entry | Tests a migration (read or write) swapped entry | ++-------------------------------+----------------------------------------------+ +| is_writable_migration_entry | Tests a write migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_readable_migration_entry | Creates a read migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_writable_migration_entry | Creates a write migration swapped entry | ++-------------------------------+----------------------------------------------+ + +[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst new file mode 100644 index 000000000000..6a1fadf3e173 --- /dev/null +++ b/Documentation/mm/balance.rst @@ -0,0 +1,102 @@ +.. _balance: + +================ +Memory Balancing +================ + +Started Jan 2000 by Kanoj Sarcar + +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. + +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. + +__GFP_IO allocation requests are made to prevent file system deadlocks. + +In the absence of non sleepable allocation requests, it seems detrimental +to be doing balancing. Page reclamation can be kicked off lazily, that +is, only when needed (aka zone free memory is 0), instead of making it +a proactive process. + +That being said, the kernel should try to fulfill requests for direct +mapped pages from the direct mapped pool, instead of falling back on +the dma pool, so as to keep the dma pool filled for dma requests (atomic +or not). A similar argument applies to highmem and direct mapped pages. +OTOH, if there is a lot of free dma pages, it is preferable to satisfy +regular memory requests by allocating one from the dma pool, instead +of incurring the overhead of regular zone balancing. + +In 2.2, memory balancing/page reclamation would kick off only when the +_total_ number of free pages fell below 1/64 th of total memory. With the +right ratio of dma and regular memory, it is quite possible that balancing +would not be done even when the dma zone was completely empty. 2.2 has +been running production machines of varying memory sizes, and seems to be +doing fine even with the presence of this problem. In 2.3, due to +HIGHMEM, this problem is aggravated. + +In 2.3, zone balancing can be done in one of two ways: depending on the +zone size (and possibly of the size of lower class zones), we can decide +at init time how many free pages we should aim for while balancing any +zone. The good part is, while balancing, we do not need to look at sizes +of lower class zones, the bad part is, we might do too frequent balancing +due to ignoring possibly lower usage in the lower class zones. Also, +with a slight change in the allocation routine, it is possible to reduce +the memclass() macro to be a simple equality. 
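[Editor's note: before the alternative criterion is described below, here is a rough sketch of the two balancing checks being weighed. The types and names are hypothetical, and the 1/64 ratio is taken from the 2.2 behaviour described earlier, not from actual 2.3 code.]

    /* Hypothetical per-zone bookkeeping for the sketch. */
    struct zone_sketch {
            unsigned long free_pages, total_pages;
            unsigned long pages_target;     /* decided at init time */
            struct zone_sketch *lower;      /* next lower class zone */
    };

    /* First solution: per-zone target fixed at init, lower zones ignored. */
    static int zone_needs_balance_v1(struct zone_sketch *z)
    {
            return z->free_pages < z->pages_target;
    }

    /* Second solution (described next): fold in all lower class zones. */
    static int zone_needs_balance_v2(struct zone_sketch *z)
    {
            unsigned long free = 0, total = 0;

            for (; z; z = z->lower) {
                    free += z->free_pages;
                    total += z->total_pages;
            }
            return free < total / 64;
    }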
+
+Another possible solution is that we balance only when the free memory
+of a zone _and_ all its lower class zones falls below 1/64th of the
+total memory in the zone and its lower class zones. This fixes the 2.2
+balancing problem, and stays as close to 2.2 behavior as possible. Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in
+the swapcache but is not being used by anyone).
+
+kswapd also needs to know about the zones it should balance. kswapd is
+primarily needed in a situation where balancing can not be done,
+probably because all allocation requests are coming from intr context
+and all process contexts are sleeping. For 2.3, kswapd does not really
+need to balance the highmem zone, since intr context does not request
+highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
+structure to decide whether a zone needs balancing.
+
+Page stealing from process memory and shm is done if stealing the page would
+alleviate memory pressure on any zone in the page's node that has fallen below
+its watermark.
+
+watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
+are per-zone fields, used to determine when a zone needs to be balanced. When
+the number of pages falls below watermark[WMARK_MIN], the hysteresis field
+low_on_memory gets set. This stays set till the number of free pages becomes
+watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
+try to free some pages in the zone (providing GFP_WAIT is set in the request).
+Orthogonal to this is the decision to poke kswapd to free some zone pages.
+That decision is not hysteresis based, and is done when the number of free
+pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
+
+
+(Good) Ideas that I have heard:
+
+1. Dynamic experience should influence balancing: number of failed requests
+ for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
+ dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/mm/bootmem.rst b/Documentation/mm/bootmem.rst new file mode 100644 index 000000000000..eb2b31eedfa1 --- /dev/null +++ b/Documentation/mm/bootmem.rst @@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Boot Memory
+===========
diff --git a/Documentation/mm/damon/api.rst b/Documentation/mm/damon/api.rst new file mode 100644 index 000000000000..08f34df45523 --- /dev/null +++ b/Documentation/mm/damon/api.rst @@ -0,0 +1,20 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+API Reference
+=============
+
+Kernel space programs can use every feature of DAMON using the APIs below. All
+you need to do is include ``damon.h``, which is located in ``include/linux/``
+of the source tree.
+
+Structures
+==========
+
+.. kernel-doc:: include/linux/damon.h
+
+
+Functions
+=========
+
+.. kernel-doc:: mm/damon/core.c
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst new file mode 100644 index 000000000000..0cff6fac6b7e --- /dev/null +++ b/Documentation/mm/damon/design.rst @@ -0,0 +1,176 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+Design
+======
+
+Configurable Layers
+===================
+
+DAMON provides data access monitoring functionality while making the accuracy
+and the overhead controllable. The fundamental access monitoring requires
+primitives that depend on, and are optimized for, the target address space. On
+the other hand, the accuracy and overhead tradeoff mechanism, which is the core
+of DAMON, is in the pure logic space. DAMON separates the two parts in
+different layers and defines an interface that lets various low level
+primitive implementations be configured with the core logic. We call these low
+level primitive implementations monitoring operations.
+
+Due to this separated design and the configurable interface, users can extend
+DAMON for any address space by configuring the core logic with appropriate
+monitoring operations. If an appropriate one is not provided, users can
+implement the operations on their own.
+
+For example, physical memory, virtual memory, swap space, those for specific
+processes, NUMA nodes, files, and backing memory devices would be supportable.
+Also, if some architectures or devices support special optimized access check
+primitives, those will be easily configurable.
+
+
+Reference Implementations of Address Space Specific Monitoring Operations
+=========================================================================
+
+The monitoring operations are defined in two parts:
+
+1. Identification of the monitoring target address range for the address space.
+2. Access check of specific address range in the target space.
+
+DAMON currently provides the implementations of the operations for the physical
+and virtual address spaces. The two subsections below describe how those work.
+
+
+VMA-based Target Address Range Construction
+-------------------------------------------
+
+This is only for the virtual address space monitoring operations
+implementation. That for the physical address space simply asks users to
+manually set the monitoring target address ranges.
+
+Only small parts in the super-huge virtual address space of the processes are
+mapped to the physical memory and accessed. Thus, tracking the unmapped
+address regions is just wasteful. However, because DAMON can deal with some
+level of noise using the adaptive regions adjustment mechanism, tracking every
+mapping is not strictly required but could even incur a high overhead in some
+cases. That said, excessively huge unmapped areas inside the monitoring target
+should be removed so that the adaptive mechanism does not waste time on them.
+
+For this reason, this implementation converts the complex mappings to three
+distinct regions that cover every mapped area of the address space. The two
+gaps between the three regions are the two biggest unmapped areas in the given
+address space.
The two biggest unmapped areas would be the gap between the
+heap and the uppermost mmap()-ed region, and the gap between the lowermost
+mmap()-ed region and the stack in most of the cases. Because these gaps are
+exceptionally huge in usual address spaces, excluding these will be sufficient
+to make a reasonable trade-off. Below shows this in detail::
+
+    <heap>
+    <BIG UNMAPPED REGION 1>
+    <small mmap()-ed regions and munmap()-ed regions>
+    <BIG UNMAPPED REGION 2>
+    <stack>
+
+
+PTE Accessed-bit Based Access Check
+-----------------------------------
+
+Both of the implementations for physical and virtual address spaces use the PTE
+Accessed-bit for basic access checks. The only difference is the way of
+finding the relevant PTE Accessed bit(s) from the address. While the
+implementation for the virtual address walks the page table for the target task
+of the address, the implementation for the physical address walks every page
+table having a mapping to the address. In this way, the implementations find
+and clear the bit(s) for the next sampling target address and check whether the
+bit(s) are set again after one sampling period. This could disturb other kernel
+subsystems using the Accessed bits, namely Idle page tracking and the reclaim
+logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling
+the interference is the responsibility of sysadmins. However, it solves the
+conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags,
+as Idle page tracking does.
+
+
+Address Space Independent Core Mechanisms
+=========================================
+
+The four sections below describe each of the DAMON core mechanisms and the five
+monitoring attributes, ``sampling interval``, ``aggregation interval``,
+``update interval``, ``minimum number of regions``, and ``maximum number of
+regions``.
+
+
+Access Frequency Monitoring
+---------------------------
+
+The output of DAMON describes which pages are accessed, and how frequently, for
+a given duration. The resolution of the access frequency is controlled by
+setting ``sampling interval`` and ``aggregation interval``. In detail, DAMON
+checks access to each page per ``sampling interval`` and aggregates the
+results. In other words, it counts the number of accesses to each page. After
+each ``aggregation interval`` passes, DAMON calls callback functions that were
+previously registered by users so that users can read the aggregated results,
+and then clears the results. This can be described by the simple pseudo-code
+below::
+
+    while monitoring_on:
+        for page in monitoring_target:
+            if accessed(page):
+                nr_accesses[page] += 1
+        if time() % aggregation_interval == 0:
+            for callback in user_registered_callbacks:
+                callback(monitoring_target, nr_accesses)
+            for page in monitoring_target:
+                nr_accesses[page] = 0
+        sleep(sampling interval)
+
+The monitoring overhead of this mechanism will arbitrarily increase as the
+size of the target workload grows.
+
+
+Region Based Sampling
+---------------------
+
+To avoid the unbounded increase of the overhead, DAMON groups adjacent pages
+that are assumed to have the same access frequencies into a region. As long as
+the assumption (pages in a region have the same access frequencies) is kept,
+only one page in the region is required to be checked. Thus, for each
+``sampling interval``, DAMON randomly picks one page in each region, waits for
+one ``sampling interval``, checks whether the page was accessed in the
+meantime, and increases the access frequency of the region if so. Therefore,
+the monitoring overhead is controllable by setting the number of regions.
+
+This scheme, however, cannot preserve the quality of the output if the
+assumption is not guaranteed.
+
+
+Adaptive Regions Adjustment
+---------------------------
+
+Even if the initial monitoring target regions are somehow well constructed
+to fulfill the assumption (pages in the same region have similar access
+frequencies), the data access pattern can dynamically change.  This will
+result in low monitoring quality.  To keep the assumption as much as
+possible, DAMON adaptively merges and splits each region based on its access
+frequency.
+
+For each ``aggregation interval``, it compares the access frequencies of
+adjacent regions and merges those if the frequency difference is small.
+Then, after it reports and clears the aggregated access frequency of each
+region, it splits each region into two or three regions if the total number
+of regions will not exceed the user-specified maximum number of regions
+after the split.
+
+In this way, DAMON provides its best-effort quality and minimal overhead
+while keeping the bounds users set for their trade-off.
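+
+The merge step can be sketched as below.  Again this is illustrative only,
+under assumed names (``region``, ``merge()``, ``merge_threshold``), and is
+not DAMON's real implementation::
+
+    /* Illustrative only -- not DAMON's real code. */
+    static void merge_similar_neighbors(struct region *regions,
+                                        int nr_regions)
+    {
+        int i;
+
+        for (i = 0; i < nr_regions - 1; i++) {
+            unsigned int a = regions[i].nr_accesses;
+            unsigned int b = regions[i + 1].nr_accesses;
+
+            /* Adjacent regions with similar frequencies become one. */
+            if ((a > b ? a - b : b - a) <= merge_threshold)
+                merge(&regions[i], &regions[i + 1]);
+        }
+    }
+
+The split step is symmetric: each region is split into two or three
+randomly sized sub-regions, as long as the result stays within the maximum
+number of regions.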
+
+
+Dynamic Target Space Updates Handling
+-------------------------------------
+
+The monitoring target address range can change dynamically.  For example,
+virtual memory can be dynamically mapped and unmapped.  Physical memory can
+be hot-plugged.
+
+As the changes could be quite frequent in some cases, DAMON allows the
+monitoring operations to check for dynamic changes, including memory mapping
+changes, and applies them to the monitoring operations-related data
+structures, such as the abstracted monitoring target memory area, only once
+per a user-specified time interval (``update interval``).
diff --git a/Documentation/mm/damon/faq.rst b/Documentation/mm/damon/faq.rst
new file mode 100644
index 000000000000..dde7e2414ee6
--- /dev/null
+++ b/Documentation/mm/damon/faq.rst
@@ -0,0 +1,50 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Frequently Asked Questions
+==========================
+
+Why a new subsystem, instead of extending perf or other user space tools?
+=========================================================================
+
+First, because it needs to be as lightweight as possible so that it can be
+used online, any unnecessary overhead, such as the kernel - user space
+context switching cost, should be avoided.  Second, DAMON aims to be used by
+other programs including the kernel.  Therefore, having a dependency on
+specific tools like perf is not desirable.  These are the two biggest
+reasons why DAMON is implemented in the kernel space.
+
+
+Can 'idle pages tracking' or 'perf mem' substitute DAMON?
+=========================================================
+
+Idle page tracking is a low level primitive for access check of the physical
+address space.  'perf mem' is similar, though it can use sampling to
+minimize the overhead.  On the other hand, DAMON is a higher-level framework
+for the monitoring of various address spaces.  It is focused on memory
+management optimization and provides sophisticated accuracy/overhead
+handling mechanisms.  Therefore, 'idle pages tracking' and 'perf mem' could
+provide a subset of DAMON's output, but cannot substitute DAMON.
+
+
+Does DAMON support virtual memory only?
+=======================================
+
+No.  The core of DAMON is address space independent.  The address space
+specific monitoring operations, including monitoring target regions
+construction and actual access checks, can be implemented and configured on
+the DAMON core by the users.  In this way, DAMON users can monitor any
+address space with any access check technique.
+
+Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check
+based implementations of the address space dependent functions for the
+virtual memory and the physical memory by default, for reference and
+convenient use.
+
+
+Can I simply monitor page granularity?
+======================================
+
+Yes.  You can do so by setting the ``min_nr_regions`` attribute higher than
+the working set size divided by the page size.  Because the monitoring
+target regions size is forced to be ``>=page size``, the region split will
+have no effect.
diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst
new file mode 100644
index 000000000000..48c0bbff98b2
--- /dev/null
+++ b/Documentation/mm/damon/index.rst
@@ -0,0 +1,29 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+DAMON: Data Access MONitor
+==========================
+
+DAMON is a data access monitoring framework subsystem for the Linux kernel.
+The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it
+
+ - *accurate* (the monitoring output is useful enough for DRAM level memory
+   management; it might not be appropriate for CPU cache levels, though),
+ - *light-weight* (the monitoring overhead is low enough to be applied online),
+   and
+ - *scalable* (the upper-bound of the overhead is in constant range regardless
+   of the size of target workloads).
+
+Using this framework, therefore, the kernel's memory management mechanisms
+can make advanced decisions.  Experimental memory management optimization
+works that previously incurred high data access monitoring overhead could be
+implemented again.  In user space, meanwhile, users who have some special
+workloads can write personalized applications for better understanding and
+optimizations of their workloads and systems.
+
+.. toctree::
+   :maxdepth: 2
+
+   faq
+   design
+   api
diff --git a/Documentation/mm/free_page_reporting.rst b/Documentation/mm/free_page_reporting.rst
new file mode 100644
index 000000000000..8c05e62d8b2b
--- /dev/null
+++ b/Documentation/mm/free_page_reporting.rst
@@ -0,0 +1,40 @@
+.. _free_page_reporting:
+
+=====================
+Free Page Reporting
+=====================
+
+Free page reporting is an API by which a device can register to receive
+lists of pages that are currently unused by the system.  This is useful in
+the case of virtualization where a guest is then able to use this data to
+notify the hypervisor that it is no longer using certain pages in memory.
+
+For the driver, typically a balloon driver, to use this functionality it
+must allocate and initialize a page_reporting_dev_info structure.  The
+field within the structure it must populate is the "report" function
+pointer used to process the scatterlist.  It must also guarantee that it can
+handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
+call to the function.  A call to page_reporting_register will register the
+page reporting interface with the reporting framework, assuming no other
+page reporting devices are already registered.
+
+Once registered the page reporting API will begin reporting batches of
+pages to the driver.
The API will start reporting pages 2 seconds after +the interface is registered and will continue to do so 2 seconds after any +page of a sufficiently high order is freed. + +Pages reported will be stored in the scatterlist passed to the reporting +function with the final entry having the end bit set in entry nent - 1. +While pages are being processed by the report function they will not be +accessible to the allocator. Once the report function has been completed +the pages will be returned to the free area from which they were obtained. + +Prior to removing a driver that is making use of free page reporting it +is necessary to call page_reporting_unregister to have the +page_reporting_dev_info structure that is currently in use by free page +reporting removed. Doing this will prevent further reports from being +issued via the interface. If another driver or the same driver is +registered it is possible for it to resume where the previous driver had +left off in terms of reporting free pages. + +Alexander Duyck, Dec 04, 2019 diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst new file mode 100644 index 000000000000..feecc5e24477 --- /dev/null +++ b/Documentation/mm/frontswap.rst @@ -0,0 +1,266 @@ +.. _frontswap: + +========= +Frontswap +========= + +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). A "store" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "load" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further stores with that swap type. + +Once a page is successfully stored, a matching load on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. If the store returns +success, the data has been successfully saved to transcendent memory and +a disk write and, if the data is later read back, a disk read are avoided. +If a store returns failure, transcendent memory has rejected the data, and the +page can be written to swap as usual. 
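+
+As an illustration, a minimal backend following the policies above might
+look like the below sketch.  Note that the exact registration interface has
+varied across kernel versions, and the ``my_``-prefixed names are
+placeholders, not real kernel symbols::
+
+    #include <linux/frontswap.h>
+
+    static void my_init(unsigned type)
+    {
+        /* prepare bookkeeping for this swap device number ("type") */
+    }
+
+    static int my_store(unsigned type, pgoff_t offset, struct page *page)
+    {
+        /* copy the page into the pseudo-RAM device, or ... */
+        return -1;  /* ... reject it: the kernel then swaps to disk */
+    }
+
+    static int my_load(unsigned type, pgoff_t offset, struct page *page)
+    {
+        /* fill @page from the copy stored earlier, if it exists */
+        return -1;  /* not found */
+    }
+
+    static void my_invalidate_page(unsigned type, pgoff_t offset) { }
+    static void my_invalidate_area(unsigned type) { }
+
+    static struct frontswap_ops my_ops = {
+        .init = my_init,
+        .store = my_store,
+        .load = my_load,
+        .invalidate_page = my_invalidate_page,
+        .invalidate_area = my_invalidate_area,
+    };
+
+    /* from the backend's module init: */
+    frontswap_register_ops(&my_ops);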
+
+Note that if a page is stored and the page already exists in transcendent
+memory (a "duplicate" store), either the store succeeds and the data is
+overwritten, or the store fails AND the page is invalidated.  This ensures
+stale data may never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the `/sys/kernel/debug/frontswap` directory.  The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+``failed_stores``
+    how many store attempts have failed
+
+``loads``
+    how many loads were attempted (all should succeed)
+
+``succ_stores``
+    how many store attempts have succeeded
+
+``invalidates``
+    how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+===
+
+* Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable by the
+kernel.  This interface is ideal when data is transformed to a different
+form and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices).  Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device".
+
+Frontswap, with a fairly small impact on the kernel, provides a huge amount
+of flexibility for more dynamic, flexible RAM utilization in various system
+configurations:
+
+In the single kernel case, aka "zcache", pages are compressed and
+stored in local memory, thus increasing the total anonymous pages
+that can be safely kept in RAM.  Zcache essentially trades off CPU
+cycles used in compression/decompression for better memory utilization.
+Benchmarks have shown little or no impact when memory pressure is
+low while providing a significant performance improvement (25%+)
+on some workloads under high memory pressure.
+
+"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
+support for clustered systems.  Frontswap pages are locally compressed
+as in zcache, but then "remotified" to another system's RAM.  This
+allows RAM to be dynamically load-balanced back-and-forth as needed,
+i.e. when system A is overcommitted, it can swap to system B, and
+vice versa.  RAMster can also be configured as a memory server so
+many servers in a cluster can swap, dynamically as needed, to a single
+server configured with a large amount of RAM... without pre-configuring
+how much of the RAM is available for each of the clients!
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines.  This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization.  And when guest OS's are induced to surrender
+underutilized RAM (e.g.
with "selfballooning"), sudden unexpected +memory pressure may result in swapping; frontswap allows those pages +to be swapped to and from hypervisor RAM (if overall host system memory +conditions allow), thus mitigating the potentially awful performance impact +of unplanned swapping. + +A KVM implementation is underway and has been RFC'ed to lkml. And, +using frontswap, investigation is also underway on the use of NVM as +a memory extension technology. + +* Sure there may be performance advantages in some situations, but + what's the space/time overhead of frontswap? + +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into +nothingness and the only overhead is a few extra bytes per swapon'ed +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" +registers, there is one extra global variable compared to zero for +every swap page read or written. If CONFIG_FRONTSWAP is enabled +AND a frontswap backend registers AND the backend fails every "store" +request (i.e. provides no memory despite claiming it might), +CPU overhead is still negligible -- and since every frontswap fail +precedes a swap page write-to-disk, the system is highly likely +to be I/O bound and using a small fraction of a percent of a CPU +will be irrelevant anyway. + +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend +registers, one bit is allocated for every swap page for every swap +device that is swapon'd. This is added to the EIGHT bits (which +was sixteen until about 2.6.34) that the kernel already allocates +for every swap page for every swap device that is swapon'd. (Hugh +Dickins has observed that frontswap could probably steal one of +the existing eight bits, but let's worry about that minor optimization +later.) For very large swap disks (which are rare) on a standard +4K pagesize, this is 1MB per 32GB swap. + +When swap pages are stored in transcendent memory instead of written +out to disk, there is a side effect that this may create more memory +pressure that can potentially outweigh the other advantages. A +backend, such as zcache, must implement policies to carefully (but +dynamically) manage memory limits to ensure this doesn't happen. + +* OK, how about a quick overview of what this frontswap patch does + in terms that a kernel hacker can grok? + +Let's assume that a frontswap "backend" has registered during +kernel initialization; this registration indicates that this +frontswap backend has access to some "memory" that is not directly +accessible by the kernel. Exactly how much memory it provides is +entirely dynamic and random. + +Whenever a swap-device is swapon'd frontswap_init() is called, +passing the swap device number (aka "type") as a parameter. +This notifies frontswap to expect attempts to "store" swap pages +associated with that number. + +Whenever the swap subsystem is readying a page to write to a swap +device (c.f swap_writepage()), frontswap_store is called. Frontswap +consults with the frontswap backend and if the backend says it does NOT +have room, frontswap_store returns -1 and the kernel swaps the page +to the swap device as normal. Note that the response from the frontswap +backend is unpredictable to the kernel; it may choose to never accept a +page, it could accept every ninth page, or it might accept every +page. But if the backend does accept a page, the data from the page +has already been copied and associated with the type and offset, +and the backend guarantees the persistence of the data. 
In this case, +frontswap sets a bit in the "frontswap_map" for the swap device +corresponding to the page offset on the swap device to which it would +otherwise have written the data. + +When the swap subsystem needs to swap-in a page (swap_readpage()), +it first calls frontswap_load() which checks the frontswap_map to +see if the page was earlier accepted by the frontswap backend. If +it was, the page of data is filled from the frontswap backend and +the swap-in is complete. If not, the normal swap-in code is +executed to obtain the page of data from the real swap device. + +So every time the frontswap backend accepts a page, a swap device read +and (potentially) a swap device write are replaced by a "frontswap backend +store" and (possibly) a "frontswap backend loads", which are presumably much +faster. + +* Can't frontswap be configured as a "special" swap device that is + just higher priority than any real swap device (e.g. like zswap, + or maybe swap-over-nbd/NFS)? + +No. First, the existing swap subsystem doesn't allow for any kind of +swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, +but this would require fairly drastic changes. Even if it were +rewritten, the existing swap subsystem uses the block I/O layer which +assumes a swap device is fixed size and any page in it is linearly +addressable. Frontswap barely touches the existing swap subsystem, +and works around the constraints of the block I/O subsystem to provide +a great deal of flexibility and dynamicity. + +For example, the acceptance of any swap page by the frontswap backend is +entirely unpredictable. This is critical to the definition of frontswap +backends because it grants completely dynamic discretion to the +backend. In zcache, one cannot know a priori how compressible a page is. +"Poorly" compressible pages can be rejected, and "poorly" can itself be +defined dynamically depending on current memory constraints. + +Further, frontswap is entirely synchronous whereas a real swap +device is, by definition, asynchronous and uses block I/O. The +block I/O layer is not only unnecessary, but may perform "optimizations" +that are inappropriate for a RAM-oriented device including delaying +the write of some pages for a significant amount of time. Synchrony is +required to ensure the dynamicity of the backend and to avoid thorny race +conditions that would unnecessarily and greatly complicate frontswap +and/or the block I/O subsystem. That said, only the initial "store" +and "load" operations need be synchronous. A separate asynchronous thread +is free to manipulate the pages stored by frontswap. For example, +the "remotification" thread in RAMster uses standard asynchronous +kernel sockets to move compressed frontswap pages to a remote machine. +Similarly, a KVM guest-side implementation could do in-guest compression +and use "batched" hypercalls. + +In a virtualized environment, the dynamicity allows the hypervisor +(or host OS) to do "intelligent overcommit". For example, it can +choose to accept pages only until host-swapping might be imminent, +then force guests to do their own swapping. + +There is a downside to the transcendent memory specifications for +frontswap: Since any "store" might fail, there must always be a real +slot on a real swap device to swap the page. Thus frontswap must be +implemented as a "shadow" to every swapon'd device with the potential +capability of holding every page that the swap device might have held +and the possibility that it might hold no pages at all. 
This means
+that frontswap cannot contain more pages than the total of swapon'd
+swap devices.  For example, if NO swap device is configured on some
+installation, frontswap is useless.  Swapless portable devices
+can still use frontswap but a backend for such devices must configure
+some kind of "ghost" swap device and ensure that it is never used.
+
+* Why this weird definition about "duplicate stores"?  If a page
+  has been previously successfully stored, can't it always be
+  successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot.  Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K.  Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K.  But the backend
+has no more space.  In this case, the store must be rejected.  Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible.  Since the
+swap subsystem then writes the new data to the real swap device,
+this is the correct course of action to ensure coherency.
+
+* Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global.  This seemed a reasonable compromise:  Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst
new file mode 100644
index 000000000000..c9887f241c6c
--- /dev/null
+++ b/Documentation/mm/highmem.rst
@@ -0,0 +1,167 @@
+.. _highmem:
+
+====================
+High Memory Handling
+====================
+
+By: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+.. contents:: :local:
+
+What Is High Memory?
+====================
+
+High memory (highmem) is used when the size of physical memory approaches or
+exceeds the maximum size of virtual memory.  At that point it becomes
+impossible for the kernel to keep all of the available physical memory
+mapped at all times.  This means the kernel needs to start using temporary
+mappings of the pieces of physical memory that it wants to access.
+
+The part of (physical) memory not covered by a permanent mapping is what we
+refer to as 'highmem'.  There are various architecture dependent constraints
+on where exactly that border lies.
+
+In the i386 arch, for example, we choose to map the kernel into every
+process's VM space so that we don't have to pay the full TLB invalidation
+costs for kernel entry/exit.  This means the available virtual memory space
+(4GiB on i386) has to be divided between user and kernel space.
+
+The traditional split for architectures using this approach is 3:1, 3GiB for
+userspace and the top 1GiB for kernel space::
+
+		+--------+ 0xffffffff
+		| Kernel |
+		+--------+ 0xc0000000
+		|        |
+		| User   |
+		|        |
+		+--------+ 0x00000000
+
+This means that the kernel can at most map 1GiB of physical memory at any
+one time, but because we need virtual address space for other things -
+including temporary maps to access the rest of the physical memory - the
+actual direct map will typically be less (usually around ~896MiB).
+
+Other architectures that have mm context tagged TLBs can have separate
+kernel and user maps.  Some hardware (like some ARMs), however, have limited
+virtual space when they use mm context tags.
+
+
+Temporary Virtual Mappings
+==========================
+
+The kernel contains several ways of creating temporary mappings.  The
+following list shows them in order of preference of use.
+
+* kmap_local_page().  This function is used to create short term mappings.
+  It can be invoked from any context (including interrupts) but the mappings
+  can only be used in the context which acquired them.
+
+  This function should be preferred, where feasible, over all the others.
+
+  These mappings are thread-local and CPU-local, meaning that the mapping
+  can only be accessed from within this thread and the thread is bound to
+  the CPU while the mapping is active.  Even if the thread is preempted
+  (since preemption is never disabled by the function) the CPU cannot be
+  unplugged from the system via CPU-hotplug until the mapping is disposed.
+
+  It's valid to take pagefaults in a local kmap region, unless the context
+  in which the local mapping is acquired does not allow it for other
+  reasons.
+
+  kmap_local_page() always returns a valid virtual address and it is assumed
+  that kunmap_local() will never fail.
+
+  Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a
+  certain extent (up to KMAP_TYPE_NR) but their invocations have to be
+  strictly ordered because the map implementation is stack based.  See
+  kmap_local_page() kdocs (included in the "Functions" section) for details
+  on how to manage nested mappings.
+
+* kmap_atomic().  This permits a very short duration mapping of a single
+  page.  Since the mapping is restricted to the CPU that issued it, it
+  performs well, but the issuing task is therefore required to stay on that
+  CPU until it has finished, lest some other task displace its mappings.
+
+  kmap_atomic() may also be used by interrupt contexts, since it does not
+  sleep and the callers too may not sleep until after kunmap_atomic() is
+  called.
+
+  Each call of kmap_atomic() in the kernel creates a non-preemptible section
+  and disables pagefaults.  This could be a source of unwanted latency.
+  Therefore users should prefer kmap_local_page() instead of kmap_atomic().
+
+  It is assumed that k[un]map_atomic() won't fail.
+
+* kmap().  This should be used to make short duration mapping of a single
+  page with no restrictions on preemption or migration.  It comes with an
+  overhead as mapping space is restricted and protected by a global lock
+  for synchronization.  When mapping is no longer needed, the address that
+  the page was mapped to must be released with kunmap().
+
+  Mapping changes must be propagated across all the CPUs.  kmap() also
+  requires global TLB invalidation when the kmap's pool wraps and it might
+  block when the mapping space is fully utilized until a slot becomes
+  available.  Therefore, kmap() is only callable from preemptible context.
+
+  All the above work is necessary if a mapping must last for a relatively
+  long time but the bulk of high-memory mappings in the kernel are
+  short-lived and only used in one place.  This means that the cost of
+  kmap() is mostly wasted in such cases.  kmap() was not intended for long
+  term mappings but it has morphed in that direction and its use is
+  strongly discouraged in newer code and the set of the preceding functions
+  should be preferred.
+
+  On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap()
+  have no real work to do because a 64-bit address space is more than
+  sufficient to address all the physical memory whose pages are permanently
+  mapped.
+
+* vmap().
This can be used to make a long duration mapping of multiple + physical pages into a contiguous virtual space. It needs global + synchronization to unmap. + + +Cost of Temporary Mappings +========================== + +The cost of creating temporary mappings can be quite high. The arch has to +manipulate the kernel's page tables, the data TLB and/or the MMU's registers. + +If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping +simply with a bit of arithmetic that will convert the page struct address into +a pointer to the page contents rather than juggling mappings about. In such a +case, the unmap operation may be a null operation. + +If CONFIG_MMU is not set, then there can be no temporary mappings and no +highmem. In such a case, the arithmetic approach will also be used. + + +i386 PAE +======== + +The i386 arch, under some circumstances, will permit you to stick up to 64GiB +of RAM into your 32-bit machine. This has a number of consequences: + +* Linux needs a page-frame structure for each page in the system and the + pageframes need to live in the permanent mapping, which means: + +* you can have 896M/sizeof(struct page) page-frames at most; with struct + page being 32-bytes that would end up being something in the order of 112G + worth of pages; the kernel, however, needs to store more than just + page-frames in that memory... + +* PAE makes your page tables larger - which slows the system down as more + data has to be accessed to traverse in TLB fills and the like. One + advantage is that PAE has more PTE bits and can provide advanced features + like NX and PAT. + +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. + + +Functions +========= + +.. kernel-doc:: include/linux/highmem.h +.. kernel-doc:: include/linux/highmem-internal.h diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst new file mode 100644 index 000000000000..f2a59ed82ed3 --- /dev/null +++ b/Documentation/mm/hmm.rst @@ -0,0 +1,452 @@ +.. _hmm: + +===================================== +Heterogeneous Memory Management (HMM) +===================================== + +Provide infrastructure and helpers to integrate non-conventional memory (device +memory like GPU on board memory) into regular kernel path, with the cornerstone +of this being specialized struct page for such memory (see sections 5 to 7 of +this document). + +HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., +allowing a device to transparently access program addresses coherently with +the CPU meaning that any valid pointer on the CPU is also a valid pointer +for the device. This is becoming mandatory to simplify the use of advanced +heterogeneous computing where GPU, DSP, or FPGA are used to perform various +computations on behalf of a process. + +This document is divided as follows: in the first section I expose the problems +related to using device specific memory allocators. In the second section, I +expose the hardware limitations that are inherent to many platforms. The third +section gives an overview of the HMM design. The fourth section explains how +CPU page-table mirroring works and the purpose of HMM in this context. The +fifth section deals with how device memory is represented inside the kernel. +Finally, the last section presents a new migration helper that allows +leveraging the device DMA engine. 
+ +.. contents:: :local: + +Problems of using a device specific memory allocator +==================================================== + +Devices with a large amount of on board memory (several gigabytes) like GPUs +have historically managed their memory through dedicated driver specific APIs. +This creates a disconnect between memory allocated and managed by a device +driver and regular application memory (private anonymous, shared memory, or +regular file backed memory). From here on I will refer to this aspect as split +address space. I use shared address space to refer to the opposite situation: +i.e., one in which any application memory region can be used by a device +transparently. + +Split address space happens because devices can only access memory allocated +through a device specific API. This implies that all memory objects in a program +are not equal from the device point of view which complicates large programs +that rely on a wide set of libraries. + +Concretely, this means that code that wants to leverage devices like GPUs needs +to copy objects between generically allocated memory (malloc, mmap private, mmap +share) and memory allocated through the device driver API (this still ends up +with an mmap but of the device file). + +For flat data sets (array, grid, image, ...) this isn't too hard to achieve but +for complex data sets (list, tree, ...) it's hard to get right. Duplicating a +complex data set needs to re-map all the pointer relations between each of its +elements. This is error prone and programs get harder to debug because of the +duplicate data set and addresses. + +Split address space also means that libraries cannot transparently use data +they are getting from the core program or another library and thus each library +might have to duplicate its input data set using the device specific memory +allocator. Large projects suffer from this and waste resources because of the +various memory copies. + +Duplicating each library API to accept as input or output memory allocated by +each device specific allocator is not a viable option. It would lead to a +combinatorial explosion in the library entry points. + +Finally, with the advance of high level language constructs (in C++ but in +other languages too) it is now possible for the compiler to leverage GPUs and +other devices without programmer knowledge. Some compiler identified patterns +are only do-able with a shared address space. It is also more reasonable to use +a shared address space for all other patterns. + + +I/O bus, device memory characteristics +====================================== + +I/O buses cripple shared address spaces due to a few limitations. Most I/O +buses only allow basic memory access from device to main memory; even cache +coherency is often optional. Access to device memory from a CPU is even more +limited. More often than not, it is not cache coherent. + +If we only consider the PCIE bus, then a device can access main memory (often +through an IOMMU) and be cache coherent with the CPUs. However, it only allows +a limited set of atomic operations from the device on main memory. This is worse +in the other direction: the CPU can only access a limited range of the device +memory and cannot perform atomic operations on it. Thus device memory cannot +be considered the same as regular memory from the kernel point of view. + +Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 +and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). 
+The final limitation is latency. Access to main memory from the device has an +order of magnitude higher latency than when the device accesses its own memory. + +Some platforms are developing new I/O buses or additions/modifications to PCIE +to address some of these limitations (OpenCAPI, CCIX). They mainly allow +two-way cache coherency between CPU and device and allow all atomic operations the +architecture supports. Sadly, not all platforms are following this trend and +some major architectures are left without hardware solutions to these problems. + +So for shared address space to make sense, not only must we allow devices to +access any memory but we must also permit any memory to be migrated to device +memory while the device is using it (blocking CPU access while it happens). + + +Shared address space and migration +================================== + +HMM intends to provide two main features. The first one is to share the address +space by duplicating the CPU page table in the device page table so the same +address points to the same physical memory for any valid main memory address in +the process address space. + +To achieve this, HMM offers a set of helpers to populate the device page table +while keeping track of CPU page table updates. Device page table updates are +not as easy as CPU page table updates. To update the device page table, you must +allocate a buffer (or use a pool of pre-allocated buffers) and write GPU +specific commands in it to perform the update (unmap, cache invalidations, and +flush, ...). This cannot be done through common code for all devices. Hence +why HMM provides helpers to factor out everything that can be while leaving the +hardware specific details to the device driver. + +The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that +allows allocating a struct page for each page of device memory. Those pages +are special because the CPU cannot map them. However, they allow migrating +main memory to device memory using existing migration mechanisms and everything +looks like a page that is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm +mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +memory for the device memory and second to perform migration. Policy decisions +of what and when to migrate is left to the device driver. + +Note that any CPU access to a device page triggers a page fault and a migration +back to main memory. For example, when a page backing a given CPU address A is +migrated from a main memory page to a device page, then any CPU access to +address A triggers a page fault and initiates a migration back to main memory. + +With these two features, HMM not only allows a device to mirror process address +space and keeps both CPU and device page tables synchronized, but also +leverages device memory by migrating the part of the data set that is actively being +used by the device. + + +Address space mirroring implementation and API +============================================== + +Address space mirroring's main objective is to allow duplication of a range of +CPU page table into a device page table; HMM helps keep both synchronized. 
A
+device driver that wants to mirror a process address space must start with
+the registration of a mmu_interval_notifier::
+
+ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+				  struct mm_struct *mm, unsigned long start,
+				  unsigned long length,
+				  const struct mmu_interval_notifier_ops *ops);
+
+During the ops->invalidate() callback the device driver must perform the
+update action to the range (mark range read only, or fully unmap, etc.).
+The device must complete the update before the driver callback returns.
+
+When the device driver wants to populate a range of virtual addresses, it
+can use::
+
+  int hmm_range_fault(struct hmm_range *range);
+
+It will trigger a page fault on missing or read-only entries if write access
+is requested (see below).  Page faults use the generic mm page fault code
+path just like a CPU page fault.
+
+Both functions copy CPU page table entries into their pfns array argument.
+Each entry in that array corresponds to an address in the virtual range.
+HMM provides a set of flags to help the driver identify special CPU page
+table entries.
+
+Locking within the sync_cpu_device_pagetables() callback is the most
+important aspect the driver must respect in order to keep things properly
+synchronized.  The usage pattern is::
+
+ int driver_populate_range(...)
+ {
+      struct hmm_range range;
+      ...
+
+      range.notifier = &interval_sub;
+      range.start = ...;
+      range.end = ...;
+      range.hmm_pfns = ...;
+
+      if (!mmget_not_zero(interval_sub->notifier.mm))
+          return -EFAULT;
+
+ again:
+      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+      mmap_read_lock(mm);
+      ret = hmm_range_fault(&range);
+      if (ret) {
+          mmap_read_unlock(mm);
+          if (ret == -EBUSY)
+              goto again;
+          return ret;
+      }
+      mmap_read_unlock(mm);
+
+      take_lock(driver->update);
+      if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      /* Use pfns array content to update device page table,
+       * under the update lock */
+
+      release_lock(driver->update);
+      return 0;
+ }
+
+The driver->update lock is the same lock that the driver takes inside its
+invalidate() callback.  That lock must be held before calling
+mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
+update.
+
+Leverage default_flags and pfn_flags_mask
+=========================================
+
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that
+specify fault or snapshot policy for the whole range instead of having to
+set them for each entry in the pfns array.
+
+For instance if the device driver wants pages for a range with at least read
+permission, it sets::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above.  This will fault in all
+pages in the range with at least read permission.
+
+Now let's say the driver wants to do the same except for one page in the
+range for which it wants to have write permission.  Now the driver sets::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
+    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
+
+With this, HMM will fault in all pages with at least read (i.e., valid) and
+for the address == range->start + (index_of_write << PAGE_SHIFT) it will
+fault with write permission, i.e., if the CPU pte does not have write
+permission set then HMM will call handle_mm_fault().
+
+After hmm_range_fault completes, the flag bits are set to the current state
+of the page tables, i.e., HMM_PFN_VALID | HMM_PFN_WRITE will be set if the
+page is writable.
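+
+As a sketch only: once hmm_range_fault() has succeeded and while the
+driver-side lock described above is held, the driver can walk the snapshot
+to program its own page tables.  ``device_map()`` below is a placeholder
+for driver-specific code, not a kernel API::
+
+    unsigned long i, npages = (range.end - range.start) >> PAGE_SHIFT;
+
+    for (i = 0; i < npages; i++) {
+        unsigned long pfn_flags = range.hmm_pfns[i];
+
+        if (!(pfn_flags & HMM_PFN_VALID))
+            continue;   /* nothing was snapshotted for this address */
+
+        /* program one device PTE, honoring the CPU PTE's write permission */
+        device_map(drvdata, range.start + (i << PAGE_SHIFT),
+                   hmm_pfn_to_page(pfn_flags),
+                   pfn_flags & HMM_PFN_WRITE);
+    }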
+
+
+Represent and manage device memory from core kernel point of view
+==================================================================
+
+Several different designs were tried to support device memory.  The first
+one used a device specific data structure to keep information about migrated
+memory and HMM hooked itself in various places of mm code to handle any
+access to addresses that were backed by device memory.  It turns out that
+this ended up replicating most of the fields of struct page and also needed
+many kernel code paths to be updated to understand this new kind of memory.
+
+Most kernel code paths never try to access the memory behind a page
+but only care about struct page contents.  Because of this, HMM switched to
+directly using struct page for device memory which left most kernel code
+paths unaware of the difference.  We only need to make sure that no one ever
+tries to map those pages from the CPU side.
+
+Migration to and from device memory
+===================================
+
+Because the CPU cannot access device memory directly, the device driver must
+use hardware DMA or device specific load/store instructions to migrate data.
+The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize()
+functions are designed to make drivers easier to write and to centralize
+common code across drivers.
+
+Before migrating pages to device private memory, special device private
+``struct page`` entries need to be created.  These will be used as special
+"swap" page table entries so that a CPU process will fault if it tries to
+access a page that has been migrated to device private memory.
+
+These can be allocated and freed with::
+
+    struct resource *res;
+    struct dev_pagemap pagemap;
+
+    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
+                                  "name of driver resource");
+    pagemap.type = MEMORY_DEVICE_PRIVATE;
+    pagemap.range.start = res->start;
+    pagemap.range.end = res->end;
+    pagemap.nr_range = 1;
+    pagemap.ops = &device_devmem_ops;
+    memremap_pages(&pagemap, numa_node_id());
+
+    memunmap_pages(&pagemap);
+    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
+
+There are also devm_request_free_mem_region(), devm_memremap_pages(),
+devm_memunmap_pages(), and devm_release_mem_region() when the resources can
+be tied to a ``struct device``.
+
+The overall migration steps are similar to migrating NUMA pages within
+system memory (see :ref:`Page migration <page_migration>`) but the steps are
+split between device driver specific code and shared common code:
+
+1. ``mmap_read_lock()``
+
+   The device driver has to pass a ``struct vm_area_struct`` to
+   migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to
+   be held for the duration of the migration.
+
+2. ``migrate_vma_setup(struct migrate_vma *args)``
+
+   The device driver initializes the ``struct migrate_vma`` fields and
+   passes the pointer to migrate_vma_setup().  The ``args->flags`` field is
+   used to filter which source pages should be migrated.  For example,
+   setting ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and
+   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in
+   device private memory.  If the latter flag is set, the
+   ``args->pgmap_owner`` field is used to identify device private pages
+   owned by the driver.
+   This avoids trying to migrate device private pages residing in other
+   devices.  Currently only anonymous private VMA ranges can be migrated to
+   or from system memory and device private memory.
+
+   One of the first steps migrate_vma_setup() does is to invalidate other
+   device's MMUs with the ``mmu_notifier_invalidate_range_start()`` and
+   ``mmu_notifier_invalidate_range_end()`` calls around the page table
+   walks to fill in the ``args->src`` array with PFNs to be migrated.
+   The ``invalidate_range_start()`` callback is passed a
+   ``struct mmu_notifier_range`` with the ``event`` field set to
+   ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
+   the ``args->pgmap_owner`` field passed to migrate_vma_setup().  This
+   allows the device driver to skip the invalidation callback and only
+   invalidate device private MMU mappings that are actually migrating.
+   This is explained more in the next section.
+
+   While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()``
+   entry results in a valid "zero" PFN stored in the ``args->src`` array.
+   This lets the driver allocate device private memory and clear it instead
+   of copying a page of zeros.  Valid PTE entries to system memory or
+   device private struct pages will be locked with ``lock_page()``, isolated
+   from the LRU (if system memory since device private pages are not on
+   the LRU), unmapped from the process, and a special migration PTE is
+   inserted in place of the original PTE.
+   migrate_vma_setup() also clears the ``args->dst`` array.
+
+3. The device driver allocates destination pages and copies source pages to
+   destination pages.
+
+   The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE``
+   bit is set and skips entries that are not migrating.  The device driver
+   can also choose to skip migrating a page by not filling in the ``dst``
+   array for that page.
+
+   The driver then allocates either a device private struct page or a
+   system memory page, locks the page with ``lock_page()``, and fills in the
+   ``dst`` array entry with::
+
+     dst[i] = migrate_pfn(page_to_pfn(dpage));
+
+   Now that the driver knows that this page is being migrated, it can
+   invalidate device private MMU mappings and copy device private memory
+   to system memory or another device private page.  The core Linux kernel
+   handles CPU page table invalidations so the device driver only has to
+   invalidate its own MMU mappings.
+
+   The driver can use ``migrate_pfn_to_page(src[i])`` to get the
+   ``struct page`` of the source and either copy the source page to the
+   destination or clear the destination device private memory if the pointer
+   is ``NULL`` meaning the source page was not populated in system memory.
+
+4. ``migrate_vma_pages()``
+
+   This step is where the migration is actually "committed".
+
+   If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this
+   is where the newly allocated page is inserted into the CPU's page table.
+   This can fail if a CPU thread faults on the same page.  However, the page
+   table is locked and only one of the new pages will be inserted.
+   The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is
+   cleared if it loses the race.
+
+   If the source page was locked, isolated, etc. the source ``struct page``
+   information is now copied to the destination ``struct page``, finalizing
+   the migration on the CPU side.
+
+5. Device driver updates device MMU page tables for pages still migrating,
+   rolling back pages not migrating.
+
+   If the ``src`` entry still has the ``MIGRATE_PFN_MIGRATE`` bit set, the
+   device driver can update the device MMU and set the write enable bit if
+   the ``MIGRATE_PFN_WRITE`` bit is set.
+
+6. ``migrate_vma_finalize()``
+
+   This step replaces the special migration page table entry with the new
+   page's page table entry and releases the reference to the source and
+   destination ``struct page``.
+
+7. ``mmap_read_unlock()``
+
+   The lock can now be released.
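+
+The below is a condensed sketch of the flow described in steps 1-7 above.
+It is illustrative only: ``dev_alloc_page()`` and ``dev_copy_or_clear()``
+stand in for driver-specific code, and all error handling is omitted::
+
+    struct migrate_vma args = {
+        .vma         = vma,
+        .start       = start,
+        .end         = end,
+        .src         = src_pfns,
+        .dst         = dst_pfns,
+        .pgmap_owner = drvdata,
+        .flags       = MIGRATE_VMA_SELECT_SYSTEM,
+    };
+    unsigned long i, npages = (end - start) >> PAGE_SHIFT;
+
+    mmap_read_lock(mm);                              /* step 1 */
+    if (migrate_vma_setup(&args))                    /* step 2 */
+        goto out_unlock;
+
+    for (i = 0; i < npages; i++) {                   /* step 3 */
+        struct page *dpage;
+
+        if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
+            continue;
+        dpage = dev_alloc_page();                    /* driver specific */
+        lock_page(dpage);
+        dev_copy_or_clear(dpage, migrate_pfn_to_page(args.src[i]));
+        args.dst[i] = migrate_pfn(page_to_pfn(dpage));
+    }
+
+    migrate_vma_pages(&args);                        /* step 4 */
+    /* step 5: update the device MMU for entries still migrating */
+    migrate_vma_finalize(&args);                     /* step 6 */
+ out_unlock:
+    mmap_read_unlock(mm);                            /* step 7 */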
+
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to
+implement atomic access to system memory.  To support atomic operations to a
+shared virtual memory page such a device needs access to that page which is
+exclusive of any userspace access from the CPU.  The
+``make_device_exclusive_range()`` function can be used to make a memory
+range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries.  Any attempt to access the swap entry results in a fault which is
+resolved by replacing the entry with the original mapping.  A driver gets
+notified that the mapping has been changed by MMU notifiers, after which
+point it will no longer have exclusive access to the page.  Exclusive access
+is guaranteed to last until the driver drops the page lock and page
+reference, at which point any CPU faults on the page may proceed as
+described.
+
+Memory cgroup (memcg) and rss accounting
+========================================
+
+For now, device memory is accounted as any regular page in rss counters
+(either anonymous if device page is used for anonymous, file if device page
+is used for file backed page, or shmem if device page is used for shared
+memory).  This is a deliberate choice to keep existing applications, that
+might start using device memory without knowing about it, running
+unimpacted.
+
+A drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory and thus not freeing
+much system memory.  We want to gather more real world experience on how
+applications and system react under memory pressure in the presence of
+device memory before deciding to account device memory differently.
+
+
+The same decision was made for memory cgroup.  Device memory pages are
+accounted against the same memory cgroup a regular page would be accounted
+to.  This does simplify migration to and from device memory.  This also
+means that migration back from device memory to regular memory cannot fail
+because it would go above the memory cgroup limit.  We might revisit this
+choice later on once we get more experience in how device memory is used and
+its impact on memory resource control.
+
+
+Note that device memory can never be pinned by a device driver nor through
+GUP and thus such memory is always free upon process exit.  Or when last
+reference is dropped in case of shared memory or file backed memory.
diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst
new file mode 100644
index 000000000000..f143954e0d05
--- /dev/null
+++ b/Documentation/mm/hugetlbfs_reserv.rst
@@ -0,0 +1,596 @@
+.. _hugetlbfs_reserve:
+
+=====================
+Hugetlbfs Reservation
+=====================
+
+Overview
+========
+
+Huge pages as described at :ref:`hugetlbpage` are typically
+preallocated for application use.  These huge pages are instantiated in a
+task's address space at page fault time if the VMA indicates huge pages are
+to be used.
If no huge page exists at page fault time, the task is sent +a SIGBUS and often dies an unhappy death. Shortly after huge page support +was added, it was determined that it would be better to detect a shortage +of huge pages at mmap() time. The idea is that if there were not enough +huge pages to cover the mapping, the mmap() would fail. This was first +done with a simple check in the code at mmap() time to determine if there +were enough free huge pages to cover the mapping. Like most things in the +kernel, the code has evolved over time. However, the basic idea was to +'reserve' huge pages at mmap() time to ensure that huge pages would be +available for page faults in that mapping. The description below attempts to +describe how huge page reserve processing is done in the v4.10 kernel. + + +Audience +======== +This description is primarily targeted at kernel developers who are modifying +hugetlbfs code. + + +The Data Structures +=================== + +resv_huge_pages + This is a global (per-hstate) count of reserved huge pages. Reserved + huge pages are only available to the task which reserved them. + Therefore, the number of huge pages generally available is computed + as (``free_huge_pages - resv_huge_pages``). +Reserve Map + A reserve map is described by the structure:: + + struct resv_map { + struct kref refs; + spinlock_t lock; + struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; + }; + + There is one reserve map for each huge page mapping in the system. + The regions list within the resv_map describes the regions within + the mapping. A region is described as:: + + struct file_region { + struct list_head link; + long from; + long to; + }; + + The 'from' and 'to' fields of the file region structure are huge page + indices into the mapping. Depending on the type of mapping, a + region in the reserv_map may indicate reservations exist for the + range, or reservations do not exist. +Flags for MAP_PRIVATE Reservations + These are stored in the bottom bits of the reservation map pointer. + + ``#define HPAGE_RESV_OWNER (1UL << 0)`` + Indicates this task is the owner of the reservations + associated with the mapping. + ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` + Indicates task originally mapping this range (and creating + reserves) has unmapped a page from this task (the child) + due to a failed COW. +Page Flags + The PagePrivate page flag is used to indicate that a huge page + reservation must be restored when the huge page is freed. More + details will be discussed in the "Freeing huge pages" section. + + +Reservation Map Location (Private or Shared) +============================================ + +A huge page mapping or segment is either private or shared. If private, +it is typically only available to a single address space (task). If shared, +it can be mapped into multiple address spaces (tasks). The location and +semantics of the reservation map is significantly different for the two types +of mappings. Location differences are: + +- For private mappings, the reservation map hangs off the VMA structure. + Specifically, vma->vm_private_data. This reserve map is created at the + time the mapping (mmap(MAP_PRIVATE)) is created. +- For shared mappings, the reservation map hangs off the inode. Specifically, + inode->i_mapping->private_data. Since shared mappings are always backed + by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode + contains a reservation map. 
As a result, the reservation map is allocated
+  when the inode is created.
+
+
+Creating Reservations
+=====================
+Reservations are created when a huge page backed shared memory segment is
+created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
+These operations result in a call to the routine hugetlb_reserve_pages()::
+
+	int hugetlb_reserve_pages(struct inode *inode,
+				  long from, long to,
+				  struct vm_area_struct *vma,
+				  vm_flags_t vm_flags)
+
+The first thing hugetlb_reserve_pages() does is check if the NORESERVE
+flag was specified in either the shmget() or mmap() call.  If NORESERVE
+was specified, then this routine returns immediately as no reservations
+are desired.
+
+The arguments 'from' and 'to' are huge page indices into the mapping or
+underlying file.  For shmget(), 'from' is always 0 and 'to' corresponds to
+the length of the segment/mapping.  For mmap(), the offset argument could
+be used to specify the offset into the underlying file.  In such a case,
+the 'from' and 'to' arguments have been adjusted by this offset.
+
+One of the big differences between PRIVATE and SHARED mappings is the way
+in which reservations are represented in the reservation map.
+
+- For shared mappings, an entry in the reservation map indicates a
+  reservation exists or did exist for the corresponding page.  As
+  reservations are consumed, the reservation map is not modified.
+- For private mappings, the lack of an entry in the reservation map
+  indicates a reservation exists for the corresponding page.  As
+  reservations are consumed, entries are added to the reservation map.
+  Therefore, the reservation map can also be used to determine which
+  reservations have been consumed.
+
+For private mappings, hugetlb_reserve_pages() creates the reservation map
+and hangs it off the VMA structure.  In addition, the HPAGE_RESV_OWNER flag
+is set to indicate this VMA owns the reservations.
+
+The reservation map is consulted to determine how many huge page
+reservations are needed for the current mapping/segment.  For private
+mappings, this is always the value (to - from).  However, for shared
+mappings it is possible that some reservations may already exist within the
+range (to - from).  See the section
+:ref:`Reservation Map Modifications <resv_map_modifications>` for details
+on how this is accomplished.
+
+The mapping may be associated with a subpool.  If so, the subpool is
+consulted to ensure there is sufficient space for the mapping.  It is
+possible that the subpool has set aside reservations that can be used for
+the mapping.  See the section :ref:`Subpool Reservations <sub_pool_resv>`
+for more details.
+
+After consulting the reservation map and subpool, the number of needed new
+reservations is known.  The routine hugetlb_acct_memory() is called to check
+for and take the requested number of reservations.  hugetlb_acct_memory()
+calls into routines that potentially allocate and adjust surplus page
+counts.  However, within those routines the code is simply checking to
+ensure there are enough free huge pages to accommodate the reservation.  If
+there are, the global reservation count resv_huge_pages is adjusted
+something like the following::
+
+	if (resv_needed <= (free_huge_pages - resv_huge_pages))
+		resv_huge_pages += resv_needed;
+
+Note that the global lock hugetlb_lock is held when checking and adjusting
+these counters.
+
+If there were enough free huge pages and the global count resv_huge_pages
+was adjusted, then the reservation map associated with the mapping is
+modified to reflect the reservations.  In the case of a shared mapping, a
+file_region will exist that includes the range 'from' - 'to'.  For private
+mappings, no modifications are made to the reservation map as lack of an
+entry indicates a reservation exists.
+
+If hugetlb_reserve_pages() was successful, the global reservation count and
+reservation map associated with the mapping will be modified as required to
+ensure reservations exist for the range 'from' - 'to'.
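+
+From a userspace point of view, the effect can be illustrated with plain
+mmap() calls (an illustrative snippet only; ``len`` is assumed to be a
+multiple of the huge page size)::
+
+	#include <sys/mman.h>
+
+	/* Reserves len / huge-page-size huge pages at mmap() time; fails
+	 * with ENOMEM if the pool cannot cover the mapping. */
+	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+
+	/* With MAP_NORESERVE, hugetlb_reserve_pages() returns immediately;
+	 * no reservation is made and a later page fault may get SIGBUS. */
+	void *q = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
+		       MAP_NORESERVE, -1, 0);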
In the case of a shared mapping, a
+file_region will exist that includes the range 'from' - 'to'. For private
+mappings, no modifications are made to the reservation map as lack of an
+entry indicates a reservation exists.
+
+If hugetlb_reserve_pages() was successful, the global reservation count and
+reservation map associated with the mapping will be modified as required to
+ensure reservations exist for the range 'from' - 'to'.
+
+.. _consume_resv:
+
+Consuming Reservations/Allocating a Huge Page
+=============================================
+
+Reservations are consumed when huge pages associated with the reservations
+are allocated and instantiated in the corresponding mapping. The allocation
+is performed within the routine alloc_huge_page()::
+
+	struct page *alloc_huge_page(struct vm_area_struct *vma,
+				     unsigned long addr, int avoid_reserve)
+
+alloc_huge_page is passed a VMA pointer and a virtual address, so it can
+consult the reservation map to determine if a reservation exists. In addition,
+alloc_huge_page takes the argument avoid_reserve which indicates reserves
+should not be used even if it appears they have been set aside for the
+specified address. The avoid_reserve argument is most often used in the case
+of Copy on Write and Page Migration where additional copies of an existing
+page are being allocated.
+
+The helper routine vma_needs_reservation() is called to determine if a
+reservation exists for the address within the mapping (vma). See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
+information on what this routine does.
+The value returned from vma_needs_reservation() is generally 0 or 1:
+0 if a reservation exists for the address and 1 if no reservation exists.
+If a reservation does not exist, and there is a subpool associated with the
+mapping, the subpool is consulted to determine if it contains reservations.
+If the subpool contains reservations, one can be used for this allocation.
+However, in every case the avoid_reserve argument overrides the use of
+a reservation for the allocation. After determining whether a reservation
+exists and can be used for the allocation, the routine dequeue_huge_page_vma()
+is called. This routine takes two arguments related to reservations:
+
+- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
+- chg, even though this argument is of type long only the values 0 or 1 are
+  passed to dequeue_huge_page_vma. If the value is 0, it indicates a
+  reservation exists (see the section "Memory Policy and Reservations" for
+  possible issues). If the value is 1, it indicates a reservation does not
+  exist and the page must be taken from the global free pool if possible.
+
+The free lists associated with the memory policy of the VMA are searched for
+a free page. If a page is found, the value free_huge_pages is decremented
+when the page is removed from the free list. If there was a reservation
+associated with the page, the following adjustments are made::
+
+	SetPagePrivate(page);	/* Indicates allocating this page consumed
+				 * a reservation, and if an error is
+				 * encountered such that the page must be
+				 * freed, the reservation will be restored. */
+
+	resv_huge_pages--;	/* Decrement the global reservation count */
+
+Note, if no huge page can be found that satisfies the VMA's memory policy
+an attempt will be made to allocate one using the buddy allocator. This
+brings up the issue of surplus huge pages and overcommit which is beyond
+the scope of reservations.
Even if a surplus page is allocated, the same
+reservation-based adjustments as above will be made: SetPagePrivate(page) and
+resv_huge_pages--.
+
+After obtaining a new huge page, (page)->private is set to the value of
+the subpool associated with the page if it exists. This will be used for
+subpool accounting when the page is freed.
+
+The routine vma_commit_reservation() is then called to adjust the reserve
+map based on the consumption of the reservation. In general, this involves
+ensuring the page is represented within a file_region structure of the region
+map. For shared mappings where the reservation was present, an entry
+in the reserve map already existed so no change is made. However, if there
+was no reservation in a shared mapping or this was a private mapping a new
+entry must be created.
+
+It is possible that the reserve map could have been changed between the call
+to vma_needs_reservation() at the beginning of alloc_huge_page() and the
+call to vma_commit_reservation() after the page was allocated. This would
+be possible if hugetlb_reserve_pages was called for the same page in a shared
+mapping. In such cases, the reservation count and subpool free page count
+will be off by one. This rare condition can be identified by comparing the
+return value from vma_needs_reservation and vma_commit_reservation. If such
+a race is detected, the subpool and global reserve counts are adjusted to
+compensate. See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
+information on these routines.
+
+
+Instantiate Huge Pages
+======================
+
+After huge page allocation, the page is typically added to the page tables
+of the allocating task. Before this, pages in a shared mapping are added
+to the page cache and pages in private mappings are added to an anonymous
+reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore,
+when a huge page that has been instantiated is freed no adjustment is made
+to the global reservation count (resv_huge_pages).
+
+
+Freeing Huge Pages
+==================
+
+Huge page freeing is performed by the routine free_huge_page(). This routine
+is the destructor for hugetlbfs compound pages. As a result, it is only
+passed a pointer to the page struct. When a huge page is freed, reservation
+accounting may need to be performed. This would be the case if the page was
+associated with a subpool that contained reserves, or the page is being freed
+on an error path where a global reserve count must be restored.
+
+The page->private field points to any subpool associated with the page.
+If the PagePrivate flag is set, it indicates the global reserve count should
+be adjusted (see the section
+:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
+for information on how these are set).
+
+The routine first calls hugepage_subpool_put_pages() for the page. If this
+routine returns a value of 0 (rather than the value 1 that was passed), it
+indicates reserves are associated with the subpool, and this newly freed page
+must be used to keep the number of subpool reserves above the minimum size.
+Therefore, the global resv_huge_pages counter is incremented in this case.
+
+If the PagePrivate flag was set in the page, the global resv_huge_pages counter
+will always be incremented.
+
+.. _sub_pool_resv:
+
+Subpool Reservations
+====================
+
+There is a struct hstate associated with each huge page size. The hstate
+tracks all huge pages of the specified size.
A subpool represents a subset +of pages within a hstate that is associated with a mounted hugetlbfs +filesystem. + +When a hugetlbfs filesystem is mounted a min_size option can be specified +which indicates the minimum number of huge pages required by the filesystem. +If this option is specified, the number of huge pages corresponding to +min_size are reserved for use by the filesystem. This number is tracked in +the min_hpages field of a struct hugepage_subpool. At mount time, +hugetlb_acct_memory(min_hpages) is called to reserve the specified number of +huge pages. If they can not be reserved, the mount fails. + +The routines hugepage_subpool_get/put_pages() are called when pages are +obtained from or released back to a subpool. They perform all subpool +accounting, and track any reservations associated with the subpool. +hugepage_subpool_get/put_pages are passed the number of huge pages by which +to adjust the subpool 'used page' count (down for get, up for put). Normally, +they return the same value that was passed or an error if not enough pages +exist in the subpool. + +However, if reserves are associated with the subpool a return value less +than the passed value may be returned. This return value indicates the +number of additional global pool adjustments which must be made. For example, +suppose a subpool contains 3 reserved huge pages and someone asks for 5. +The 3 reserved pages associated with the subpool can be used to satisfy part +of the request. But, 2 pages must be obtained from the global pools. To +relay this information to the caller, the value 2 is returned. The caller +is then responsible for attempting to obtain the additional two pages from +the global pools. + + +COW and Reservations +==================== + +Since shared mappings all point to and use the same underlying pages, the +biggest reservation concern for COW is private mappings. In this case, +two tasks can be pointing at the same previously allocated page. One task +attempts to write to the page, so a new page must be allocated so that each +task points to its own page. + +When the page was originally allocated, the reservation for that page was +consumed. When an attempt to allocate a new page is made as a result of +COW, it is possible that no free huge pages are free and the allocation +will fail. + +When the private mapping was originally created, the owner of the mapping +was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation +map of the owner. Since the owner created the mapping, the owner owns all +the reservations associated with the mapping. Therefore, when a write fault +occurs and there is no page available, different action is taken for the owner +and non-owner of the reservation. + +In the case where the faulting task is not the owner, the fault will fail and +the task will typically receive a SIGBUS. + +If the owner is the faulting task, we want it to succeed since it owned the +original reservation. To accomplish this, the page is unmapped from the +non-owning task. In this way, the only reference is from the owning task. +In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer +of the non-owning task. The non-owning task may receive a SIGBUS if it later +faults on a non-present page. But, the original owner of the +mapping/reservation will behave as expected. + + +.. _resv_map_modifications: + +Reservation Map Modifications +============================= + +The following low level routines are used to make modifications to a +reservation map. 
Typically, these routines are not called directly. Rather,
+a reservation map helper routine is called which calls one of these low level
+routines. These low level routines are fairly well documented in the source
+code (mm/hugetlb.c). These routines are::
+
+	long region_chg(struct resv_map *resv, long f, long t);
+	long region_add(struct resv_map *resv, long f, long t);
+	void region_abort(struct resv_map *resv, long f, long t);
+	long region_count(struct resv_map *resv, long f, long t);
+
+Operations on the reservation map typically involve two steps:
+
+1) region_chg() is called to examine the reserve map and determine how
+   many pages in the specified range [f, t) are NOT currently represented.
+
+   The calling code performs global checks and allocations to determine if
+   there are enough huge pages for the operation to succeed.
+
+2)
+  a) If the operation can succeed, region_add() is called to actually modify
+     the reservation map for the same range [f, t) previously passed to
+     region_chg().
+  b) If the operation can not succeed, region_abort() is called for the same
+     range [f, t) to abort the operation.
+
+Note that this is a two-step process where region_add() and region_abort()
+are guaranteed to succeed after a prior call to region_chg() for the same
+range. region_chg() is responsible for pre-allocating any data structures
+necessary to ensure the subsequent operations (specifically region_add())
+will succeed.
+
+As mentioned above, region_chg() determines the number of pages in the range
+which are NOT currently represented in the map. This number is returned to
+the caller. region_add() returns the number of pages in the range added to
+the map. In most cases, the return value of region_add() is the same as the
+return value of region_chg(). However, in the case of shared mappings it is
+possible for changes to the reservation map to be made between the calls to
+region_chg() and region_add(). In this case, the return value of region_add()
+will not match the return value of region_chg(). It is likely that in such
+cases global counts and subpool accounting will be incorrect and in need of
+adjustment. It is the responsibility of the caller to check for this condition
+and make the appropriate adjustments.
+
+The routine region_del() is called to remove regions from a reservation map.
+It is typically called in the following situations:
+
+- When a file in the hugetlbfs filesystem is being removed, the inode will
+  be released and the reservation map freed. Before freeing the reservation
+  map, all the individual file_region structures must be freed. In this case
+  region_del is passed the range [0, LONG_MAX).
+- When a hugetlbfs file is being truncated. In this case, all allocated pages
+  after the new file size must be freed. In addition, any file_region entries
+  in the reservation map past the new end of file must be deleted. In this
+  case, region_del is passed the range [new_end_of_file, LONG_MAX).
+- When a hole is being punched in a hugetlbfs file. In this case, huge pages
+  are removed from the middle of the file one at a time. As the pages are
+  removed, region_del() is called to remove the corresponding entry from the
+  reservation map. In this case, region_del is passed the range
+  [page_idx, page_idx + 1).
+
+In every case, region_del() will return the number of pages removed from the
+reservation map. In VERY rare cases, region_del() can fail.
This can only
+happen in the hole punch case where it has to split an existing file_region
+entry and can not allocate a new structure. In this error case, region_del()
+will return -ENOMEM. The problem here is that the reservation map will
+indicate that there is a reservation for the page. However, the subpool and
+global reservation counts will not reflect the reservation. To handle this
+situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
+counters so that they correspond with the reservation map entry that could
+not be deleted.
+
+region_count() is called when unmapping a private huge page mapping. In
+private mappings, the lack of an entry in the reservation map indicates that
+a reservation exists. Therefore, by counting the number of entries in the
+reservation map we know how many reservations were consumed and how many are
+outstanding (outstanding = (end - start) - region_count(resv, start, end)).
+Since the mapping is going away, the subpool and global reservation counts
+are decremented by the number of outstanding reservations.
+
+.. _resv_map_helpers:
+
+Reservation Map Helper Routines
+===============================
+
+Several helper routines exist to query and modify the reservation maps.
+These routines are only concerned with reservations for a specific huge
+page, so they just pass in an address instead of a range. In addition,
+they pass in the associated VMA. From the VMA, the type of mapping (private
+or shared) and the location of the reservation map (inode or VMA) can be
+determined. These routines simply call the underlying routines described
+in the section "Reservation Map Modifications". However, they do take into
+account the 'opposite' meaning of reservation map entries for private and
+shared mappings and hide this detail from the caller::
+
+	long vma_needs_reservation(struct hstate *h,
+				   struct vm_area_struct *vma,
+				   unsigned long addr)
+
+This routine calls region_chg() for the specified page. If no reservation
+exists, 1 is returned. If a reservation exists, 0 is returned::
+
+	long vma_commit_reservation(struct hstate *h,
+				    struct vm_area_struct *vma,
+				    unsigned long addr)
+
+This calls region_add() for the specified page. As in the case of region_chg
+and region_add, this routine is to be called after a previous call to
+vma_needs_reservation. It will add a reservation entry for the page. It
+returns 1 if the reservation was added and 0 if not. The return value should
+be compared with the return value of the previous call to
+vma_needs_reservation. An unexpected difference indicates the reservation
+map was modified between calls::
+
+	void vma_end_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+This calls region_abort() for the specified page. As in the case of region_chg
+and region_abort, this routine is to be called after a previous call to
+vma_needs_reservation. It will abort/end the in-progress reservation add
+operation::
+
+	long vma_add_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+This is a special wrapper routine to help facilitate reservation cleanup
+on error paths. It is only called from the routine restore_reserve_on_error().
+This routine is used in conjunction with vma_needs_reservation in an attempt
+to add a reservation to the reservation map. It takes into account the
+different reservation map semantics for private and shared mappings.
Hence,
+region_add is called for shared mappings (as an entry present in the map
+indicates a reservation), and region_del is called for private mappings (as
+the absence of an entry in the map indicates a reservation). See the section
+"Reservation cleanup in error paths" for more information on what needs to
+be done on error paths.
+
+
+Reservation Cleanup in Error Paths
+==================================
+
+As mentioned in the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
+map modifications are performed in two steps. First vma_needs_reservation
+is called before a page is allocated. If the allocation is successful,
+then vma_commit_reservation is called. If not, vma_end_reservation is called.
+Global and subpool reservation counts are adjusted based on success or failure
+of the operation and all is well.
+
+Additionally, after a huge page is instantiated the PagePrivate flag is
+cleared so that accounting when the page is ultimately freed is correct.
+
+However, there are several instances where errors are encountered after a huge
+page is allocated but before it is instantiated. In this case, the page
+allocation has consumed the reservation and made the appropriate subpool,
+reservation map and global count adjustments. If the page is freed at this
+time (before instantiation and clearing of PagePrivate), then free_huge_page
+will increment the global reservation count. However, the reservation map
+indicates the reservation was consumed. This resulting inconsistent state
+will cause the 'leak' of a reserved huge page. The global reserve count will
+be higher than it should be and prevent allocation of a pre-allocated page.
+
+The routine restore_reserve_on_error() attempts to handle this situation. It
+is fairly well documented. The intention of this routine is to restore
+the reservation map to the way it was before the page allocation. In this
+way, the state of the reservation map will correspond to the global reservation
+count after the page is freed.
+
+The routine restore_reserve_on_error itself may encounter errors while
+attempting to restore the reservation map entry. In this case, it will
+simply clear the PagePrivate flag of the page. In this way, the global
+reserve count will not be incremented when the page is freed. However, the
+reservation map will continue to look as though the reservation was consumed.
+A page can still be allocated for the address, but it will not use a reserved
+page as originally intended.
+
+There is some code (most notably userfaultfd) which can not call
+restore_reserve_on_error. In this case, it simply modifies the PagePrivate
+flag so that a reservation will not be leaked when the huge page is freed.
+
+
+Reservations and Memory Policy
+==============================
+Per-node huge page lists existed in struct hstate when git was first used
+to manage Linux code. The concept of reservations was added some time later.
+When reservations were added, no attempt was made to take memory policy
+into account. While cpusets are not exactly the same as memory policy, this
+comment in hugetlb_acct_memory sums up the interaction between reservations
+and cpusets/memory policy::
+
+	/*
+	 * When cpuset is configured, it breaks the strict hugetlb page
+	 * reservation as the accounting is done on a global variable. Such
+	 * reservation is completely rubbish in the presence of cpuset because
+	 * the reservation is not checked against page availability for the
+	 * current cpuset.
Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + +Huge page reservations were added to prevent unexpected page allocation +failures (OOM) at page fault time. However, if an application makes use +of cpusets or memory policy there is no guarantee that huge pages will be +available on the required nodes. This is true even if there are a sufficient +number of global reservations. + +Hugetlbfs regression testing +============================ + +The most complete set of hugetlb tests are in the libhugetlbfs repository. +If you modify any hugetlb related code, use the libhugetlbfs test suite +to check for regressions. In addition, if you add any new hugetlb +functionality, please add appropriate tests to libhugetlbfs. + +-- +Mike Kravetz, 7 April 2017 diff --git a/Documentation/mm/hwpoison.rst b/Documentation/mm/hwpoison.rst new file mode 100644 index 000000000000..b9d5253c1305 --- /dev/null +++ b/Documentation/mm/hwpoison.rst @@ -0,0 +1,184 @@ +.. hwpoison: + +======== +hwpoison +======== + +What is hwpoison? +================= + +Upcoming Intel CPUs have support for recovering from some memory errors +(``MCA recovery``). This requires the OS to declare a page "poisoned", +kill the processes associated with it and avoid using it in the future. + +This patchkit implements the necessary infrastructure in the VM. + +To quote the overview comment:: + + High level machine check handler. Handles pages reported by the + hardware as being corrupted usually due to a 2bit ECC memory or cache + failure. + + This focusses on pages detected as corrupted in the background. + When the current CPU tries to consume corruption the currently + running process can just be killed directly instead. This implies + that if the error cannot be handled for some reason it's safe to + just ignore it because no corruption has been consumed yet. Instead + when that happens another machine check will happen. + + Handles page cache pages in various states. The tricky part + here is that we can access any page asynchronous to other VM + users, because memory failures could happen anytime and anywhere, + possibly violating some of their assumptions. This is why this code + has to be extremely careful. Generally it tries to use normal locking + rules, as in get the standard locks, even if that means the + error handling takes potentially a long time. + + Some of the operations here are somewhat inefficient and have non + linear algorithmic complexity, because the data structures have not + been optimized for this case. This is in particular the case + for the mapping from a vma to a process. Since this case is expected + to be rare we hope we can get away with this. + +The code consists of a the high level handler in mm/memory-failure.c, +a new page poison bit and various checks in the VM to handle poisoned +pages. + +The main target right now is KVM guests, but it works for all kinds +of applications. KVM support requires a recent qemu-kvm release. 
+
+For the KVM use there was a need for a new signal type so that
+KVM can inject the machine check into the guest with the proper
+address. This in theory allows other applications to handle
+memory failures too. The expectation is that nearly all applications
+won't do that, but some very specialized ones might.
+
+Failure recovery modes
+======================
+
+There are two (actually three) modes memory failure recovery can be in:
+
+vm.memory_failure_recovery sysctl set to zero:
+	All memory failures cause a panic. Do not attempt recovery.
+
+early kill
+	(can be controlled globally and per process)
+	Send SIGBUS to the application as soon as the error is detected.
+	This allows applications which can process memory errors in a
+	gentle way (e.g. drop the affected object).
+	This is the mode used by KVM qemu.
+
+late kill
+	Send SIGBUS when the application runs into the corrupted page.
+	This is best for memory error unaware applications and is the
+	default.
+	Note some pages are always handled as late kill.
+
+User control
+============
+
+vm.memory_failure_recovery
+	See sysctl.txt
+
+vm.memory_failure_early_kill
+	Enable early kill mode globally
+
+PR_MCE_KILL
+	Set early/late kill mode/revert to system default
+
+	arg1: PR_MCE_KILL_CLEAR:
+		Revert to system default
+	arg1: PR_MCE_KILL_SET:
+		arg2 defines thread specific mode
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			Use system global default
+
+	Note that if you want to have a dedicated thread which handles
+	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
+	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
+	the SIGBUS is sent to the main thread.
+
+PR_MCE_KILL_GET
+	return current mode
+
+Testing
+=======
+
+* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
+  process for testing
+
+* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
+
+  corrupt-pfn
+	Inject hwpoison fault at PFN echoed into this file. This does
+	some early filtering to avoid corrupting unintended pages in
+	test suites.
+
+  unpoison-pfn
+	Software-unpoison page at PFN echoed into this file. This way
+	a page can be reused again. This only works for Linux
+	injected failures, not for real memory failures. Once any hardware
+	memory failure happens, this feature is disabled.
+
+	Note these injection interfaces are not stable and might change between
+	kernel versions
+
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	Only handle memory failures to pages associated with the file
+	system defined by block device major/minor. -1U is the
+	wildcard value. This should be only used for testing with
+	artificial injection.
+
+  corrupt-filter-memcg
+	Limit injection to pages owned by the memcg. Specified by inode
+	number of the memcg.
+
+	Example::
+
+		mkdir /sys/fs/cgroup/mem/hwpoison
+
+		usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
+
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	When specified, only poison pages if ((page_flags & mask) ==
+	value). This allows stress testing of many kinds of
+	pages. The page_flags are the same as in /proc/kpageflags.
The flag bits are defined in include/linux/kernel-page-flags.h and
+	documented in Documentation/admin-guide/mm/pagemap.rst
+
+* Architecture specific MCE injector
+
+  x86 has mce-inject, mce-test
+
+  Some portable hwpoison test programs in mce-test, see below.
+
+References
+==========
+
+http://halobates.de/mce-lc09-2.pdf
+	Overview presentation from LinuxCon 09
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	Test suite (hwpoison specific portable tests in tsrc)
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86 specific injector
+
+
+Limitations
+===========
+- Not all page types are supported and never will be. Most kernel internal
+  objects cannot be recovered, only LRU pages for now.
+
+---
+Andi Kleen, Oct 2009
diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
new file mode 100644
index 000000000000..575ccd40e30c
--- /dev/null
+++ b/Documentation/mm/index.rst
@@ -0,0 +1,68 @@
+=====================================
+Linux Memory Management Documentation
+=====================================
+
+Memory Management Guide
+=======================
+
+This is a guide to understanding the memory management subsystem
+of Linux. If you are looking for advice on simply allocating memory,
+see the :ref:`memory_allocation`. For controlling and tuning guides,
+see the :doc:`admin guide <../admin-guide/mm/index>`.
+
+.. toctree::
+   :maxdepth: 1
+
+   physical_memory
+   page_tables
+   process_addrs
+   bootmem
+   page_allocation
+   vmalloc
+   slab
+   highmem
+   page_reclaim
+   swap
+   page_cache
+   shmfs
+   oom
+
+Legacy Documentation
+====================
+
+This is a collection of older documents about the Linux memory management
+(MM) subsystem internals with different levels of detail ranging from
+notes and mailing list responses for elaborating descriptions of data
+structures and algorithms. It should all be integrated nicely into the
+above structured documentation, or deleted if it has served its purpose.
+
+.. toctree::
+   :maxdepth: 1
+
+   active_mm
+   arch_pgtable_helpers
+   balance
+   damon/index
+   free_page_reporting
+   frontswap
+   hmm
+   hwpoison
+   hugetlbfs_reserv
+   ksm
+   memory-model
+   mmu_notifier
+   numa
+   overcommit-accounting
+   page_migration
+   page_frags
+   page_owner
+   page_table_check
+   remap_file_pages
+   slub
+   split_page_table_lock
+   transhuge
+   unevictable-lru
+   vmalloced-kernel-stacks
+   vmemmap_dedup
+   z3fold
+   zsmalloc
diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst
new file mode 100644
index 000000000000..9e37add068e6
--- /dev/null
+++ b/Documentation/mm/ksm.rst
@@ -0,0 +1,87 @@
+.. _ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/
+
+The userspace interface of KSM is described in
+:ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>`
+
+Design
+======
+
+Overview
+--------
+
+.. kernel-doc:: mm/ksm.c
+   :DOC: Overview
+
+Reverse mapping
+---------------
+KSM maintains reverse mapping information for KSM pages in the stable
+tree.
+
+If a KSM page is shared between less than ``max_page_sharing`` VMAs,
+the node of the stable tree that represents such KSM page points to a
+list of struct rmap_item and the ``page->mapping`` of the
+KSM page points to the stable tree node.
+
+When the sharing passes this threshold, KSM adds a second dimension to
+the stable tree.
The tree node becomes a "chain" that links one or
+more "dups". Each "dup" keeps reverse mapping information for a KSM
+page with ``page->mapping`` pointing to that "dup".
+
+Every "chain" and all "dups" linked into a "chain" enforce the
+invariant that they represent the same write protected memory content,
+even though each "dup" is pointed to by a different KSM page copy of
+that content.
+
+This way the stable tree lookup computational complexity is unaffected
+compared to an unlimited list of reverse mappings. It is still
+enforced that there cannot be KSM page content duplicates in the
+stable tree itself.
+
+The deduplication limit enforced by ``max_page_sharing`` is required
+to keep the virtual memory rmap lists from growing too large. The rmap
+walk has O(N) complexity where N is the number of rmap_items
+(i.e. virtual mappings) that are sharing the page, which is in turn
+capped by ``max_page_sharing``. So this effectively spreads the linear
+O(N) computational complexity from rmap walk context over different
+KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
+but N is the number of stable_node "dups", not the number of
+rmap_items, so it does not have a significant impact on ksmd
+performance. In practice the best stable_node "dup" candidate will be
+kept and found at the head of the "dups" list.
+
+High values of ``max_page_sharing`` result in faster memory merging
+(because there will be fewer stable_node dups queued into the
+stable_node chain->hlist to check for pruning) and higher
+deduplication factor at the expense of slower worst case for rmap
+walks for any KSM page which can happen during swapping, compaction,
+NUMA balancing and page migration.
+
+The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
+``max_page_sharing`` tunable, and a high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to free up
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slow down the readonly computations on
+the KSM pages of the applications.
+
+The whole list of stable_node "dups" linked in the stable_node
+"chains" is scanned periodically in order to prune stale stable_nodes.
+The frequency of such scans is defined by
+``stable_node_chains_prune_millisecs`` sysfs tunable.
+
+Reference
+---------
+.. kernel-doc:: mm/ksm.c
+   :functions: mm_slot ksm_scan stable_node rmap_item
+
+--
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst
new file mode 100644
index 000000000000..3779e562dc76
--- /dev/null
+++ b/Documentation/mm/memory-model.rst
@@ -0,0 +1,177 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _physical_memory_model:
+
+=====================
+Physical Memory Model
+=====================
+
+Physical memory in a system may be addressed in different ways. The
+simplest case is when the physical memory starts at address 0 and
+spans a contiguous range up to the maximal address. It could be,
+however, that this range contains small holes that are not accessible
+for the CPU. Then there could be several contiguous ranges at
+completely distinct addresses. And, don't forget about NUMA, where
+different memory banks are attached to different CPUs.
+
+Linux abstracts this diversity using one of the two memory models:
+FLATMEM and SPARSEMEM.
Each architecture defines what +memory models it supports, what the default memory model is and +whether it is possible to manually override that default. + +All the memory models track the status of physical page frames using +struct page arranged in one or more arrays. + +Regardless of the selected memory model, there exists one-to-one +mapping between the physical page frame number (PFN) and the +corresponding `struct page`. + +Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn` +helpers that allow the conversion from PFN to `struct page` and vice +versa. + +FLATMEM +======= + +The simplest memory model is FLATMEM. This model is suitable for +non-NUMA systems with contiguous, or mostly contiguous, physical +memory. + +In the FLATMEM memory model, there is a global `mem_map` array that +maps the entire physical memory. For most architectures, the holes +have entries in the `mem_map` array. The `struct page` objects +corresponding to the holes are never fully initialized. + +To allocate the `mem_map` array, architecture specific setup code should +call :c:func:`free_area_init` function. Yet, the mappings array is not +usable until the call to :c:func:`memblock_free_all` that hands all the +memory to the page allocator. + +An architecture may free parts of the `mem_map` array that do not cover the +actual physical pages. In such case, the architecture specific +:c:func:`pfn_valid` implementation should take the holes in the +`mem_map` into account. + +With FLATMEM, the conversion between a PFN and the `struct page` is +straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the +`mem_map` array. + +The `ARCH_PFN_OFFSET` defines the first page frame number for +systems with physical memory starting at address different from 0. + +SPARSEMEM +========= + +SPARSEMEM is the most versatile memory model available in Linux and it +is the only memory model that supports several advanced features such +as hot-plug and hot-remove of the physical memory, alternative memory +maps for non-volatile memory devices and deferred initialization of +the memory map for larger systems. + +The SPARSEMEM model presents the physical memory as a collection of +sections. A section is represented with struct mem_section +that contains `section_mem_map` that is, logically, a pointer to an +array of struct pages. However, it is stored with some other magic +that aids the sections management. The section size and maximal number +of section is specified using `SECTION_SIZE_BITS` and +`MAX_PHYSMEM_BITS` constants defined by each architecture that +supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a +physical address that an architecture supports, the +`SECTION_SIZE_BITS` is an arbitrary value. + +The maximal number of sections is denoted `NR_MEM_SECTIONS` and +defined as + +.. math:: + + NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} + +The `mem_section` objects are arranged in a two-dimensional array +called `mem_sections`. The size and placement of this array depend +on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of +sections: + +* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections` + array is static and has `NR_MEM_SECTIONS` rows. Each row holds a + single `mem_section` object. +* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections` + array is dynamically allocated. Each row contains PAGE_SIZE worth of + `mem_section` objects and the number of rows is calculated to fit + all the memory sections. 
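+
+As a rough illustration, using x86_64's values (SECTION_SIZE_BITS of 27,
+MAX_PHYSMEM_BITS of 46 and a PAGE_SHIFT of 12) as an example, the section
+arithmetic sketched from the definitions above works out as follows::
+
+	#define SECTION_SIZE_BITS	27	/* 128 MiB sections */
+	#define MAX_PHYSMEM_BITS	46	/* 64 TiB physical address space */
+	#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
+
+	/* 2^(46 - 27) = 512K sections at most */
+	#define NR_MEM_SECTIONS	(1UL << (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS))
+
+	/* The high bits of a PFN select the section that maps it. */
+	static inline unsigned long pfn_to_section_nr(unsigned long pfn)
+	{
+		return pfn >> PFN_SECTION_SHIFT;
+	}
+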
+ +The architecture setup code should call sparse_init() to +initialize the memory sections and the memory maps. + +With SPARSEMEM there are two possible ways to convert a PFN to the +corresponding `struct page` - a "classic sparse" and "sparse +vmemmap". The selection is made at build time and it is determined by +the value of `CONFIG_SPARSEMEM_VMEMMAP`. + +The classic sparse encodes the section number of a page in page->flags +and uses high bits of a PFN to access the section that maps that page +frame. Inside a section, the PFN is the index to the array of pages. + +The sparse vmemmap uses a virtually mapped memory map to optimize +pfn_to_page and page_to_pfn operations. There is a global `struct +page *vmemmap` pointer that points to a virtually contiguous array of +`struct page` objects. A PFN is an index to that array and the +offset of the `struct page` from `vmemmap` is the PFN of that +page. + +To use vmemmap, an architecture has to reserve a range of virtual +addresses that will map the physical pages containing the memory +map and make sure that `vmemmap` points to that range. In addition, +the architecture should implement :c:func:`vmemmap_populate` method +that will allocate the physical memory and create page tables for the +virtual memory map. If an architecture does not have any special +requirements for the vmemmap mappings, it can use default +:c:func:`vmemmap_populate_basepages` provided by the generic memory +management. + +The virtually mapped memory map allows storing `struct page` objects +for persistent memory devices in pre-allocated storage on those +devices. This storage is represented with struct vmem_altmap +that is eventually passed to vmemmap_populate() through a long chain +of function calls. The vmemmap_populate() implementation may use the +`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to +allocate memory map on the persistent memory device. + +ZONE_DEVICE +=========== +The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer +`struct page` `mem_map` services for device driver identified physical +address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact +that the page objects for these address ranges are never marked online, +and that a reference must be taken against the device, not just the page +to keep the memory pinned for active use. `ZONE_DEVICE`, via +:c:func:`devm_memremap_pages`, performs just enough memory hotplug to +turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and +:c:func:`get_user_pages` service for the given range of pfns. Since the +page reference count never drops below 1 the page is never tracked as +free memory and the page's `struct list_head lru` space is repurposed +for back referencing to the host device / driver that mapped the memory. + +While `SPARSEMEM` presents memory as a collection of sections, +optionally collected into memory blocks, `ZONE_DEVICE` users have a need +for smaller granularity of populating the `mem_map`. Given that +`ZONE_DEVICE` memory is never marked online it is subsequently never +subject to its memory ranges being exposed through the sysfs memory +hotplug api on memory block boundaries. The implementation relies on +this lack of user-api constraint to allow sub-section sized memory +ranges to be specified to :c:func:`arch_add_memory`, the top-half of +memory hotplug. Sub-section support allows for 2MB as the cross-arch +common alignment granularity for :c:func:`devm_memremap_pages`. 
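+
+A condensed sketch of a driver using this facility is shown below; the
+struct dev_pagemap field names follow recent kernels and may differ
+between versions, the memory type is an assumption, and error handling
+is omitted::
+
+	#include <linux/memremap.h>
+
+	static struct dev_pagemap pgmap;
+
+	/* Hand a device's physical range to devm_memremap_pages() so
+	 * that pfn_to_page()/get_user_pages() work against it. */
+	static void *map_device_pages(struct device *dev,
+				      phys_addr_t base, size_t size)
+	{
+		pgmap.type = MEMORY_DEVICE_GENERIC;	/* assumption */
+		pgmap.range.start = base;
+		pgmap.range.end = base + size - 1;
+		pgmap.nr_range = 1;
+		return devm_memremap_pages(dev, &pgmap);
+	}
+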
+
+The users of `ZONE_DEVICE` are:
+
+* pmem: Map platform persistent memory to be used as a direct-I/O target
+  via DAX mappings.
+
+* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
+  event callbacks to allow a device-driver to coordinate memory management
+  events related to device-memory, typically GPU memory. See
+  Documentation/mm/hmm.rst.
+
+* p2pdma: Create `struct page` objects to allow peer devices in a
+  PCI/-E topology to coordinate direct-DMA operations between themselves,
+  i.e. bypass host memory.
diff --git a/Documentation/mm/mmu_notifier.rst b/Documentation/mm/mmu_notifier.rst
new file mode 100644
index 000000000000..df5d7777fc6b
--- /dev/null
+++ b/Documentation/mm/mmu_notifier.rst
@@ -0,0 +1,99 @@
+.. _mmu_notifier:
+
+When do you need to notify inside page table lock?
+===================================================
+
+When clearing a pte/pmd we are given a choice to notify the event under the
+page table lock (the notify version of \*_clear_flush calls
+mmu_notifier_invalidate_range()). But that notification is not necessary in
+all cases.
+
+For a secondary TLB (non-CPU TLB), such as an IOMMU TLB or a device TLB
+(when a device uses something like ATS/PASID to get the IOMMU to walk the
+CPU page table to access a process virtual address space), there are only
+2 cases when you need to notify those secondary TLBs while holding the page
+table lock when clearing a pte/pmd:
+
+  A) the page backing the address is freed before
+     mmu_notifier_invalidate_range_end()
+  B) a page table entry is updated to point to a new page (COW, write fault
+     on zero page, __replace_page(), ...)
+
+Case A is obvious: you do not want to take the risk for the device to write
+to a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence
+to happen:
+
+  - take page table lock
+  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+  - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break the memory model, like C11 or C++11,
+for the device.
+
+Consider the following scenario (the device uses a feature similar to
+ATS/PASID):
+
+Two addresses, addrA and addrB, such that \|addrA - addrB\| >= PAGE_SIZE;
+we assume they are write protected for COW (the other cases of B apply too).
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {try to write to addrA}
+ CPU-thread-1  {try to write to addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA and populate device TLB}
+ DEV-thread-2  {read addrB and populate device TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
+ CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {write to addrA which is a write to new page}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {write to addrB which is a write to new page}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+6] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA from old page}
+ DEV-thread-2  {read addrB from new page}
+
+So here, because at time N+2 the cleared page table entry was not paired with
+a notification to invalidate the secondary TLB, the device sees the new value
+for addrB before seeing the new value for addrA. This breaks total memory
+ordering for the device.
+
+When changing a pte to write protect or to point to a new write protected
+page with same content (KSM) it is fine to delay the
+mmu_notifier_invalidate_range call to mmu_notifier_invalidate_range_end()
+outside the page table lock. This is true even if the thread doing the page
+table update is preempted right after releasing the page table lock but
+before calling mmu_notifier_invalidate_range_end().
diff --git a/Documentation/mm/numa.rst b/Documentation/mm/numa.rst
new file mode 100644
index 000000000000..99fdeca917ca
--- /dev/null
+++ b/Documentation/mm/numa.rst
@@ -0,0 +1,150 @@
+.. _numa:
+
+Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
+
+=============
+What is NUMA?
+=============
+
+This question can be answered from a couple of perspectives: the
+hardware view and the Linux software view.
+
+From the hardware perspective, a NUMA system is a computer platform that
+comprises multiple components or assemblies, each of which may contain 0
+or more CPUs, local memory, and/or IO buses. For brevity and to
+disambiguate the hardware view of these physical components/assemblies
+from the software abstraction thereof, we'll call the components/assemblies
+'cells' in this document.
+
+Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
+of the system--although some components necessary for a stand-alone SMP system
+may not be populated on any given cell.
The cells of the NUMA system are
+connected together with some sort of system interconnect--e.g., a crossbar or
+point-to-point link are common types of NUMA system interconnects. Both of
+these types of interconnects can be aggregated to create NUMA platforms with
+cells at multiple distances from other cells.
+
+For Linux, the NUMA platforms of interest are primarily what is known as Cache
+Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible
+to and accessible from any CPU attached to any cell and cache coherency
+is handled in hardware by the processor caches and/or the system interconnect.
+
+Memory access time and effective memory bandwidth vary depending on how far
+away the cell containing the CPU or IO bus making the memory access is from the
+cell containing the target memory. For example, access to memory by CPUs
+attached to the same cell will experience faster access times and higher
+bandwidths than accesses to memory on other, remote cells. NUMA platforms
+can have cells at multiple remote distances from any given cell.
+
+Platform vendors don't build NUMA systems just to make software developers'
+lives interesting. Rather, this architecture is a means to provide scalable
+memory bandwidth. However, to achieve scalable memory bandwidth, system and
+application software must arrange for a large majority of the memory references
+[cache misses] to be to "local" memory--memory on the same cell, if any--or
+to the closest cell with memory.
+
+This leads to the Linux software view of a NUMA system:
+
+Linux divides the system's hardware resources into multiple software
+abstractions called "nodes". Linux maps the nodes onto the physical cells
+of the hardware platform, abstracting away some of the details for some
+architectures. As with physical cells, software nodes may contain 0 or more
+CPUs, memory and/or IO buses. And, again, memory accesses to memory on
+"closer" nodes--nodes that map to closer cells--will generally experience
+faster access times and higher effective bandwidth than accesses to more
+remote cells.
+
+For some architectures, such as x86, Linux will "hide" any node representing a
+physical cell that has no memory attached, and reassign any CPUs attached to
+that cell to a node representing a cell that does have memory. Thus, on
+these architectures, one cannot assume that all CPUs that Linux associates with
+a given node will see the same local memory access times and bandwidth.
+
+In addition, for some architectures, again x86 is an example, Linux supports
+the emulation of additional nodes. For NUMA emulation, Linux will carve up
+the existing nodes--or the system memory for non-NUMA platforms--into multiple
+nodes. Each emulated node will manage a fraction of the underlying cells'
+physical memory. NUMA emulation is useful for testing NUMA kernel and
+application features on non-NUMA platforms, and as a sort of memory resource
+management mechanism when used together with cpusets.
+[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+For each node with memory, Linux constructs an independent memory management
+subsystem, complete with its own free page lists, in-use page lists, usage
+statistics and locks to mediate access. In addition, Linux constructs for
+each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
+an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a
+selected zone/node cannot satisfy the allocation request.
This situation,
+when a zone has no available memory to satisfy a request, is called
+"overflow" or "fallback".
+
+Because some nodes contain multiple zones containing different types of
+memory, Linux must decide whether to order the zonelists such that allocations
+fall back to the same zone type on a different node, or to a different zone
+type on the same node. This is an important consideration because some zones,
+such as DMA or DMA32, represent relatively scarce resources. Linux chooses
+a default Node ordered zonelist. This means it tries to fall back to other
+zones from the same node before using remote nodes which are ordered by NUMA
+distance.
+
+By default, Linux will attempt to satisfy memory allocation requests from the
+node to which the CPU that executes the request is assigned. Specifically,
+Linux will attempt to allocate from the first node in the appropriate zonelist
+for the node where the request originates. This is called "local allocation."
+If the "local" node cannot satisfy the request, the kernel will examine other
+nodes' zones in the selected zonelist looking for the first zone in the list
+that can satisfy the request.
+
+Local allocation will tend to keep subsequent access to the allocated memory
+"local" to the underlying physical resources and off the system interconnect--
+as long as the task on whose behalf the kernel allocated some memory does not
+later migrate away from that memory. The Linux scheduler is aware of the
+NUMA topology of the platform--embodied in the "scheduling domains" data
+structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler
+attempts to minimize task migration to distant scheduling domains. However,
+the scheduler does not take a task's NUMA footprint into account directly.
+Thus, under sufficient imbalance, tasks can migrate between nodes, remote
+from their initial node and kernel data structures.
+
+System administrators and application designers can restrict a task's migration
+to improve NUMA locality using various CPU affinity command line interfaces,
+such as taskset(1) and numactl(1), and program interfaces such as
+sched_setaffinity(2). Further, one can modify the kernel's default local
+allocation behavior using Linux NUMA memory policy. [see
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
+
+System administrators can restrict the CPUs and nodes' memories that a non-
+privileged user can specify in the scheduling or NUMA commands and functions
+using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+On architectures that do not hide memoryless nodes, Linux will include only
+zones [nodes] with memory in the zonelists. This means that for a memoryless
+node the "local memory node"--the node of the first zone in CPU's node's
+zonelist--will not be the node itself. Rather, it will be the node that the
+kernel selected as the nearest node with memory when it built the zonelists.
+So, default local allocations will succeed with the kernel supplying the
+closest available memory. This is a consequence of the same mechanism that
+allows such allocations to fall back to other nearby nodes when a node that
+does contain memory overflows.
+
+Some kernel allocations do not want or cannot tolerate this allocation fallback
+behavior. Rather they want to be sure they get memory from the specified node
+or get notified that the node has no free memory. This is usually the case when
+a subsystem allocates per CPU memory resources, for example.
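+
+One way such a "this node or nothing" request is commonly expressed in
+kernel code is a sketch like the following (the helper name is
+illustrative)::
+
+	#include <linux/slab.h>
+
+	/* __GFP_THISNODE suppresses zonelist fallback: the allocation
+	 * either comes from 'nid' or fails and returns NULL. */
+	static void *alloc_on_node_or_fail(size_t size, int nid)
+	{
+		return kmalloc_node(size, GFP_KERNEL | __GFP_THISNODE, nid);
+	}
+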
+
+A typical model for making such an allocation is to obtain the node id of the
+node to which the "current CPU" is attached using one of the kernel's
+numa_node_id() or CPU_to_node() functions and then request memory from only
+the node id returned. When such an allocation fails, the requesting subsystem
+may revert to its own fallback path. The slab kernel memory allocator is an
+example of this. Or, the subsystem may choose to disable or not to enable
+itself on allocation failure. The kernel profiling subsystem is an example of
+this.
+
+If the architecture supports--does not hide--memoryless nodes, then CPUs
+attached to memoryless nodes would always incur the fallback path overhead
+or some subsystems would fail to initialize if they attempted to allocate
+memory exclusively from a node without memory. To support such
+architectures transparently, kernel subsystems can use the numa_mem_id()
+or cpu_to_mem() function to locate the "local memory node" for the calling or
+specified CPU. Again, this is the same node from which default, local page
+allocations will be attempted.
diff --git a/Documentation/mm/oom.rst b/Documentation/mm/oom.rst
new file mode 100644
index 000000000000..18e9e40c1ec1
--- /dev/null
+++ b/Documentation/mm/oom.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+Out Of Memory Handling
+======================
diff --git a/Documentation/mm/overcommit-accounting.rst b/Documentation/mm/overcommit-accounting.rst
new file mode 100644
index 000000000000..1addb0c374a4
--- /dev/null
+++ b/Documentation/mm/overcommit-accounting.rst
@@ -0,0 +1,88 @@
+.. _overcommit_accounting:
+
+=====================
+Overcommit Accounting
+=====================
+
+The Linux kernel supports the following overcommit handling modes:
+
+0
+	Heuristic overcommit handling. Obvious overcommits of address
+	space are refused. Used for a typical system. It ensures a
+	seriously wild allocation fails while allowing overcommit to
+	reduce swap usage. root is allowed to allocate slightly more
+	memory in this mode. This is the default.
+
+1
+	Always overcommit. Appropriate for some scientific
+	applications. Classic example is code using sparse arrays and
+	just relying on the virtual memory consisting almost entirely
+	of zero pages.
+
+2
+	Don't overcommit. The total address space commit for the
+	system is not permitted to exceed swap + a configurable amount
+	(default is 50%) of physical RAM. Depending on the amount you
+	use, in most situations this means a process will not be
+	killed while accessing pages but will receive errors on memory
+	allocation as appropriate.
+
+	Useful for applications that want to guarantee their memory
+	allocations will be available in the future without having to
+	initialize every page.
+
+The overcommit policy is set via the sysctl ``vm.overcommit_memory``.
+
+The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage)
+or ``vm.overcommit_kbytes`` (absolute value). These only have an effect
+when ``vm.overcommit_memory`` is set to 2.
+
+The current overcommit limit and amount committed are viewable in
+``/proc/meminfo`` as CommitLimit and Committed_AS respectively.
+
+Gotchas
+=======
+
+The C language stack growth does an implicit mremap. If you want absolute
+guarantees and run close to the edge you MUST mmap your stack for the
+largest size you think you will need. For typical stack usage this does
+not matter much but it's a corner case if you really really care.
+
+In mode 2 the MAP_NORESERVE flag is ignored.
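+
+A small userspace experiment makes the mode-2 behaviour visible (a
+sketch assuming a 64-bit Linux system; the exact failure point depends
+on the configured limits)::
+
+	#include <stdio.h>
+	#include <sys/mman.h>
+
+	int main(void)
+	{
+		size_t len = 1UL << 44;	/* 16 TiB, far beyond any CommitLimit */
+		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
+			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		if (p == MAP_FAILED)
+			perror("mmap");	/* refused up front in mode 2 */
+		else
+			puts("mapping granted (overcommitted)");
+		return 0;
+	}
+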
+
+
+How It Works
+============
+
+The overcommit is based on the following rules:
+
+For a file backed map
+	| SHARED or READ-only	-	0 cost (the file is the map not swap)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+For an anonymous or ``/dev/zero`` map
+	| SHARED		-	size of mapping
+	| PRIVATE READ-only	-	0 cost (but of little use)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+Additional accounting
+	| Pages made writable copies by mmap
+	| shmfs memory drawn from the same pool
+
+Status
+======
+
+*	We account mmap memory mappings
+*	We account mprotect changes in commit
+*	We account mremap changes in size
+*	We account brk
+*	We account munmap
+*	We report the commit status in /proc
+*	Account and check on fork
+*	Review stack handling/building on exec
+*	SHMfs accounting
+*	Implement actual limit enforcement
+
+To Do
+=====
+*	Account ptrace pages (this is hard)
diff --git a/Documentation/mm/page_allocation.rst b/Documentation/mm/page_allocation.rst
new file mode 100644
index 000000000000..d9b4495561f1
--- /dev/null
+++ b/Documentation/mm/page_allocation.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Page Allocation
+===============
diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst
new file mode 100644
index 000000000000..75eba7c431b2
--- /dev/null
+++ b/Documentation/mm/page_cache.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Page Cache
+==========
diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst
new file mode 100644
index 000000000000..7d6f9385d129
--- /dev/null
+++ b/Documentation/mm/page_frags.rst
@@ -0,0 +1,45 @@
+.. _page_frags:
+
+==============
+Page fragments
+==============
+
+A page fragment is an arbitrary-length arbitrary-offset area of memory
+which resides within a 0 or higher order compound page. Multiple
+fragments within that page are individually refcounted, in the page's
+reference counter.
+
+The page_frag functions, page_frag_alloc and page_frag_free, provide a
+simple allocation framework for page fragments. This is used by the
+network stack and network device drivers to provide a backing region of
+memory for use as either an sk_buff->head, or to be used in the "frags"
+portion of skb_shared_info.
+
+In order to make use of the page fragment APIs a backing page fragment
+cache is needed. This provides a central point for the fragment allocation
+and allows multiple calls to make use of a cached page. The
+advantage to doing this is that multiple calls to get_page can be avoided
+which can be expensive at allocation time. However, due to the nature of
+this caching it is required that any calls to the cache be protected by
+either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
+to be disabled when executing the fragment allocation.
+
+The network stack uses two separate caches per CPU to handle fragment
+allocation. The netdev_alloc_cache is used by callers making use of the
+netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is
+used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The
+main difference between these two calls is the context in which they may be
+called. The "netdev" prefixed functions are usable in any context as these
+functions will disable interrupts, while the "napi" prefixed functions are
+only usable within the softirq context.
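+
+As a hedged illustration of this API (the ``my_`` names are hypothetical;
+page_frag_alloc() and page_frag_free() are the functions described above),
+a driver-private fragment cache might be used like this::
+
+	/*
+	 * One cache, e.g. per CPU or per RX ring; the caller must provide
+	 * its own serialization, as discussed above.
+	 */
+	static struct page_frag_cache my_rx_cache;
+
+	static void *my_alloc_rx_buffer(unsigned int size)
+	{
+		return page_frag_alloc(&my_rx_cache, size, GFP_ATOMIC);
+	}
+
+	static void my_free_rx_buffer(void *buf)
+	{
+		page_frag_free(buf);	/* drops one fragment reference */
+	}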
+
+Many network device drivers use a similar methodology for allocating page
+fragments, but the page fragments are cached at the ring or descriptor
+level. In order to enable these cases it is necessary to provide a generic
+way of tearing down a page cache. For this reason __page_frag_cache_drain
+was implemented. It allows for freeing multiple references from a single
+page via a single call. The advantage to doing this is that it allows for
+cleaning up the multiple references that were added to a page in order to
+avoid calling get_page per allocation.
+
+Alexander Duyck, Nov 29, 2016.
diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst
new file mode 100644
index 000000000000..8c5cb8147e55
--- /dev/null
+++ b/Documentation/mm/page_migration.rst
@@ -0,0 +1,288 @@
+.. _page_migration:
+
+==============
+Page migration
+==============
+
+Page migration allows moving the physical location of pages between
+nodes in a NUMA system while the process is running. This means that the
+virtual addresses that the process sees do not change. However, the
+system rearranges the physical location of those pages.
+
+Also see :ref:`Heterogeneous Memory Management (HMM) <hmm>`
+for migrating pages to or from device private memory.
+
+The main intent of page migration is to reduce the latency of memory accesses
+by moving pages near to the processor where the process accessing that memory
+is running.
+
+Page migration allows a process to manually relocate the node on which its
+pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
+a new memory policy via mbind(). The pages of a process can also be relocated
+from another process using the sys_migrate_pages() function call. The
+migrate_pages() function call takes two sets of nodes and moves pages of a
+process that are located on the from nodes to the destination nodes.
+Page migration functions are provided by the numactl package by Andi Kleen
+(a version later than 0.9.3 is required. Get it from
+https://github.com/numactl/numactl.git). numactl provides libnuma
+which provides an interface similar to other NUMA functionality for page
+migration. cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
+pages of a process are located. See also the numa_maps documentation in the
+proc(5) man page.
+
+Manual migration is useful if, for example, the scheduler has relocated
+a process to a processor on a distant node. A batch scheduler or an
+administrator may detect the situation and move the pages of the process
+nearer to the new processor. The kernel itself only provides
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+For example, a NUMA profiler may obtain a log showing frequent off-node
+accesses and may use the result to move pages to more advantageous
+locations.
+
+Larger installations usually partition the system using cpusets into
+sections of nodes. Paul Jackson has equipped cpusets with the ability to
+move pages when a task is moved to another cpuset (See
+:ref:`CPUSETS <cpusets>`).
+Cpusets allow the automation of process locality. If a task is moved to
+a new cpuset, all its pages are moved with it so that the
+performance of the process does not sink dramatically. The pages
+of processes in a cpuset are also moved if the allowed memory nodes of the
+cpuset are changed.
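+
+From userspace, the same machinery is reachable through libnuma. A minimal
+sketch (link with -lnuma; error handling mostly elided) that moves all pages
+of a process from node 0 to node 1, much as migratepages(8) does::
+
+	#include <numa.h>
+	#include <stdio.h>
+	#include <stdlib.h>
+
+	int main(int argc, char **argv)
+	{
+		struct bitmask *from, *to;
+		int pid = argc > 1 ? atoi(argv[1]) : 0;	/* 0 == calling process */
+
+		if (numa_available() < 0)
+			return 1;
+		from = numa_parse_nodestring("0");
+		to = numa_parse_nodestring("1");
+		if (numa_migrate_pages(pid, from, to) < 0)
+			perror("numa_migrate_pages");
+		return 0;
+	}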
+ +Page migration allows the preservation of the relative location of pages +within a group of nodes for all migration techniques which will preserve a +particular memory allocation pattern generated even after migrating a +process. This is necessary in order to preserve the memory latencies. +Processes will run with similar performance after migration. + +Page migration occurs in several steps. First a high level +description for those trying to use migrate_pages() from the kernel +(for userspace usage see the Andi Kleen's numactl package mentioned above) +and then a low level description of how the low level details work. + +In kernel use of migrate_pages() +================================ + +1. Remove pages from the LRU. + + Lists of pages to be migrated are generated by scanning over + pages and moving them into lists. This is done by + calling isolate_lru_page(). + Calling isolate_lru_page() increases the references to the page + so that it cannot vanish while the page migration occurs. + It also prevents the swapper or other scans from encountering + the page. + +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. + +3. The migrate_pages() function is called which attempts + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. + +How migrate_pages() works +========================= + +migrate_pages() does several passes over its list of pages. A page is moved +if all references to a page are removable at the time. The page has +already been removed from the LRU via isolate_lru_page() and the refcount +is increased so that the page cannot be freed while page migration occurs. + +Steps: + +1. Lock the page to be migrated. + +2. Ensure that writeback is complete. + +3. Lock the new page that we want to move to. It is locked so that accesses to + this (not yet up-to-date) page immediately block while the move is in progress. + +4. All the page table references to the page are converted to migration + entries. This decreases the mapcount of a page. If the resulting + mapcount is not zero then we do not migrate the page. All user space + processes that attempt to access the page will now wait on the page lock + or wait for the migration page table entry to be removed. + +5. The i_pages lock is taken. This will cause all processes trying + to access the page via the mapping to block on the spinlock. + +6. The refcount of the page is examined and we back out if references remain. + Otherwise, we know that we are the only one referencing this page. + +7. The radix tree is checked and if it does not contain the pointer to this + page then we back out because someone else modified the radix tree. + +8. The new page is prepped with some settings from the old page so that + accesses to the new page will discover a page with the correct settings. + +9. The radix tree is changed to point to the new page. + +10. The reference count of the old page is dropped because the address space + reference is gone. A reference to the new page is established because + the new page is referenced by the address space. + +11. The i_pages lock is dropped. With that lookups in the mapping + become possible again. Processes will move from spinning on the lock + to sleeping on the locked new page. + +12. The page contents are copied to the new page. + +13. The remaining page flags are copied to the new page. + +14. 
The old page flags are cleared to indicate that the page does
+    not provide any information anymore.
+
+15. Queued up writeback on the new page is triggered.
+
+16. If migration entries were inserted into the page table, then replace them
+    with real ptes. Doing so will enable access for user space processes not
+    already waiting for the page lock.
+
+17. The page locks are dropped from the old and new page.
+    Processes waiting on the page lock will redo their page faults
+    and will reach the new page.
+
+18. The new page is moved to the LRU and can be scanned by the swapper,
+    etc. again.
+
+Non-LRU page migration
+======================
+
+Although migration originally aimed at reducing the latency of memory accesses
+for NUMA, compaction also uses migration to create high-order pages.
+
+The current problem with the implementation is that it is designed to migrate
+only *LRU* pages. However, there are potential non-LRU pages which can be
+migrated by drivers, for example, zsmalloc and virtio-balloon pages.
+
+For virtio-balloon pages, some parts of the migration code path have been
+hooked up, and virtio-balloon specific functions were added to intercept the
+migration logic. This is too specific to one driver, so other drivers that
+want to make their pages movable would have to add their own specific hooks
+in the migration path.
+
+To overcome this problem, the VM supports non-LRU page migration, which
+provides generic functions for non-LRU movable pages without driver specific
+hooks in the migration path.
+
+If a driver wants to make its pages movable, it should define three functions
+which are function pointers of struct address_space_operations.
+
+1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
+
+   What the VM expects from the driver's isolate_page() function is to return
+   *true* if the driver isolates the page successfully. On returning true, the
+   VM marks the page as PG_isolated so that concurrent isolation on several
+   CPUs skips the page. If a driver cannot isolate the page, it should return
+   *false*.
+
+   Once the page is successfully isolated, the VM uses the page.lru fields, so
+   the driver shouldn't expect the values in those fields to be preserved.
+
+2. ``int (*migratepage) (struct address_space *mapping,``
+|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
+
+   After isolation, the VM calls the driver's migratepage() with the isolated
+   page. The job of migratepage() is to move the contents of the old page to
+   the new page and set up the fields of struct page newpage. Keep in mind
+   that you should indicate to the VM that the oldpage is no longer movable
+   via __ClearPageMovable() under page_lock if you migrated the oldpage
+   successfully and returned MIGRATEPAGE_SUCCESS. If the driver cannot migrate
+   the page at the moment, it can return -EAGAIN. On -EAGAIN, the VM will
+   retry page migration in a short time because the VM interprets -EAGAIN as
+   "temporary migration failure". On returning any error except -EAGAIN, the
+   VM will give up the page migration without retrying.
+
+   The driver shouldn't touch the page.lru field while in the migratepage()
+   function.
+
+3. ``void (*putback_page)(struct page *);``
+
+   If migration fails on the isolated page, the VM should return the isolated
+   page to the driver, so the VM calls the driver's putback_page() with the
+   isolated page. In this function, the driver should put the isolated page
+   back into its own data structure.
+
+Non-LRU movable page flags
+
+   There are two page flags for supporting non-LRU movable pages.
+
+   * PG_movable
+
+     Driver should use the function below to make page movable under page_lock::
+
+	void __SetPageMovable(struct page *page, struct address_space *mapping)
+
+     It needs the address_space argument for registering the migration
+     family of functions which will be called by the VM. Strictly speaking,
+     PG_movable is not a real flag of struct page. Rather, the VM
+     reuses the lower bits of page->mapping to represent it::
+
+	#define PAGE_MAPPING_MOVABLE 0x2
+	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
+
+     so the driver shouldn't access page->mapping directly. Instead, the
+     driver should use page_mapping(), which masks off the low two bits of
+     page->mapping under page lock so it can get the right struct
+     address_space.
+
+     For testing non-LRU movable pages, the VM provides the __PageMovable()
+     function. However, it is not guaranteed to identify non-LRU movable pages
+     because the page->mapping field is unified with other variables in struct
+     page. If the driver releases the page after isolation by the VM,
+     page->mapping doesn't have a stable value although it has
+     PAGE_MAPPING_MOVABLE set (look at __ClearPageMovable). But __PageMovable()
+     is cheap to call whether the page is LRU or non-LRU movable once the page
+     has been isolated, because LRU pages can never have PAGE_MAPPING_MOVABLE
+     set in page->mapping. It is also good for just peeking at non-LRU movable
+     pages before the more expensive check with lock_page() in pfn scanning to
+     select a victim.
+
+     For a guaranteed identification of non-LRU movable pages, the VM provides
+     the PageMovable() function. Unlike __PageMovable(), PageMovable()
+     validates page->mapping and mapping->a_ops->isolate_page under
+     lock_page(). The lock_page() prevents sudden destruction of
+     page->mapping.
+
+     Drivers using __SetPageMovable() should clear the flag via
+     __ClearPageMovable() under page_lock() before releasing the page.
+
+   * PG_isolated
+
+     To prevent concurrent isolation among several CPUs, the VM marks an
+     isolated page as PG_isolated under lock_page(). So if a CPU encounters a
+     PG_isolated non-LRU movable page, it can skip it. The driver doesn't need
+     to manipulate the flag because the VM will set/clear it automatically.
+     Keep in mind that if the driver sees a PG_isolated page, it means the
+     page has been isolated by the VM, so it shouldn't touch the page.lru
+     field. The PG_isolated flag is aliased with the PG_reclaim flag, so
+     drivers shouldn't use PG_isolated for their own purposes.
+
+Monitoring Migration
+=====================
+
+The following events (counters) can be used to monitor page migration.
+
+1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
+   page was migrated. If the page was a non-THP and non-hugetlb page, then
+   this counter is increased by one. If the page was a THP or hugetlb, then
+   this counter is increased by the number of THP or hugetlb subpages.
+   For example, migration of a single 2MB THP that has 4KB-size base pages
+   (subpages) will cause this counter to increase by 512.
+
+2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
+   PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
+   if it was a THP or hugetlb.
+
+3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
+
+4. THP_MIGRATION_FAIL: A THP could not be migrated, nor could it be split.
+
+5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had
+   to be split. After splitting, a migration retry was used for its sub-pages.
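+
+All of these counters are exported via /proc/vmstat. As a quick way to
+snapshot them, a minimal userspace sketch (not part of the kernel tree)::
+
+	#include <stdio.h>
+	#include <string.h>
+
+	int main(void)
+	{
+		char name[64];
+		long val;
+		FILE *f = fopen("/proc/vmstat", "r");
+
+		if (!f)
+			return 1;
+		while (fscanf(f, "%63s %ld", name, &val) == 2) {
+			/* Print only the migration-related counters. */
+			if (!strncmp(name, "pgmigrate_", 10) ||
+			    !strncmp(name, "thp_migration_", 14))
+				printf("%s %ld\n", name, val);
+		}
+		fclose(f);
+		return 0;
+	}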
+
+THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or
+PGMIGRATE_FAIL events. For example, a THP migration failure will cause both
+THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase.
+
+Christoph Lameter, May 8, 2006.
+Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
new file mode 100644
index 000000000000..f5c954afe97c
--- /dev/null
+++ b/Documentation/mm/page_owner.rst
@@ -0,0 +1,196 @@
+.. _page_owner:
+
+==================================================
+page owner: Tracking who allocated each page
+==================================================
+
+Introduction
+============
+
+page owner is for tracking who allocated each page.
+It can be used to debug memory leaks or to find a memory hog.
+When an allocation happens, information about the allocation, such as the
+call stack and the order of the pages, is stored for each page.
+When we need to know the status of all pages, we can retrieve and analyze
+this information.
+
+Although we already have tracepoints for tracing page allocation/free,
+using them to analyze who allocated each page is rather complex. We would
+need to enlarge the trace buffer to prevent records from being overwritten
+before the userspace program is launched, and the launched program would
+have to dump out the trace buffer continually for later analysis. This is
+more likely to perturb system behaviour than just keeping the information
+in memory, so it is bad for debugging.
+
+page owner can also be used for various other purposes. For example, accurate
+fragmentation statistics can be obtained through the gfp flag information of
+each page. This is already implemented and activated if page owner is
+enabled. Other usages are more than welcome.
+
+page owner is disabled by default, so if you'd like to use it, you need
+to add "page_owner=on" to your boot cmdline. If the kernel is built
+with page owner but page owner is disabled at runtime because the boot
+option is not enabled, the runtime overhead is marginal. When disabled at
+runtime, it doesn't require memory to store owner information, so there is
+no runtime memory overhead. page owner inserts just two unlikely branches
+into the page allocator hotpath; if not enabled, allocation is done as if
+the kernel were built without page owner. These two unlikely branches should
+not affect allocation performance, especially if the static keys jump
+label patching functionality is available. The following is the kernel's
+code size change due to this facility.
+
+- Without page owner::
+
+   text    data     bss     dec     hex filename
+   48392   2333     644   51369    c8a9 mm/page_alloc.o
+
+- With page owner::
+
+   text    data     bss     dec     hex filename
+   48800   2445     644   51889    cab1 mm/page_alloc.o
+   6662     108      29    6799    1a8f mm/page_owner.o
+   1025       8       8    1041     411 mm/page_ext.o
+
+Although roughly 8 KB of code is added in total, page_alloc.o increases by
+520 bytes and less than half of that is in the hotpath. Building the kernel
+with page owner and turning it on only if needed is a great option for
+debugging kernel memory problems.
+
+There is one caveat caused by an implementation detail. page owner
+stores information into memory from the struct page extension. This memory
+is initialized some time after the page allocator starts on sparse
+memory systems, so, until initialization, many pages can be allocated and
+they will have no owner information. To fix this up, these early allocated
+pages are investigated and marked as allocated during the initialization
+phase.
+Although this doesn't mean that they have the right owner information,
+at least, we can tell whether the page is allocated or not
+more accurately. On a 2GB memory x86-64 VM box, 13343 early allocated pages
+are caught and marked, although they are mostly allocated by the struct
+page extension feature itself. Anyway, after that, no page is left in an
+untracked state.
+
+Usage
+=====
+
+1) Build user-space helper::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) Enable page owner: add "page_owner=on" to boot cmdline.
+
+3) Do the job that you want to debug.
+
+4) Analyze information from page owner::
+
+	cat /sys/kernel/debug/page_owner > page_owner_full.txt
+	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+
+   The general output of ``page_owner_full.txt`` is as follows::
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
+   into bufs, uses a regexp to extract the page order value, counts the times
+   and pages of each buf, and finally sorts them according to the parameter(s).
+
+   See the result about who allocated each page
+   in ``sorted_page_owner.txt``. General output::
+
+	XXX times, XXX pages:
+	Page allocated via order XXX, ...
+	// Detailed stack
+
+   By default, ``page_owner_sort`` sorts according to the times of each buf.
+   If you want to sort by the page counts instead, use the ``-m`` parameter.
+   The detailed parameters are:
+
+   fundamental function::
+
+	Sort:
+		-a		Sort by memory allocation time.
+		-m		Sort by total memory.
+		-p		Sort by pid.
+		-P		Sort by tgid.
+		-n		Sort by task command name.
+		-r		Sort by memory release time.
+		-s		Sort by stack trace.
+		-t		Sort by times (default).
+		--sort <order>	Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]].
+				Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is
+				optional since the default direction is increasing numerical or lexicographic
+				order. Mixed use of abbreviated and complete-form of keys is allowed.
+
+		Examples:
+				./page_owner_sort <input> <output> --sort=n,+pid,-tgid
+				./page_owner_sort <input> <output> --sort=at
+
+   additional function::
+
+	Cull:
+		--cull <rules>
+				Specify culling rules. Culling syntax is key[,key[,...]]. Choose a
+				multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
+
+		<rules> is a single argument in the form of a comma-separated list,
+		which offers a way to specify individual culling rules. The recognized
+		keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
+		<rules> can be specified by the sequence of keys k1,k2, ..., as described in
+		the STANDARD SORT KEYS section below. Mixed use of abbreviated and
+		complete-form of keys is allowed.
+
+		Examples:
+				./page_owner_sort <input> <output> --cull=stacktrace
+				./page_owner_sort <input> <output> --cull=st,pid,name
+				./page_owner_sort <input> <output> --cull=n,f
+
+	Filter:
+		-f		Filter out the information of blocks whose memory has been released.
+
+	Select:
+		--pid <pidlist>		Select by pid. This selects the blocks whose process ID
+					numbers appear in <pidlist>.
+		--tgid <tgidlist>	Select by tgid. This selects the blocks whose thread
+					group ID numbers appear in <tgidlist>.
+		--name <cmdlist>	Select by task command name. This selects the blocks whose
+					task command name appear in <cmdlist>.
+
+	<pidlist>, <tgidlist> and <cmdlist> are single arguments in the form of a
+	comma-separated list, which offers a way to specify individual selecting rules.
+
+
+		Examples:
+				./page_owner_sort --pid=1
+				./page_owner_sort --tgid=1,2,3
+				./page_owner_sort --name name1,name2
+
+STANDARD FORMAT SPECIFIERS
+==========================
+::
+
+  For the --sort option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	st		stacktrace	stack trace of the page allocation
+	T		txt		full text of block
+	ft		free_ts		timestamp of the page when it was released
+	at		alloc_ts	timestamp of the page when it was allocated
+	ator		allocator	memory allocator for pages
+
+  For the --cull option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	f		free		whether the page has been released or not
+	st		stacktrace	stack trace of the page allocation
+	ator		allocator	memory allocator for pages
diff --git a/Documentation/mm/page_reclaim.rst b/Documentation/mm/page_reclaim.rst
new file mode 100644
index 000000000000..50a30b7f8ac3
--- /dev/null
+++ b/Documentation/mm/page_reclaim.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Page Reclaim
+============
diff --git a/Documentation/mm/page_table_check.rst b/Documentation/mm/page_table_check.rst
new file mode 100644
index 000000000000..1a09472f10a3
--- /dev/null
+++ b/Documentation/mm/page_table_check.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _page_table_check:
+
+================
+Page Table Check
+================
+
+Introduction
+============
+
+Page table check makes it possible to harden the kernel by ensuring that some
+types of memory corruption are prevented.
+
+Page table check performs extra verifications at the time when new pages become
+accessible from userspace by having their page table entries (PTEs, PMDs,
+etc.) added into the table.
+
+In case of detected corruption, the kernel is crashed. There is a small
+performance and memory overhead associated with the page table check. Therefore,
+it is disabled by default, but can be optionally enabled on systems where the
+extra hardening outweighs the performance costs. Also, because page table check
+is synchronous, it can help with debugging double map memory corruption issues,
+by crashing the kernel at the time the wrong mapping occurs instead of later,
+which is often the case with memory corruption bugs.
+
+Double mapping detection logic
+==============================
+
++-------------------+-------------------+-------------------+------------------+
+| Current Mapping   | New mapping       | Permissions       | Rule             |
++===================+===================+===================+==================+
+| Anonymous         | Anonymous         | Read              | Allow            |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Named             | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Anonymous         | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Named             | Any               | Allow            |
++-------------------+-------------------+-------------------+------------------+
+
+Enabling Page Table Check
+=========================
+
+Build kernel with:
+
+- PAGE_TABLE_CHECK=y
+  Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
+  is available.
+
+- Boot with 'page_table_check=on' kernel parameter.
+
+Optionally, build the kernel with PAGE_TABLE_CHECK_ENFORCED in order to have
+page table check support without the extra kernel parameter.
diff --git a/Documentation/mm/page_tables.rst b/Documentation/mm/page_tables.rst
new file mode 100644
index 000000000000..96939571d7bc
--- /dev/null
+++ b/Documentation/mm/page_tables.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Page Tables
+===========
diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst
new file mode 100644
index 000000000000..2ab7b8c1c863
--- /dev/null
+++ b/Documentation/mm/physical_memory.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Physical Memory
+===============
diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst
new file mode 100644
index 000000000000..e8618fbc62c9
--- /dev/null
+++ b/Documentation/mm/process_addrs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+Process Addresses
+=================
diff --git a/Documentation/mm/remap_file_pages.rst b/Documentation/mm/remap_file_pages.rst
new file mode 100644
index 000000000000..7bef6718e3a9
--- /dev/null
+++ b/Documentation/mm/remap_file_pages.rst
@@ -0,0 +1,33 @@
+.. _remap_file_pages:
+
+==============================
+remap_file_pages() system call
+==============================
+
+The remap_file_pages() system call is used to create a nonlinear mapping,
+that is, a mapping in which the pages of the file are mapped into a
+nonsequential order in memory. The advantage of using remap_file_pages()
+over using repeated calls to mmap(2) is that the former approach does not
+require the kernel to create additional VMA (Virtual Memory Area) data
+structures.
+
+Supporting nonlinear mappings requires a significant amount of non-trivial
+code in the kernel virtual memory subsystem, including hot paths. Also, to
+make nonlinear mappings work, the kernel needs a way to distinguish normal
+page table entries from entries with a file offset (pte_file). The kernel
+reserves a flag in the PTE for this purpose. PTE flags are a scarce resource,
+especially on some CPU architectures. It would be nice to free up the flag
+for other usage.
+
+Fortunately, there are not many users of remap_file_pages() in the wild.
+It is only known that one enterprise RDBMS implementation uses the syscall
+on 32-bit systems to map files bigger than can linearly fit into the 32-bit
+virtual address space. This use-case is not critical anymore since 64-bit
+systems are widely available.
+
+The syscall is now deprecated and has been replaced with an emulation. The
+emulation creates new VMAs instead of nonlinear mappings. It will be slower
+for the rare users of remap_file_pages(), but the ABI is preserved.
+
+One side effect of the emulation (apart from performance) is that the user
+can hit the vm.max_map_count limit more easily due to the additional VMAs.
+See the comment for DEFAULT_MAX_MAP_COUNT for more details on the limit.
diff --git a/Documentation/mm/shmfs.rst b/Documentation/mm/shmfs.rst
new file mode 100644
index 000000000000..8b01ebb4c30e
--- /dev/null
+++ b/Documentation/mm/shmfs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Shared Memory Filesystem
+========================
diff --git a/Documentation/mm/slab.rst b/Documentation/mm/slab.rst
new file mode 100644
index 000000000000..87d5a5bb172f
--- /dev/null
+++ b/Documentation/mm/slab.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Slab Allocation
+===============
diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst
new file mode 100644
index 000000000000..43063ade737a
--- /dev/null
+++ b/Documentation/mm/slub.rst
@@ -0,0 +1,452 @@
+.. _slub:
+
+==========================
+Short users guide for SLUB
+==========================
+
+The basic philosophy of SLUB is very different from SLAB. SLAB
+requires rebuilding the kernel to activate debug options for all
+slab caches. SLUB always includes full debugging but it is off by default.
+SLUB can enable debugging only for selected slabs in order to avoid
+an impact on overall system performance which may make a bug more
+difficult to find.
+
+In order to switch debugging on one can add an option ``slub_debug``
+to the kernel command line. That will enable full debugging for
+all slabs.
+
+Typically one would then use the ``slabinfo`` command to get statistical
+data and perform operations on the slabs. By default ``slabinfo`` only lists
+slabs that have data in them. See "slabinfo -h" for more options when
+running the command. ``slabinfo`` can be compiled with
+::
+
+	gcc -o slabinfo tools/vm/slabinfo.c
+
+Some of the modes of operation of ``slabinfo`` require that slub debugging
+be enabled on the command line. F.e. no tracking information will be
+available without debugging on, and validation can only partially
+be performed if debugging was not switched on.
+
+Some more sophisticated uses of slub_debug:
+-------------------------------------------
+
+Parameters may be given to ``slub_debug``. If none is specified then full
+debugging is enabled. Format:
+
+slub_debug=<Debug-Options>
+	Enable options for all slabs
+
+slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
+	Enable options only for select slabs (no spaces
+	after a comma)
+
+Multiple blocks of options for all slabs or selected slabs can be given, with
+blocks of options delimited by ';'. The last of the "all slabs" blocks is
+applied to all slabs except those that match one of the "select slabs"
+blocks. Options of the first "select slabs" block that matches the slab's
+name are applied.
+
+Possible debug options are::
+
+	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
+			Sorry SLAB legacy issues)
+	Z		Red zoning
+	P		Poisoning (object and padding)
+	U		User tracking (free and alloc)
+	T		Trace (please only use on single slabs)
+	A		Enable failslab filter mark for the cache
+	O		Switch debugging off for caches that would have
+			caused higher minimum slab orders
+	-		Switch all debugging off (useful if the kernel is
+			configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify::
+
+	slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try::
+
+	slub_debug=,dentry
+
+to only enable debugging on the dentry cache. You may use an asterisk at the
+end of the slab name, in order to cover all slabs with the same prefix. For
+example, here's how you can poison the dentry cache as well as all kmalloc
+slabs::
+
+	slub_debug=P,kmalloc-*,dentry
+
+Red zoning and tracking may realign the slab. We can just apply sanity checks
+to the dentry cache with::
+
+	slub_debug=F,dentry
+
+Debugging options may require the minimum possible slab order to increase as
+a result of storing the metadata (for example, caches with PAGE_SIZE object
+sizes). This has a higher likelihood of resulting in slab allocation errors
+in low memory situations or if there's high fragmentation of memory.
To
+switch off debugging for such caches by default, use::
+
+	slub_debug=O
+
+You can apply different options to different lists of slab names, using blocks
+of options. This will enable red zoning for dentry and user tracking for
+kmalloc. All other slabs will not get any debugging enabled::
+
+	slub_debug=Z,dentry;U,kmalloc-*
+
+You can also enable options (e.g. sanity checks and poisoning) for all caches
+except some that are deemed too performance critical and don't need to be
+debugged by specifying global debug options followed by a list of slab names
+with "-" as options::
+
+	slub_debug=FZ;-,zs_handle,zspage
+
+The state of each debug option for a slab can be found in the respective files
+under::
+
+	/sys/kernel/slab/<slab name>/
+
+If the file contains 1, the option is enabled, 0 means disabled. The debug
+options from the ``slub_debug`` parameter translate to the following files::
+
+	F	sanity_checks
+	Z	red_zone
+	P	poison
+	U	store_user
+	T	trace
+	A	failslab
+
+Be careful with tracing: it may spew out lots of information and never stop if
+used on the wrong slab.
+
+Slab merging
+============
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+``slabinfo -a`` displays which slabs were merged together.
+
+Slab validation
+===============
+
+SLUB can validate all objects if the kernel was booted with slub_debug. In
+order to do so you must have the ``slabinfo`` tool. Then you can do
+::
+
+	slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if the kernel was booted without slab
+debug. In that case ``slabinfo -v`` simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non-debug situation.
+
+Getting more performance
+========================
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+.. slub_min_objects=x		(default 4)
+.. slub_min_order=x		(default 0)
+.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
+
+``slub_min_objects``
+	allows specifying how many objects must at least fit into one
+	slab in order for the allocation order to be acceptable. In
+	general slub will be able to perform this number of
+	allocations on a slab without consulting centralized resources
+	(list_lock) where contention may occur.
+
+``slub_min_order``
+	specifies a minimum order of slabs. It has a similar effect to
+	``slub_min_objects``.
+
+``slub_max_order``
+	specifies the order at which ``slub_min_objects`` should no
+	longer be checked. This is useful to avoid SLUB trying to
+	generate super large order pages to fit ``slub_min_objects``
+	of a slab cache with large object sizes into one high order
+	page. Setting the command line parameter
+	``debug_guardpage_minorder=N`` (N > 0) forces
+	``slub_max_order`` to 0, which causes the minimum possible order
+	to be used for slab allocation.
+
+SLUB Debug output
+=================
+
+Here is a sample of slub debug output::
+
+ ====================================================================
+ BUG kmalloc-8: Right Redzone overwritten
+ --------------------------------------------------------------------
+
+ INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+ INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+ INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+ INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+ Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+ Object  (0xc90f6d20): 31 30 31 39 2e 30 30 35                         1019.005
+ Redzone (0xc90f6d28): 00 cc cc cc                                     .
+ Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
+
+   [] dump_trace+0x63/0x1eb
+   [] show_trace_log_lvl+0x1a/0x2f
+   [] show_trace+0x12/0x14
+   [] dump_stack+0x16/0x18
+   [] object_err+0x143/0x14b
+   [] check_object+0x66/0x234
+   [] __slab_free+0x239/0x384
+   [] kfree+0xa6/0xc6
+   [] get_modalias+0xb9/0xf5
+   [] dmi_dev_uevent+0x27/0x3c
+   [] dev_uevent+0x1ad/0x1da
+   [] kobject_uevent_env+0x20a/0x45b
+   [] kobject_uevent+0xa/0xf
+   [] store_uevent+0x4f/0x58
+   [] dev_attr_store+0x29/0x2f
+   [] sysfs_write_file+0x16e/0x19c
+   [] vfs_write+0xd1/0x15a
+   [] sys_write+0x3d/0x72
+   [] sysenter_past_esp+0x5f/0x99
+   [] 0xb7f7b410
+   =======================
+
+ FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+   This will be a message in the system log starting with::
+
+	===============================================
+	BUG <slab cache affected>: <What went wrong>
+	-----------------------------------------------
+
+	INFO: <corruption start>-<corruption_end> <more info>
+	INFO: Slab <address> <slab information>
+	INFO: Object <address> <object information>
+	INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by cpu> pid=<pid of the process>
+	INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
+	pid=<pid of the process>
+
+   (Object allocation / free information is only available if SLAB_STORE_USER is
+   set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+   Various types of lines can follow the BUG SLUB line:
+
+   Bytes b4 <address> : <bytes>
+	Shows a few bytes before the object where the problem was detected.
+	Can be useful if the corruption does not stop with the start of the
+	object.
+
+   Object <address> : <bytes>
+	The bytes of the object. If the object is inactive then the bytes
+	typically contain poison values. Any non-poison value shows a
+	corruption by a write after free.
+
+   Redzone <address> : <bytes>
+	The Redzone following the object. The Redzone is used to detect
+	writes after the object. All bytes should always have the same
+	value. If there is any deviation then it is due to a write after
+	the object boundary.
+
+	(Redzone information is only available if SLAB_RED_ZONE is set.
+	slub_debug sets that option)
+
+   Padding <address> : <bytes>
+	Unused data to fill up the space in order to get the next object
+	properly aligned. In the debug case we make sure that there are
+	at least 4 bytes of padding. This allows the detection of writes
+	before the object.
+
+3. A stackdump
+
+   The stackdump describes the location where the error was detected. The cause
+   of the corruption may more likely be found by looking at the function that
+   allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+   operation of the system.
+
+   These are messages in the system log beginning with::
+
+	FIX <slab cache affected>: <corrective action taken>
+
+   In the above sample SLUB found that the Redzone of an active object has
+   been overwritten. Here a string of 8 characters was written into a slab that
+   has the length of 8 characters. However, an 8 character string needs a
+   terminating 0. That zero has overwritten the first byte of the Redzone field.
+   After reporting the details of the issue encountered, the FIX SLUB message
+   tells us that SLUB has restored the Redzone to its proper value and
+   system operations continue.
+
+Emergency operations
+====================
+
+Minimal debugging (sanity checks alone) can be enabled by booting with::
+
+	slub_debug=F
+
+This will generally be enough to enable the resiliency features of SLUB,
+which will keep the system running even if a bad kernel component keeps
+corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache.
+
+I.e.::
+
+	slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects::
+
+	slub_debug=FZ,dentry
+
+Extended slabinfo mode and plotting
+===================================
+
+The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
+ - Slabcache Totals
+ - Slabs sorted by size (up to -N <num> slabs, default 1)
+ - Slabs sorted by loss (up to -N <num> slabs, default 1)
+
+Additionally, in this mode ``slabinfo`` does not dynamically scale
+sizes (G/M/K) and reports everything in bytes (this functionality is
+also available to other slabinfo modes via the '-B' option) which makes
+reporting more precise and accurate. Moreover, in some sense the '-X'
+mode also simplifies the analysis of slabs' behaviour, because its
+output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
+pushes the analysis from looking through the numbers (tons of numbers)
+to something easier -- visual analysis.
+
+To generate plots:
+
+a) collect slabinfo extended records, for example::
+
+	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
+
+b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
+
+	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
+
+   The ``slabinfo-gnuplot.sh`` script will pre-process the collected records
+   and generate 3 png files (and 3 pre-processing cache files) per STATS
+   file:
+   - Slabcache Totals: FOO_STATS-totals.png
+   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
+   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
+
+Another use case where ``slabinfo-gnuplot.sh`` can be useful is when you
+need to compare slabs' behaviour "prior to" and "after" some code
+modification. To help you out there, the ``slabinfo-gnuplot.sh`` script
+can 'merge' the `Slabcache Totals` sections from different
+measurements. To visually compare N plots:
+
+a) Collect as many STATS1, STATS2, .. STATSN files as you need::
+
+	while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
+
+b) Pre-process those STATS files::
+
+	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
+
+c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
+   generated pre-processed \*-totals::
+
+	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
+
+   This will produce a single plot (png file).
+
+   Plots, expectedly, can be large so some fluctuations or small spikes
+   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
+   options to 'zoom-in'/'zoom-out':
+
+   a) ``-s %d,%d`` -- overwrites the default image width and height
+   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
+      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
+      40,60`` range will plot only samples collected between 40th and
+      60th seconds).
+
+
+DebugFS files for SLUB
+======================
+
+For more information about the current state of SLUB caches with the user
+tracking debug option enabled, debugfs files are available, typically under
+/sys/kernel/debug/slab/<cache>/ (created only for caches with enabled user
+tracking). There are 2 types of these files with the following debug
+information:
+
+1. alloc_traces::
+
+    Prints information about unique allocation traces of the currently
+    allocated objects. The output is sorted by frequency of each trace.
+
+    Information in the output:
+    Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
+    pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
+
+    Example::
+
+	1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
+	 __slab_alloc+0x6d/0x90
+	 kmem_cache_alloc_trace+0x2eb/0x300
+	 populate_error_injection_list+0x97/0x110
+	 init_error_injection+0x1b/0x71
+	 do_one_initcall+0x5f/0x2d0
+	 kernel_init_freeable+0x26f/0x2d7
+	 kernel_init+0xe/0x118
+	 ret_from_fork+0x22/0x30
+
+
+2. free_traces::
+
+    Prints information about unique freeing traces of the currently allocated
+    objects. The freeing traces thus come from the previous life-cycle of the
+    objects and are reported as not available for objects allocated for the first
+    time. The output is sorted by frequency of each trace.
+
+    Information in the output:
+    Number of objects, freeing function, minimal/average/maximal jiffies since free,
+    pid range of the freeing processes, cpu mask of freeing cpus, and stack trace.
+
+    Example::
+
+	1980 <not-available> age=4294912290 pid=0 cpus=0
+	51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1
+	 kfree+0x2db/0x420
+	 acpi_ut_update_ref_count+0x6a6/0x782
+	 acpi_ut_update_object_reference+0x1ad/0x234
+	 acpi_ut_remove_reference+0x7d/0x84
+	 acpi_rs_get_prt_method_data+0x97/0xd6
+	 acpi_get_irq_routing_table+0x82/0xc4
+	 acpi_pci_irq_find_prt_entry+0x8e/0x2e0
+	 acpi_pci_irq_lookup+0x3a/0x1e0
+	 acpi_pci_irq_enable+0x77/0x240
+	 pcibios_enable_device+0x39/0x40
+	 do_pci_enable_device.part.0+0x5d/0xe0
+	 pci_enable_device_flags+0xfc/0x120
+	 pci_enable_device+0x13/0x20
+	 virtio_pci_probe+0x9e/0x170
+	 local_pci_probe+0x48/0x80
+	 pci_device_probe+0x105/0x1c0
+
+Christoph Lameter, May 30, 2007
+Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
new file mode 100644
index 000000000000..c08919662704
--- /dev/null
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -0,0 +1,100 @@
+.. _split_page_table_lock:
+
+=====================
+Split page table lock
+=====================
+
+Originally, the mm->page_table_lock spinlock protected all page tables of the
+mm_struct. But this approach leads to poor page fault scalability of
+multi-threaded applications due to high contention on the lock. To improve
+scalability, split page table lock was introduced.
+
+With split page table lock we have a separate per-table lock to serialize
+access to the table. At the moment we use split locks for PTE and PMD
+tables. Access to higher level tables is protected by mm->page_table_lock.
+
+There are helpers to lock/unlock a table and other accessor functions:
+
+ - pte_offset_map_lock()
+	maps pte and takes PTE table lock, returns pointer to the taken
+	lock;
+ - pte_unmap_unlock()
+	unlocks and unmaps PTE table;
+ - pte_alloc_map_lock()
+	allocates the PTE table if needed and takes the lock, returns pointer
+	to the taken lock or NULL if allocation failed;
+ - pte_lockptr()
+	returns pointer to PTE table lock;
+ - pmd_lock()
+	takes PMD table lock, returns pointer to taken lock;
+ - pmd_lockptr()
+	returns pointer to PMD table lock;
+
+Split page table lock for PTE tables is enabled compile-time if
+CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less than or equal to NR_CPUS.
+If split lock is disabled, all tables are guarded by mm->page_table_lock.
+
+Split page table lock for PMD tables is enabled if it's enabled for PTE
+tables and the architecture supports it (see below).
+
+Hugetlb and split page table lock
+=================================
+
+Hugetlb can support several page sizes. We use split lock only for the PMD
+level, but not for PUD.
+
+Hugetlb-specific helpers:
+
+ - huge_pte_lock()
+	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
+	otherwise;
+ - huge_pte_lockptr()
+	returns pointer to table lock;
+
+Support of split page table lock by an architecture
+===================================================
+
+There's no need for special enabling of PTE split page table lock: everything
+required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
+must be called on PTE table allocation / freeing.
+
+Make sure the architecture doesn't use the slab allocator for page table
+allocation: slab uses page->slab_cache for its pages.
+This field shares storage with page->ptl.
+
+PMD split lock only makes sense if you have more than two page table
+levels.
+
+Enabling PMD split lock requires a pgtable_pmd_page_ctor() call on PMD table
+allocation and pgtable_pmd_page_dtor() on freeing.
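+
+As a rough sketch of the allocation side (not modeled on any particular
+architecture; real implementations live in arch code and differ in GFP
+flags and details)::
+
+	pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+	{
+		struct page *page = alloc_pages(GFP_PGTABLE_USER, 0);
+
+		if (!page)
+			return NULL;
+		/* The ctor may fail: it allocates the ptl dynamically
+		 * when spinlock_t does not fit into page->ptl. */
+		if (!pgtable_pmd_page_ctor(page)) {
+			__free_pages(page, 0);
+			return NULL;
+		}
+		return (pmd_t *)page_address(page);
+	}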
+
+Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
+pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
+paths: e.g., X86_PAE preallocates a few PMDs on pgd_alloc().
+
+With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
+
+NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- this
+must be handled properly.
+
+page->ptl
+=========
+
+page->ptl is used to access the split page table lock, where 'page' is the
+struct page of the page containing the table. It shares storage with
+page->private (and a few other fields in the union).
+
+To avoid increasing the size of struct page and to get the best performance,
+we use a trick:
+
+ - if spinlock_t fits into long, we use page->ptl as spinlock, so we
+   can avoid indirect access and save a cache line.
+ - if the size of spinlock_t is bigger than the size of long, we use
+   page->ptl as a pointer to spinlock_t and allocate it dynamically. This
+   allows using the split lock with DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC
+   enabled, but costs one more cache line for indirect access;
+
+The spinlock_t is allocated in pgtable_pte_page_ctor() for the PTE table and
+in pgtable_pmd_page_ctor() for the PMD table.
+
+Please, never access page->ptl directly -- use an appropriate helper.
diff --git a/Documentation/mm/swap.rst b/Documentation/mm/swap.rst
new file mode 100644
index 000000000000..78819bd4d745
--- /dev/null
+++ b/Documentation/mm/swap.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+Swap
+====
diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst
new file mode 100644
index 000000000000..216db1d67d04
--- /dev/null
+++ b/Documentation/mm/transhuge.rst
@@ -0,0 +1,187 @@
+.. _transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+This document describes design principles for Transparent Hugepage (THP)
+support and its interaction with other parts of the memory management
+system.
+
+Design principles
+=================
+
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking the huge pmd mapping into a table of ptes
+  and, if necessary, splitting a transparent hugepage. Therefore these
+  components can continue working on the regular pages or regular pte
+  mappings.
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to avoid unmovable pages fragmenting all the memory but such a tweak
+  is not specific to transparent hugepage support and it's a generic
+  feature that applies to all dynamic high order allocations in the
+  kernel)
+
+get_user_pages and follow_page
+==============================
+
+get_user_pages and follow_page, if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most GUP users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge.
But +if any driver is going to mangle over the page structure of the tail +page (like for checking page->mapping or other bits that are relevant +for the head page and not the tail page), it should be updated to jump +to check head page instead. Taking a reference on any head/tail page would +prevent the page from being split by anyone. + +.. note:: + these aren't new constraints to the GUP API, and they match the + same constraints that apply to hugetlbfs too, so any driver capable + of handling GUP on hugetlbfs will also work fine on transparent + hugepage backed mappings. + +Graceful fallback +================= + +Code walking pagetables but unaware about huge pmds can simply call +split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by +pmd_offset. It's trivial to make the code transparent hugepage aware +by just grepping for "pmd_offset" and adding split_huge_pmd where +missing after pmd_offset returns the pmd. Thanks to the graceful +fallback design, with a one liner change, you can avoid to write +hundreds if not thousands of lines of complex code to make your code +hugepage aware. + +If you're not walking pagetables but you run into a physical hugepage +that you can't handle natively in your code, you can split it by +calling split_huge_page(page). This is what the Linux VM does before +it tries to swapout the hugepage for example. split_huge_page() can fail +if the page is pinned and you must handle this correctly. + +Example to make mremap.c transparent hugepage aware with a one liner +change:: + + diff --git a/mm/mremap.c b/mm/mremap.c + --- a/mm/mremap.c + +++ b/mm/mremap.c + @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru + return NULL; + + pmd = pmd_offset(pud, addr); + + split_huge_pmd(vma, pmd, addr); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + +Locking in hugepage aware code +============================== + +We want as much code as possible hugepage aware, as calling +split_huge_page() or split_huge_pmd() has a cost. + +To make pagetable walks huge pmd aware, all you need to do is to call +pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the +mmap_lock in read (or write) mode to be sure a huge pmd cannot be +created from under you by khugepaged (khugepaged collapse_huge_page +takes the mmap_lock in write mode in addition to the anon_vma lock). If +pmd_trans_huge returns false, you just fallback in the old code +paths. If instead pmd_trans_huge returns true, you have to take the +page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the +page table lock will prevent the huge pmd being converted into a +regular pmd from under you (split_huge_pmd can run in parallel to the +pagetable walk). If the second pmd_trans_huge returns false, you +should just drop the page table lock and fallback to the old code as +before. Otherwise, you can proceed to process the huge pmd and the +hugepage natively. Once finished, you can drop the page table lock. + +Refcounts and transparent huge pages +==================================== + +Refcounting on THP is mostly consistent with refcounting on other compound +pages: + + - get_page()/put_page() and GUP operate on head page's ->_refcount. + + - ->_refcount in tail pages is always zero: get_page_unless_zero() never + succeeds on tail pages. + + - map/unmap of the pages with PTE entry increment/decrement ->_mapcount + on relevant sub-page of the compound page. + + - map/unmap of the whole compound page is accounted for in compound_mapcount + (stored in first tail page). 
+
+Refcounts and transparent huge pages
+====================================
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on the head page's ->_refcount.
+
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of a page with a PTE entry increments/decrements ->_mapcount
+    on the relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted for in
+    compound_mapcount (stored in the first tail page). For file huge pages,
+    we also increment ->_mapcount of all sub-pages in order to have
+    race-free detection of the last unmap of subpages.
+
+PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
+
+For anonymous pages, PageDoubleMap() also indicates that ->_mapcount in all
+subpages is offset up by one. This additional reference is required to get
+race-free detection of unmap of subpages when we have them mapped with both
+PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+For anonymous pages, we set PG_double_map when a PMD of the page is split
+for the first time, but the page still has a PMD mapping. The additional
+references go away with the last compound_mapcount.
+
+File pages get PG_double_map set on the first map of the page with a PTE,
+and it goes away when the page is evicted from the page cache.
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries, but we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split pinned huge pages: it expects the page count to be equal
+to the sum of the mapcounts of all sub-pages plus one (the split_huge_page
+caller must have a reference to the head page).
+
+split_huge_page uses migration entries to stabilize page->_refcount and
+page->_mapcount of anonymous pages. File pages just get unmapped.
+
+We are safe against physical memory scanners too: the only legitimate way
+a scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to a tail page up to that point. After the
+atomic_add() we don't care about the ->_refcount value. We already know how
+many references should be uncharged from the head page.
+
+For the head page get_page_unless_zero() will succeed and we don't mind.
+It's clear where the references should go after the split: they will stay
+on the head page.
+
+Note that split_huge_pmd() doesn't have any limitations on refcounting:
+a pmd can be split at any point and never fails.
+
+Partial unmap and deferred_split_huge_page()
+============================================
+
+Unmapping part of a THP (with munmap() or another way) is not going to free
+memory immediately. Instead, we detect that a subpage of the THP is not in
+use in page_remove_rmap() and queue the THP for splitting if memory
+pressure comes. Splitting will free up the unused subpages.
+
+Splitting the page right away is not an option due to the locking context
+in the place where we can detect partial unmap. It also might be
+counterproductive, since in many cases partial unmap happens during exit(2)
+if a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for
+splitting. The splitting itself will happen when we get memory pressure
+via the shrinker interface.
diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst
new file mode 100644
index 000000000000..b280367d6a44
--- /dev/null
+++ b/Documentation/mm/unevictable-lru.rst
@@ -0,0 +1,554 @@
+.. _unevictable_lru:
+
+==============================
+Unevictable LRU Infrastructure
+==============================
+
+.. contents:: :local:
+
+
+Introduction
+============
+
+This document describes the Linux memory manager's "Unevictable LRU"
+infrastructure and the use of this to manage several types of "unevictable"
+pages.
+
+The document attempts to provide the overall rationale behind this
+mechanism and the rationale for some of the design decisions that drove
+the implementation. The latter design rationale is discussed in the
+context of an implementation description. Admittedly, one can obtain the
+implementation details - the "what does it do?" - by reading the code.
+One hopes that the descriptions below add value by providing the answer
+to "why does it do that?".
+
+
+
+The Unevictable LRU
+===================
+
+The Unevictable LRU facility adds an additional LRU list to track
+unevictable pages and to hide these pages from vmscan. This mechanism is
+based on a patch by Larry Woodman of Red Hat to address several
+scalability problems with page reclaim in Linux. The problems have been
+observed at customer sites on large memory x86_64 systems.
+
+To illustrate this with an example, a non-NUMA x86_64 platform with 128GB
+of main memory will have over 32 million 4k pages in a single node. When
+a large fraction of these pages are not evictable for any reason [see
+below], vmscan will spend a lot of time scanning the LRU lists looking
+for the small fraction of pages that are evictable. This can result in a
+situation where all CPUs are spending 100% of their time in vmscan for
+hours or days on end, with the system completely unresponsive.
+
+The unevictable list addresses the following classes of unevictable pages:
+
+ * Those owned by ramfs.
+
+ * Those mapped into SHM_LOCK'd shared memory regions.
+
+ * Those mapped into VM_LOCKED [mlock()ed] VMAs.
+
+The infrastructure may also be able to handle other conditions that make
+pages unevictable, either by definition or by circumstance, in the future.
+
+
+The Unevictable LRU Page List
+-----------------------------
+
+The Unevictable LRU page list is a lie. It was never an LRU-ordered list,
+but a companion to the LRU-ordered anonymous and file, active and inactive
+page lists; and now it is not even a page list. But following familiar
+convention, here in this document and in the source, we often imagine it
+as a fifth LRU page list.
+
+The Unevictable LRU infrastructure consists of an additional, per-node,
+LRU list called the "unevictable" list and an associated page flag,
+PG_unevictable, to indicate that the page is being managed on the
+unevictable list.
+
+The PG_unevictable flag is analogous to, and mutually exclusive with, the
+PG_active flag in that it indicates on which LRU list a page resides when
+PG_lru is set.
+
+The Unevictable LRU infrastructure maintains unevictable pages as if they
+were on an additional LRU list for a few reasons:
+
+ (1) We get to "treat unevictable pages just like we treat other pages in
+     the system - which means we get to use the same code to manipulate
+     them, the same code to isolate them (for migrate, etc.), the same
+     code to keep track of the statistics, etc..." [Rik van Riel]
+
+ (2) We want to be able to migrate unevictable pages between nodes for
+     memory defragmentation, workload management and memory hotplug. The
+     Linux kernel can only migrate pages that it can successfully isolate
+     from the LRU lists (or "Movable" pages: outside of consideration
+     here). If we were to maintain pages elsewhere than on an LRU-like
+     list, where they can be detected by isolate_lru_page(), we would
+     prevent their migration.
+
+The unevictable list does not differentiate between file-backed and
+anonymous, swap-backed pages. This differentiation is only important
+while the pages are, in fact, evictable.
+
+The unevictable list benefits from the "arrayification" of the per-node
+LRU lists and statistics originally proposed and posted by Christoph
+Lameter.
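+
+For reference, the extra slot appears roughly as follows in the lru_list
+enum (a simplified sketch of include/linux/mmzone.h; the real definition
+derives the values from LRU_BASE arithmetic)::
+
+	enum lru_list {
+		LRU_INACTIVE_ANON,
+		LRU_ACTIVE_ANON,
+		LRU_INACTIVE_FILE,
+		LRU_ACTIVE_FILE,
+		LRU_UNEVICTABLE,	/* the "fifth" LRU list described above */
+		NR_LRU_LISTS
+	};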
+
+
+Memory Control Group Interaction
+--------------------------------
+
+The unevictable LRU facility interacts with the memory control group [aka
+memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by
+extending the lru_list enum.
+
+The memory controller data structure automatically gets a per-node
+unevictable list as a result of the "arrayification" of the per-node LRU
+lists (one per lru_list enum element). The memory controller tracks the
+movement of pages to and from the unevictable list.
+
+When a memory control group comes under memory pressure, the controller
+will not attempt to reclaim pages on the unevictable list. This has a
+couple of effects:
+
+ (1) Because the pages are "hidden" from reclaim on the unevictable list,
+     the reclaim process can be more efficient, dealing only with pages
+     that have a chance of being reclaimed.
+
+ (2) On the other hand, if too many of the pages charged to the control
+     group are unevictable, the evictable portion of the working set of
+     the tasks in the control group may not fit into the available memory.
+     This can cause the control group to thrash or to OOM-kill tasks.
+
+
+.. _mark_addr_space_unevict:
+
+Marking Address Spaces Unevictable
+----------------------------------
+
+For facilities such as ramfs, none of the pages attached to the address
+space may be evicted. To prevent eviction of any such pages, the
+AS_UNEVICTABLE address space flag is provided, and this can be manipulated
+by a filesystem using a number of wrapper functions:
+
+ * ``void mapping_set_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being completely unevictable.
+
+ * ``void mapping_clear_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being evictable.
+
+ * ``int mapping_unevictable(struct address_space *mapping);``
+
+	Query the address space, and return true if it is completely
+	unevictable.
+
+These are currently used in three places in the kernel:
+
+ (1) By ramfs to mark the address spaces of its inodes when they are
+     created, and this mark remains for the life of the inode.
+
+ (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is
+     called. Note that SHM_LOCK is not required to page in the locked
+     pages if they're swapped out; the application must touch the pages
+     manually if it wants to ensure they're in memory.
+
+ (3) By the i915 driver to mark pinned address space until it's unpinned.
+     The amount of unevictable memory marked by the i915 driver is roughly
+     the bounded object size in debugfs/dri/0/i915_gem_objects.
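+
+As an illustration of the first use, a ramfs-like filesystem would mark
+the mapping while setting up a new inode. This is a sketch only; the
+surrounding inode initialization is elided::
+
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		/* pages of this mapping must never be reclaimed */
+		mapping_set_unevictable(inode->i_mapping);
+	}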
+
+
+Detecting Unevictable Pages
+---------------------------
+
+The function page_evictable() in mm/internal.h determines whether a page
+is evictable or not, using the query function outlined above [see section
+:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
+to check the AS_UNEVICTABLE flag.
+
+For address spaces that are so marked after being populated (as SHM
+regions might be), the lock action (e.g. SHM_LOCK) can be lazy, and need
+not populate the page tables for the region as does, for example, mlock(),
+nor need it make any special effort to push any pages in the SHM_LOCK'd
+area to the unevictable list. Instead, vmscan will do this if and when it
+encounters the pages during a reclamation scan.
+
+On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl())
+must scan the pages in the region and "rescue" them from the unevictable
+list if no other condition is keeping them unevictable. If an unevictable
+region is destroyed, the pages are also "rescued" from the unevictable
+list in the process of freeing them.
+
+page_evictable() also checks for mlocked pages by testing an additional
+page flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a
+page is faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
+
+
+Vmscan's Handling of Unevictable Pages
+--------------------------------------
+
+If unevictable pages are culled in the fault path, or moved to the
+unevictable list at mlock() or mmap() time, vmscan will not encounter the
+pages until they have become evictable again (via munlock() for example)
+and have been "rescued" from the unevictable list. However, there may be
+situations where we decide, for the sake of expediency, to leave an
+unevictable page on one of the regular active/inactive LRU lists for
+vmscan to deal with. vmscan checks for such pages in all of the
+shrink_{active|inactive|page}_list() functions and will "cull" such pages
+that it encounters: that is, it diverts those pages to the unevictable
+list for the memory cgroup and node being scanned.
+
+There may be situations where a page is mapped into a VM_LOCKED VMA, but
+the page is not marked as PG_mlocked. Such pages will make it all the way
+to shrink_active_list() or shrink_page_list() where they will be detected
+when vmscan walks the reverse map in page_referenced() or try_to_unmap().
+The page is culled to the unevictable list when it is released by the
+shrinker.
+
+To "cull" an unevictable page, vmscan simply puts the page back on the
+LRU list using putback_lru_page() - the inverse operation to
+isolate_lru_page() - after dropping the page lock. Because the condition
+which makes the page unevictable may change once the page is unlocked,
+__pagevec_lru_add_fn() will recheck the unevictable state of a page
+before placing it on the unevictable list.
+
+
+MLOCKED Pages
+=============
+
+The unevictable page list is also useful for mlock(), in addition to
+ramfs and SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y
+situations; in NOMMU situations, all mappings are effectively mlocked.
+
+
+History
+-------
+
+The "Unevictable mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph
+Lameter to achieve the same objective: hiding mlocked pages from vmscan.
+
+In Nick's patch, he used one of the struct page LRU list link fields as a
+count of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea
+three years earlier). But this use of the link field for a count prevented
+the management of the pages on an LRU list, and thus mlocked pages were
+not migratable as isolate_lru_page() could not detect them, and the LRU
+list link field was not available to the migration subsystem.
+ +Nick resolved this by putting mlocked pages back on the LRU list before +attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When +Nick's patch was integrated with the Unevictable LRU work, the count was +replaced by walking the reverse map when munlocking, to determine whether any +other VM_LOCKED VMAs still mapped the page. + +However, walking the reverse map for each page when munlocking was ugly and +inefficient, and could lead to catastrophic contention on a file's rmap lock, +when many processes which had it mlocked were trying to exit. In 5.18, the +idea of keeping mlock_count in Unevictable LRU list link field was revived and +put to work, without preventing the migration of mlocked pages. This is why +the "Unevictable LRU list" cannot be a linked list of pages now; but there was +no use for that linked list anyway - though its size is maintained for meminfo. + + +Basic Management +---------------- + +mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable +pages. When such a page has been "noticed" by the memory management subsystem, +the page is marked with the PG_mlocked flag. This can be manipulated using the +PageMlocked() functions. + +A PG_mlocked page will be placed on the unevictable list when it is added to +the LRU. Such pages can be "noticed" by memory management in several places: + + (1) in the mlock()/mlock2()/mlockall() system call handlers; + + (2) in the mmap() system call handler when mmapping a region with the + MAP_LOCKED flag; + + (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE + flag; + + (4) in the fault path and when a VM_LOCKED stack segment is expanded; or + + (5) as mentioned above, in vmscan:shrink_page_list() when attempting to + reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap(). + +mlocked pages become unlocked and rescued from the unevictable list when: + + (1) mapped in a range unlocked via the munlock()/munlockall() system calls; + + (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including + unmapping at task exit; + + (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; + or + + (4) before a page is COW'd in a VM_LOCKED VMA. + + +mlock()/mlock2()/mlockall() System Call Handling +------------------------------------------------ + +mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup() +for each VMA in the range specified by the call. In the case of mlockall(), +this is the entire active address space of the task. Note that mlock_fixup() +is used for both mlocking and munlocking a range of memory. A call to mlock() +an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is +treated as a no-op and mlock_fixup() simply returns. + +If the VMA passes some filtering as described in "Filtering Special VMAs" +below, mlock_fixup() will attempt to merge the VMA with its neighbors or split +off a subset of the VMA if the range does not cover the entire VMA. Any pages +already present in the VMA are then marked as mlocked by mlock_page() via +mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). + +Before returning from the system call, do_mlock() or mlockall() will call +__mm_populate() to fault in the remaining pages via get_user_pages() and to +mark those pages as mlocked as they are faulted. + +Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, +get_user_pages() will be unable to fault in the pages. That's okay. 
If pages +do end up getting faulted into this VM_LOCKED VMA, they will be handled in the +fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. + +For each PTE (or PMD) being faulted into a VMA, the page add rmap function +calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED +(unless it is a PTE mapping of a part of a transparent huge page). Or when +it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() +calls mlock_new_page() instead: similar to mlock_page(), but can make better +judgments, since this page is held exclusively and known not to be on LRU yet. + +mlock_page() sets PageMlocked immediately, then places the page on the CPU's +mlock pagevec, to batch up the rest of the work to be done under lru_lock by +__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count +and moves the page to unevictable state ("the unevictable LRU", but with +mlock_count in place of LRU threading). Or if the page was already PageLRU +and PageUnevictable and PageMlocked, it simply increments the mlock_count. + +But in practice that may not work ideally: the page may not yet be on an LRU, or +it may have been temporarily isolated from LRU. In such cases the mlock_count +field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() +returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: +rather than risk stranding a page indefinitely as unevictable, always err with +mlock_count on the low side, so that when munlocked the page will be rescued to +an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a +VM_LOCKED VMA. + + +Filtering Special VMAs +---------------------- + +mlock_fixup() filters several classes of "special" VMAs: + +1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind + these mappings are inherently pinned, so we don't need to mark them as + mlocked. In any case, most of the pages have no struct page in which to so + mark the page. Because of this, get_user_pages() will fail for these VMAs, + so there is no sense in attempting to visit them. + +2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We + neither need nor want to mlock() these pages. But __mm_populate() includes + hugetlbfs ranges, allocating the huge pages and populating the PTEs. + +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. These pages are inherently + unevictable and are not managed on the LRU lists. __mm_populate() includes + these ranges, populating the PTEs if not already populated. + +4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate() + includes these ranges, populating the PTEs if not already populated. + +Note that for all of these special VMAs, mlock_fixup() does not set the +VM_LOCKED flag. Therefore, we won't have to deal with them later during +munlock(), munmap() or task exit. Neither does mlock_fixup() account these +VMAs against the task's "locked_vm". + + +munlock()/munlockall() System Call Handling +------------------------------------------- + +The munlock() and munlockall() system calls are handled by the same +mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are. +If called to munlock an already munlocked VMA, mlock_fixup() simply returns. +Because of the VMA filtering discussed above, VM_LOCKED will not be set in +any "special" VMAs. So, those VMAs will be ignored for munlock. 
+
+If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split
+off the specified range. All pages in the VMA are then munlocked by
+munlock_page() via mlock_pte_range() via walk_page_range() via
+mlock_vma_pages_range() - the same function used when mlocking a VMA
+range, with new flags for the VMA indicating that munlock() is being
+performed.
+
+munlock_page() uses the mlock pagevec to batch up work to be done under
+lru_lock by __munlock_page(). __munlock_page() decrements the page's
+mlock_count, and when that reaches 0 it clears PageMlocked and clears
+PageUnevictable, moving the page from unevictable state to inactive LRU.
+
+But in practice that may not work ideally: the page may not yet have
+reached "the unevictable LRU", or it may have been temporarily isolated
+from it. In those cases its mlock_count field is unusable and must be
+assumed to be 0: so that the page will be rescued to an evictable LRU,
+then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA.
+
+
+Migrating MLOCKED Pages
+-----------------------
+
+A page that is being migrated has been isolated from the LRU lists and is
+held locked across unmapping of the page, updating the page's address
+space entry and copying the contents and state, until the page table entry
+has been replaced with an entry that refers to the new page. Linux
+supports migration of mlocked pages and other unevictable pages.
+PG_mlocked is cleared from the old page when it is unmapped from the last
+VM_LOCKED VMA, and set when the new page is mapped in place of the
+migration entry in a VM_LOCKED VMA. If the page was unevictable because
+it was mlocked, PG_unevictable follows PG_mlocked; but if the page was
+unevictable for other reasons, PG_unevictable is copied explicitly.
+
+Note that page migration can race with mlocking or munlocking of the same
+page. There is mostly no problem since page migration requires unmapping
+all PTEs of the old page (including munlock where VM_LOCKED), then mapping
+in the new page (including mlock where VM_LOCKED). The page table locks
+provide sufficient synchronization.
+
+However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a
+VMA, before mlocking any pages already present, if one of those pages were
+migrated before mlock_pte_range() reached it, it would get counted twice
+in mlock_count. To prevent that, mlock_vma_pages_range() temporarily marks
+the VMA as VM_IO, so that mlock_vma_page() will skip it.
+
+To complete page migration, we place the old and new pages back onto the
+LRU afterwards. The "unneeded" page - the old page on success, the new
+page on failure - is freed when the reference count held by the migration
+process is released.
+
+
+Compacting MLOCKED Pages
+------------------------
+
+The memory map can be scanned for compactable regions, and the default
+behavior is to let unevictable pages be moved.
+/proc/sys/vm/compact_unevictable_allowed controls this behavior (see
+Documentation/admin-guide/sysctl/vm.rst). The work of compaction is mostly
+handled by the page migration code, and the same work flow as described
+in Migrating MLOCKED Pages will apply.
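+
+For example, an administrator who does not want compaction to move
+mlocked pages can clear that knob (it defaults to 1, i.e. allowed, on
+typical builds)::
+
+	# cat /proc/sys/vm/compact_unevictable_allowed
+	1
+	# echo 0 > /proc/sys/vm/compact_unevictable_allowed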
+
+
+MLOCKING Transparent Huge Pages
+-------------------------------
+
+A transparent huge page is represented by a single entry on an LRU list.
+Therefore, we can only make unevictable an entire compound page, not
+individual subpages.
+
+If a user tries to mlock() part of a huge page, and no user mlock()s the
+whole of the huge page, we want the rest of the page to be reclaimable.
+
+We cannot just split the page on partial mlock() as split_huge_page() can
+fail, and a new intermittent failure mode for the syscall is undesirable.
+
+We handle this by keeping PTE-mlocked huge pages on evictable LRU lists:
+the PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
+
+This way the huge page is accessible to vmscan. Under memory pressure the
+page will be split, subpages which belong to VM_LOCKED VMAs will be moved
+to the unevictable LRU and the rest can be reclaimed.
+
+/proc/meminfo's Unevictable and Mlocked amounts do not include those parts
+of a transparent huge page which are mapped only by PTEs in VM_LOCKED
+VMAs.
+
+
+mmap(MAP_LOCKED) System Call Handling
+-------------------------------------
+
+In addition to the mlock(), mlock2() and mlockall() system calls, an
+application can request that a region of memory be mlocked by supplying
+the MAP_LOCKED flag to the mmap() call. There is one important and subtle
+difference here, though. mmap() + mlock() will fail if the range cannot
+be faulted in (e.g. because mm_populate fails), returning ENOMEM, while
+mmap(MAP_LOCKED) will not fail. The mmapped area will still have
+properties of the locked area - pages will not get swapped out - but
+major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a
+task that has previously called mlockall() with the MCL_FUTURE flag will
+result in the newly mapped memory being mlocked. Before the
+unevictable/mlock changes, the kernel simply called make_pages_present()
+to allocate pages and populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure,
+the mmap() handler and task address space expansion functions call
+populate_vma_page_range(), specifying the vma and the address range to
+mlock.
+
+
+munmap()/exit()/exec() System Call Handling
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call
+to munmap() or via an internal unmap from exit() or exec() processing, we
+must munlock the pages if we're removing the last VM_LOCKED VMA that maps
+the pages. Before the unevictable/mlock changes, mlocking did not mark
+the pages in any way, so unmapping them required no processing.
+
+For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
+munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
+(unless it was a PTE mapping of a part of a transparent huge page).
+
+munlock_page() uses the mlock pagevec to batch up work to be done under
+lru_lock by __munlock_page(). __munlock_page() decrements the page's
+mlock_count, and when that reaches 0 it clears PageMlocked and clears
+PageUnevictable, moving the page from unevictable state to inactive LRU.
+
+But in practice that may not work ideally: the page may not yet have
+reached "the unevictable LRU", or it may have been temporarily isolated
+from it. In those cases its mlock_count field is unusable and must be
+assumed to be 0: so that the page will be rescued to an evictable LRU,
+then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA.
+
+
+Truncating MLOCKED Pages
+------------------------
+
+File truncation or hole punching forcibly unmaps the deleted pages from
+userspace; truncation even unmaps and deletes any private anonymous pages
+which had been Copied-On-Write from the file pages now being truncated.
+ +Mlocked pages can be munlocked and deleted in this way: like with munmap(), +for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls +munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +(unless it was a PTE mapping of a part of a transparent huge page). + +However, if there is a racing munlock(), since mlock_vma_pages_range() starts +munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages +present, if one of those pages were unmapped by truncation or hole punch before +mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, +and would not be counted out of mlock_count. In this rare case, a page may +still appear as PageMlocked after it has been fully unmapped: and it is left to +release_pages() (or __page_cache_release()) to clear it and update statistics +before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, +which is usually 0). + + +Page Reclaim in shrink_*_list() +------------------------------- + +vmscan's shrink_active_list() culls any obviously unevictable pages - +i.e. !page_evictable(page) pages - diverting those to the unevictable list. +However, shrink_active_list() only sees unevictable pages that made it onto the +active/inactive LRU lists. Note that these pages do not have PageUnevictable +set - otherwise they would be on the unevictable list and shrink_active_list() +would never see them. + +Some examples of these unevictable pages on the LRU lists are: + + (1) ramfs pages that have been placed on the LRU lists when first allocated. + + (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to + allocate or fault in the pages in the shared memory region. This happens + when an application accesses the page the first time after SHM_LOCK'ing + the segment. + + (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, + but events left mlock_count too low, so they were munlocked too early. + +vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously +unevictable pages found on the inactive lists to the appropriate memory cgroup +and node unevictable list. + +rmap's page_referenced_one(), called via vmscan's shrink_active_list() or +shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), +check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() +to correct them. Such pages are culled to the unevictable list when released +by the shrinker. diff --git a/Documentation/mm/vmalloc.rst b/Documentation/mm/vmalloc.rst new file mode 100644 index 000000000000..363fe20d6b9f --- /dev/null +++ b/Documentation/mm/vmalloc.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================== +Virtually Contiguous Memory Allocation +====================================== diff --git a/Documentation/mm/vmalloced-kernel-stacks.rst b/Documentation/mm/vmalloced-kernel-stacks.rst new file mode 100644 index 000000000000..fc8c67833af6 --- /dev/null +++ b/Documentation/mm/vmalloced-kernel-stacks.rst @@ -0,0 +1,153 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Virtually Mapped Kernel Stack Support +===================================== + +:Author: Shuah Khan + +.. 
contents:: :local:
+
+Overview
+--------
+
+This is a compilation of information from the code and the original patch
+series that introduced the Virtually Mapped Kernel Stacks feature.
+
+Introduction
+------------
+
+Kernel stack overflows are often hard to debug and make the kernel
+susceptible to exploits. Problems could show up at a later time, making
+it difficult to isolate and root-cause.
+
+Virtually mapped kernel stacks with guard pages cause kernel stack
+overflows to be caught immediately rather than causing
+difficult-to-diagnose corruption.
+
+The HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
+support for virtually mapped stacks with guard pages. This feature
+causes reliable faults when the stack overflows. The usability of
+the stack trace after overflow and the response to the overflow itself
+is architecture dependent.
+
+.. note::
+   As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
+   support for VMAP_STACK.
+
+HAVE_ARCH_VMAP_STACK
+--------------------
+
+Architectures that can support Virtually Mapped Kernel Stacks should
+enable this bool configuration option. The requirements are:
+
+- vmalloc space must be large enough to hold many kernel stacks. This
+  may rule out many 32-bit architectures.
+- Stacks in vmalloc space need to work reliably. For example, if
+  vmap page tables are created on demand, either this mechanism
+  needs to work while the stack points to a virtual address with
+  unpopulated page tables or arch code (switch_to() and switch_mm(),
+  most likely) needs to ensure that the stack's page table entries
+  are populated before running on a possibly unpopulated stack.
+- If the stack overflows into a guard page, something reasonable
+  should happen. The definition of "reasonable" is flexible, but
+  instantly rebooting without logging anything would be unfriendly.
+
+VMAP_STACK
+----------
+
+The VMAP_STACK bool configuration option, when enabled, allocates
+virtually mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
+
+- Enable this if you want to use virtually-mapped kernel stacks
+  with guard pages. This causes kernel stack overflows to be caught
+  immediately rather than causing difficult-to-diagnose corruption.
+
+.. note::
+
+   Using this feature with KASAN requires architecture support
+   for backing virtual mappings with real shadow memory, and
+   KASAN_VMALLOC must be enabled.
+
+.. note::
+
+   When VMAP_STACK is enabled, it is not possible to run DMA on stack
+   allocated data.
+
+Kernel configuration options and dependencies keep changing. Refer to
+the latest code base: the Kconfig definitions (arch/Kconfig).
+
+Allocation
+----------
+
+When a new kernel thread is created, a thread stack is allocated from
+virtually contiguous memory pages from the page level allocator. These
+pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
+protections.
+
+alloc_thread_stack_node() calls __vmalloc_node_range() to allocate the
+stack with PAGE_KERNEL protections, roughly as sketched below.
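+
+This is a simplified sketch of the kernel/fork.c logic, not a verbatim
+copy; ``node`` is the preferred NUMA node for the allocation::
+
+	void *stack;
+
+	/*
+	 * Allocate THREAD_SIZE of virtually contiguous memory anywhere in
+	 * the vmalloc area. Memcg accounting is done manually when a stack
+	 * is assigned to a task, so __GFP_ACCOUNT is masked out here.
+	 */
+	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
+				     VMALLOC_START, VMALLOC_END,
+				     THREADINFO_GFP & ~__GFP_ACCOUNT,
+				     PAGE_KERNEL, 0, node,
+				     __builtin_return_address(0));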
+
+- Allocated stacks are cached and later reused by new threads, so memcg
+  accounting is performed manually on assigning/releasing stacks to tasks.
+  Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
+- vm_struct is cached so that the stack can be found when thread free is
+  initiated in interrupt context; free_thread_stack() can be called in
+  interrupt context.
+- On arm64, all VMAP'd stacks need to have the same alignment to ensure
+  that VMAP'd stack overflow detection works correctly. The arch-specific
+  vmap stack allocator takes care of this detail.
+- This does not address interrupt stacks - according to the original patch.
+
+Thread stack allocation is initiated from clone(), fork(), vfork() and
+kernel_thread() via kernel_clone(). These are a few hints for searching
+the code base to understand when and how a thread stack is allocated.
+
+The bulk of the code is in ``kernel/fork.c``.
+
+The stack_vm_area pointer in task_struct keeps track of the virtually
+allocated stack, and a non-NULL stack_vm_area pointer serves as an
+indication that the virtually mapped kernel stacks are enabled.
+
+::
+
+	struct vm_struct *stack_vm_area;
+
+Stack overflow handling
+-----------------------
+
+Leading and trailing guard pages help detect stack overflows. When the
+stack overflows into the guard pages, handlers have to be careful not to
+overflow the stack again. When handlers are called, it is likely that
+very little stack space is left.
+
+On x86, this is done by handling the page fault indicating the kernel
+stack overflow on the double-fault stack.
+
+Testing VMAP allocation with guard pages
+----------------------------------------
+
+How do we ensure that VMAP_STACK is actually allocating with a leading
+and trailing guard page? The following lkdtm tests can help detect any
+regressions.
+
+::
+
+	void lkdtm_STACK_GUARD_PAGE_LEADING()
+	void lkdtm_STACK_GUARD_PAGE_TRAILING()
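+
+With CONFIG_LKDTM enabled, these tests can be triggered at runtime through
+the lkdtm debugfs interface. Each write below is expected to crash the
+kernel in a guard page, so only try this on a disposable test machine::
+
+	# echo STACK_GUARD_PAGE_LEADING > /sys/kernel/debug/provoke-crash/DIRECT
+	# echo STACK_GUARD_PAGE_TRAILING > /sys/kernel/debug/provoke-crash/DIRECT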
+
+Conclusions
+-----------
+
+- A percpu cache of vmalloced stacks appears to be a bit faster than a
+  high-order stack allocation, at least when the cache hits.
+- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
+  simply embeds the thread_info (containing only flags) and 'int cpu' into
+  task_struct.
+- The thread stack can be freed as soon as the task is dead (without
+  waiting for RCU) and then, if vmapped stacks are in use, the entire
+  stack can be cached for reuse on the same cpu.
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
new file mode 100644
index 000000000000..c9c495f62d12
--- /dev/null
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+A vmemmap diet for HugeTLB and Device DAX
+=========================================
+
+HugeTLB
+=======
+
+The struct page structures (page structs) are used to describe a physical
+page frame. By default, there is a one-to-one mapping from a page frame to
+its corresponding page struct.
+
+HugeTLB pages consist of multiple base page size pages and are supported
+by many architectures. See Documentation/admin-guide/mm/hugetlbpage.rst
+for more details. On the x86-64 architecture, HugeTLB pages of size 2MB
+and 1GB are currently supported. Since the base page size on x86 is 4KB, a
+2MB HugeTLB page consists of 512 base pages and a 1GB HugeTLB page
+consists of 262144 base pages. For each base page, there is a
+corresponding page struct.
+
+Within the HugeTLB subsystem, only the first 4 page structs are used to
+contain unique information about a HugeTLB page. __NR_USED_SUBPAGE
+provides this upper limit. The only 'useful' information in the remaining
+page structs is the compound_head field, and this field is the same for
+all tail pages.
+
+By removing redundant page structs for HugeTLB pages, memory can be
+returned to the buddy allocator for other uses.
+
+Different architectures support different HugeTLB pages. For example, the
+following table lists the HugeTLB page sizes supported by the x86 and
+arm64 architectures. Because arm64 supports 4k, 16k, and 64k base pages
+as well as contiguous entries, it supports many HugeTLB page sizes.
+
++--------------+-----------+-----------------------------------------------+
+| Architecture | Page Size |                HugeTLB Page Size              |
++--------------+-----------+-----------+-----------+-----------+-----------+
+| x86-64       | 4KB       | 2MB       | 1GB       |           |           |
++--------------+-----------+-----------+-----------+-----------+-----------+
+|              | 4KB       | 64KB      | 2MB       | 32MB      | 1GB       |
+|              +-----------+-----------+-----------+-----------+-----------+
+| arm64        | 16KB      | 2MB       | 32MB      | 1GB       |           |
+|              +-----------+-----------+-----------+-----------+-----------+
+|              | 64KB      | 2MB       | 512MB     | 16GB      |           |
++--------------+-----------+-----------+-----------+-----------+-----------+
+
+When the system boots up, every HugeTLB page is described by more than one
+struct page struct, whose total size is (unit: pages)::
+
+   struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+
+Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
+of a HugeTLB page is always n times PAGE_SIZE. So we can get the following
+relationship::
+
+   HugeTLB_Size = n * PAGE_SIZE
+
+Then::
+
+   struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+               = n * sizeof(struct page) / PAGE_SIZE
+
+We can use huge mappings at the pud/pmd level for the HugeTLB page.
+
+For a HugeTLB page with a pmd level mapping::
+
+   struct_size = n * sizeof(struct page) / PAGE_SIZE
+               = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
+               = sizeof(struct page) / sizeof(pte_t)
+               = 64 / 8
+               = 8 (pages)
+
+Where n is the number of pte entries one page can contain, i.e. n equals
+(PAGE_SIZE / sizeof(pte_t)).
+
+This optimization only supports 64-bit systems, so sizeof(pte_t) is 8. The
+optimization is also applicable only when the size of struct page is a
+power of two. In most cases, the size of struct page is 64 bytes (e.g.
+x86-64 and arm64). So if we use a pmd level mapping for a HugeTLB page,
+its struct page structs occupy 8 page frames, whose size depends on the
+size of the base page.
+
+For a HugeTLB page with a pud level mapping::
+
+   struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
+               = PAGE_SIZE / 8 * 8 (pages)
+               = PAGE_SIZE (pages)
+
+Where struct_size(pmd) is the size of the struct page structs of a HugeTLB
+page with a pmd level mapping.
+
+E.g. the struct page structs of a 2MB HugeTLB page on x86_64 occupy 8 page
+frames, while those of a 1GB HugeTLB page occupy 4096.
+
+Next, we take the pmd level mapping of the HugeTLB page as an example to
+show the internal implementation of this optimization. There are 8 pages
+of struct page structs associated with a pmd-mapped HugeTLB page.
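+
+The arithmetic above can be sanity-checked with a few lines of C. This is
+an illustrative user-space calculation, not kernel code; the 64-byte
+struct page size is the assumption taken from the text::
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		const unsigned long page_size = 4096;       /* x86-64 base page */
+		const unsigned long struct_page = 64;       /* sizeof(struct page) */
+		const unsigned long hpage_size = 2UL << 20; /* 2MB HugeTLB page */
+
+		unsigned long n = hpage_size / page_size;   /* 512 base pages */
+		unsigned long vmemmap_pages = n * struct_page / page_size;
+
+		/* prints "8 vmemmap pages, 7 freeable" */
+		printf("%lu vmemmap pages, %lu freeable\n",
+		       vmemmap_pages, vmemmap_pages - 1);
+		return 0;
+	}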
+
+Here is how things look before optimization::
+
+ HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | -------------> |     2     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     3     | -------------> |     3     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     4     | -------------> |     4     |
+ |    PMD    |                     +-----------+                +-----------+
+ |   level   |                     |     5     | -------------> |     5     |
+ |  mapping  |                     +-----------+                +-----------+
+ |           |                     |     6     | -------------> |     6     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     7     | -------------> |     7     |
+ |           |                     +-----------+                +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+The value of page->compound_head is the same for all tail pages. The first
+page of the page structs (page 0) associated with the HugeTLB page
+contains the 4 page structs necessary to describe the HugeTLB page. The
+only use of the remaining pages of the page structs (page 1 to page 7) is
+to point to page->compound_head. Therefore, we can remap pages 1 to 7 to
+page 0. Only 1 page of page structs will be used for each HugeTLB page.
+This will allow us to free the remaining 7 pages to the buddy allocator.
+
+Here is how things look after remapping::
+
+ HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                  | | | | | |
+ |           |                     |     2     | -----------------+ | | | | |
+ |           |                     +-----------+                    | | | | |
+ |           |                     |     3     | -------------------+ | | | |
+ |           |                     +-----------+                      | | | |
+ |           |                     |     4     | ---------------------+ | | |
+ |    PMD    |                     +-----------+                        | | |
+ |   level   |                     |     5     | -----------------------+ | |
+ |  mapping  |                     +-----------+                          | |
+ |           |                     |     6     | -------------------------+ |
+ |           |                     +-----------+                            |
+ |           |                     |     7     | ---------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+When a HugeTLB page is freed to the buddy system, we should allocate 7
+pages for the vmemmap pages and restore the previous mapping relationship.
+
+For a HugeTLB page with a pud level mapping, the approach is similar, and
+we can also use it to free (PAGE_SIZE - 1) vmemmap pages.
+
+Apart from HugeTLB pages with a pmd/pud level mapping, some architectures
+(e.g. aarch64) provide a contiguous bit in the translation table entries
+that hints to the MMU that the entry is one of a contiguous set of entries
+that can be cached in a single TLB entry.
+
+The contiguous bit is used to increase the mapping size at the pmd and pte
+(last) level. So this type of HugeTLB page can be optimized only when the
+size of its struct page structs is greater than one page.
+
+Notice: The head vmemmap page is not freed to the buddy allocator and all
+tail vmemmap pages are mapped to the head vmemmap page frame. So we can
+see more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB
+page) associated with each HugeTLB page. compound_head() can handle this
+correctly (for more details, see the comment above compound_head()).
+
+Device DAX
+==========
+
+The device-dax interface uses the same tail deduplication technique
+explained in the previous chapter, except when used with the vmemmap in
+the device (altmap).
+
+The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
+PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+
+The differences with HugeTLB are relatively minor.
+
+It only uses 3 page structs for storing all information as opposed to 4 on
+HugeTLB pages.
+
+There's no remapping of vmemmap given that device-dax memory is not part
+of System RAM ranges initialized at boot. Thus the tail page deduplication
+happens at a later stage when we populate the sections. HugeTLB reuses the
+head vmemmap page, whereas device-dax reuses the tail vmemmap page. This
+results in only half of the savings compared to HugeTLB.
+
+Deduplicated tail pages are not mapped read-only.
+
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PMD    |                     +-----------+                       | | |
+ |   level   |                     |     5     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     6     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |     7     | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
diff --git a/Documentation/mm/z3fold.rst b/Documentation/mm/z3fold.rst
new file mode 100644
index 000000000000..224e3c61d686
--- /dev/null
+++ b/Documentation/mm/z3fold.rst
@@ -0,0 +1,30 @@
+.. _z3fold:
+
+======
+z3fold
+======
+
+z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page.
+It is a zbud derivative which allows for a higher compression ratio while
+keeping the simplicity and determinism of its predecessor.
+
+The main differences between z3fold and zbud are:
+
+* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
+* z3fold can hold up to 3 compressed pages in its page
+* z3fold doesn't export any API itself and is thus intended to be used
+  via the zpool API.
+
+To keep the determinism and simplicity, z3fold, just like zbud, always
+stores an integral number of compressed pages per page, but it can store
+up to 3 pages, unlike zbud, which can store at most 2. Therefore the
+compression ratio goes up to around 2.7x while zbud's is around 1.7x.
+
+Unlike zbud (but like zsmalloc for that matter), z3fold_alloc() does not
+return a dereferenceable pointer. Instead, it returns an unsigned long
+handle which encodes the actual location of the allocated object.
+
+Keeping an effective compression ratio close to zsmalloc's, z3fold does
+not depend on the MMU being enabled and provides more predictable reclaim
+behavior, which makes it a better fit for small and response-critical
+systems.
diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst
new file mode 100644
index 000000000000..6e79893d6132
--- /dev/null
+++ b/Documentation/mm/zsmalloc.rst
@@ -0,0 +1,82 @@
+.. _zsmalloc:
+
+========
+zsmalloc
+========
+
+This allocator is designed for use with zram. Thus, the allocator is
+supposed to work well under low memory conditions. In particular, it
+never attempts higher order page allocation, which is very likely to
+fail under memory pressure. On the other hand, if we just use single
+(0-order) pages, it would suffer from very high fragmentation --
+any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+This was one of the major issues with its predecessor (xvmalloc).
+
+To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+and links them together using various 'struct page' fields. These linked
+pages act as a single higher-order page, i.e. an object can span 0-order
+page boundaries. The code refers to these linked pages as a single entity
+called a zspage.
+
+For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+since this satisfies the requirements of all its current users (in the
+worst case, the page is incompressible and is thus stored "as-is", i.e.
+in uncompressed form). For allocation requests larger than this size,
+failure is returned (see zs_malloc).
+
+Additionally, zs_malloc() does not return a dereferenceable pointer.
+Instead, it returns an opaque handle (unsigned long) which encodes the
+actual location of the allocated object. The reason for this indirection
+is that zsmalloc does not keep zspages permanently mapped, since that
+would cause issues on 32-bit systems where the VA region for kernel space
+mappings is very small. So, before using the allocated memory, the object
+has to be mapped using zs_map_object() to get a usable pointer, and
+subsequently unmapped using zs_unmap_object().
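+
+A minimal in-kernel usage sketch (error handling trimmed; the exact
+zs_malloc() failure convention has varied between kernel versions, so
+treat this as illustrative)::
+
+	struct zs_pool *pool = zs_create_pool("example");
+	unsigned long handle = zs_malloc(pool, len, GFP_KERNEL);
+
+	if (handle) {
+		/* map the opaque handle to get a short-lived pointer */
+		void *dst = zs_map_object(pool, handle, ZS_MM_WO);
+
+		memcpy(dst, buf, len);
+		zs_unmap_object(pool, handle);
+		zs_free(pool, handle);
+	}
+	zs_destroy_pool(pool);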
+
+stat
+====
+
+With CONFIG_ZSMALLOC_STAT, we can see zsmalloc internal information via
+``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output::
+
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
+    ...
+    ...
+     9   176           0            1           186        129          8                4
+    10   192           1            0          2880       2872        135                3
+    11   208           0            1           819        795         42                2
+    12   224           0            1           219        159         12                4
+    ...
+    ...
+
+
+class
+	index
+size
+	object size zspage stores
+almost_empty
+	the number of ZS_ALMOST_EMPTY zspages (see below)
+almost_full
+	the number of ZS_ALMOST_FULL zspages (see below)
+obj_allocated
+	the number of objects allocated
+obj_used
+	the number of objects allocated to the user
+pages_used
+	the number of pages allocated for the class
+pages_per_zspage
+	the number of 0-order pages needed to make a zspage
+
+We assign a zspage to the ZS_ALMOST_EMPTY fullness group when n <= N / f,
+where
+
+* n = number of allocated objects
+* N = total number of objects the zspage can store
+* f = fullness_threshold_frac (i.e., 4 at the moment)
+
+Similarly, we assign a zspage to:
+
+* ZS_ALMOST_FULL  when n > N / f
+* ZS_EMPTY        when n == 0
+* ZS_FULL         when n == N
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
index 0c8276109fc0..30c69e1f44fe 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
@@ -13,7 +13,7 @@
 监测数据访问
 ============
 
-:doc:`DAMON ` 允许轻量级的数据访问监测。使用DAMON,
+:doc:`DAMON ` 允许轻量级的数据访问监测。使用DAMON,
 用户可以分析他们系统的内存访问模式,并优化它们。
 
 .. toctree::
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
index 1500bdbf338a..c976f3e33ffd 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
@@ -229,4 +229,4 @@ DAMON_RECLAIM再次什么都不做,这样我们就可以退回到基于LRU列
 .. [1] https://research.google/pubs/pub48551/
 .. [2] https://lwn.net/Articles/787611/
 
-.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
+.. 
[3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index eee0e8c5c368..cd41ada4fdad 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -33,9 +33,9 @@ DAMON 为不同的用户提供了下面这些接口。 口相同。这将在下一个LTS内核发布后被移除,所以用户应该转移到 :ref:`sysfs interface `。 - *内核空间编程接口。* - :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 + :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 核空间的DAMON应用程序,最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。 - 详细情况请参考接口 :doc:`文件 `。 + 详细情况请参考接口 :doc:`文件 `。 sysfs接口 ========= @@ -148,7 +148,7 @@ contexts//monitoring_attrs/ 在 ``nr_regions`` 目录下,有两个文件分别用于DAMON监测区域的下限和上限(``min`` 和 ``max`` ), 这两个文件控制着监测的开销。你可以通过向这些文件的写入和读出来设置和获取这些值。 -关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/vm/damon/design`)。 +关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/mm/damon/design`)。 contexts//targets/ --------------------- @@ -318,7 +318,7 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, ---- 用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔`` -以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/vm/damon/design` 。例如, +以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如, 下面的命令将这些值设置为5ms、100ms、1000ms、10和1000,然后再次检查:: # cd /damon diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst index 26d9913fc8b6..b03020c8b2ab 100644 --- a/Documentation/translations/zh_CN/core-api/index.rst +++ b/Documentation/translations/zh_CN/core-api/index.rst @@ -101,7 +101,7 @@ Todolist: ======== 如何在内核中分配和使用内存。请注意,在 -:doc:`/vm/index` 中有更多的内存管理文档。 +:doc:`/mm/index` 中有更多的内存管理文档。 .. toctree:: :maxdepth: 1 diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index ad7bb8c17562..bf85baca8b3e 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -118,7 +118,7 @@ TODOList: sound/index filesystems/index scheduler/index - vm/index + mm/index peci/index TODOList: diff --git a/Documentation/translations/zh_CN/mm/active_mm.rst b/Documentation/translations/zh_CN/mm/active_mm.rst new file mode 100644 index 000000000000..c2816f523bd7 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/active_mm.rst @@ -0,0 +1,85 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/active_mm.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +========= +Active MM +========= + +这是一封linux之父回复开发者的一封邮件,所以翻译时我尽量保持邮件格式的完整。 + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + 因为我并不经常写解释,所以已经抄送到linux-kernel邮件列表,而当我做这些, + 且更多的人在阅读它们时,我觉得棒极了。 + + 1999年7月30日 星期五, David Mosberger 写道: + > + > 是否有一个简短的描述,说明task_struct中的 + > "mm" 和 "active_mm"应该如何使用? 
(如果 + > 这个问题在邮件列表中讨论过,我表示歉意--我刚 + > 刚度假回来,有一段时间没能关注linux-kernel了)。 + + 基本上,新的设定是: + + - 我们有“真实地址空间”和“匿名地址空间”。区别在于,匿名地址空间根本不关心用 + 户级页表,所以当我们做上下文切换到匿名地址空间时,我们只是让以前的地址空间 + 处于活动状态。 + + 一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线 + 程基本上都属于这一类,但即使是“真正的”线程也可以暂时说在一定时间内它们不 + 会对用户空间感兴趣,调度器不妨试着避免在切换VM状态上浪费时间。目前只有老 + 式的bdflush sync能做到这一点。 + + - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说,tsk->mm将是NULL, + 其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。 + + - 然而,我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此,我们 + 有 “tsk->active_mm”,它显示了当前活动的地址空间是什么。 + + 规则是,对于一个有真实地址空间的进程(即tsk->mm是 non-NULL),active_mm + 显然必须与真实的mm相同。 + + 对于一个匿名进程,tsk->mm == NULL,而tsk->active_mm是匿名进程运行时 + “借用”的mm。当匿名进程被调度走时,借用的地址空间被返回并清除。 + + 为了支持所有这些,“struct mm_struct”现在有两个计数器:一个是 “mm_users” + 计数器,即有多少 “真正的地址空间用户”,另一个是 “mm_count”计数器,即 “lazy” + 用户(即匿名用户)的数量,如果有任何真正的用户,则加1。 + + 通常情况下,至少有一个真正的用户,但也可能是真正的用户在另一个CPU上退出,而 + 一个lazy的用户仍在活动,所以你实际上得到的情况是,你有一个地址空间 **只** + 被lazy的用户使用。这通常是一个短暂的生命周期状态,因为一旦这个线程被安排给一 + 个真正的线程,这个 “僵尸” mm就会被释放,因为 “mm_count”变成了零。 + + 另外,一个新的规则是,**没有人** 再把 “init_mm” 作为一个真正的MM了。 + “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”,事实上,它主 + 要是在启动时使用,当时还没有真正的VM被创建。因此,用来检查的代码 + + if (current->mm == &init_mm) + + 一般来说,应该用 + + if (!current->mm) + + 取代上面的写法(这更有意义--测试基本上是 “我们是否有一个用户环境”,并且通常 + 由缺页异常处理程序和类似的东西来完成)。 + + 总之,我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1,因为它稍微改 + 变了接口以适配alpha(谁会想到呢,但alpha体系结构上下文切换代码实际上最终是 + 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的,alpha的PALcode将两者 + 连接起来,你需要同时切换两者)。 + + (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/translations/zh_CN/mm/balance.rst b/Documentation/translations/zh_CN/mm/balance.rst new file mode 100644 index 000000000000..6fd79209c307 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/balance.rst @@ -0,0 +1,81 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/balance.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +内存平衡 +======== + +2000年1月开始,作者:Kanoj Sarcar + +对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配,需要进行 +内存平衡。 + +调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个 +原因可能是,调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退 +选项的机会主义高阶分配请求中。在这种情况下,调用者可能也希望避免唤醒kswapd。 + +__GFP_IO分配请求是为了防止文件系统死锁。 + +在没有非睡眠分配请求的情况下,做平衡似乎是有害的。页面回收可以被懒散地启动,也就是 +说,只有在需要的时候(也就是区域的空闲内存为0),而不是让它成为一个主动的过程。 + +也就是说,内核应该尝试从直接映射池中满足对直接映射页的请求,而不是回退到dma池中, +这样就可以保持dma池为dma请求(不管是不是原子的)所填充。类似的争论也适用于高内存 +和直接映射的页面。相反,如果有很多空闲的dma页,最好是通过从dma池中分配一个来满足 +常规的内存请求,而不是产生常规区域平衡的开销。 + +在2.2中,只有当空闲页总数低于总内存的1/64时,才会启动内存平衡/页面回收。如果dma +和常规内存的比例合适,即使dma区完全空了,也很可能不会进行平衡。2.2已经在不同内存 +大小的生产机器上运行,即使有这个问题存在,似乎也做得不错。在2.3中,由于HIGHMEM的 +存在,这个问题变得更加严重。 + +在2.3中,区域平衡可以用两种方式之一来完成:根据区域的大小(可能是低级区域的大小), +我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是,在平衡的时 +候,我们不需要看低级区的大小,坏的方面是,我们可能会因为忽略低级区可能较低的使用率 +而做过于频繁的平衡。另外,只要对分配程序稍作修改,就有可能将memclass()宏简化为一 +个简单的等式。 + +另一个可能的解决方案是,我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其 +低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题,并尽可能地保持了与2.2行为 +的接近。另外,平衡算法在各种架构上的工作方式也是一样的,这些架构有不同数量和类型的 +内存区。如果我们想变得更花哨一点,我们可以在未来为不同区域的自由页面分配不同的权重。 + +请注意,如果普通区的大小与dma区相比是巨大的,那么在决定是否平衡普通区的时候,考虑 +空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。 + +所附的补丁实现了第二个解决方案。它还 “修复”了两个问题:首先,在低内存条件下,kswapd +被唤醒,就像2.2中的非睡眠分配。第二,HIGHMEM区也被平衡了,以便给replace_with_highmem() +一个争取获得HIGHMEM页的机会,同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM +页不会被泄露(例如,在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下)。 + +kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的,可能 +是因为所有的分配请求都来自中断上下文,而所有的进程上下文都在睡眠。对于2.3, +kswapd并不真正需要平衡高内存区,因为中断上下文并不请求高内存页。kswapd看zone +结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。 + +如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力,而该区的内存压力 +已经低于其水位,则会进行偷取。 + +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: +这些是每个区的字段,用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时, 
+hysteric 的字段low_on_memory被设置。这个字段会一直被设置,直到空闲页数变成水位 +[WMARK_HIGH]。当low_on_memory被设置时,页面分配请求将尝试释放该区域的一些页面(如果 +请求中设置了GFP_WAIT)。与此相反的是,决定唤醒kswapd以释放一些区的页。这个决定不是基于 +hysteresis 的,而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行;在这种情况下, +zone_wake_kswapd也被设置。 + + +我所听到的(超棒的)想法: + +1. 动态经历应该影响平衡:可以跟踪一个区的失败请求的数量,并反馈到平衡方案中(jalvo@mbay.net)。 + +2. 实现一个类似于replace_with_highmem()的replace_with_regular(),以保留dma页面。 + (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/translations/zh_CN/mm/damon/api.rst b/Documentation/translations/zh_CN/mm/damon/api.rst new file mode 100644 index 000000000000..5593a83c86bc --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/api.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/api.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======= +API参考 +======= + +内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` , +它位于源代码树的include/linux/。 + +结构体 +====== + +该API在以下内核代码中: + +include/linux/damon.h + + +函数 +==== + +该API在以下内核代码中: + +mm/damon/core.c diff --git a/Documentation/translations/zh_CN/mm/damon/design.rst b/Documentation/translations/zh_CN/mm/damon/design.rst new file mode 100644 index 000000000000..16e3db34a7dd --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/design.rst @@ -0,0 +1,140 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/design.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +==== +设计 +==== + +可配置的层 +========== + +DAMON提供了数据访问监控功能,同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间 +并为之优化的基元。另一方面,作为DAMON的核心,准确性和开销的权衡机制是在纯逻辑空间中。DAMON +将这两部分分离在不同的层中,并定义了它的接口,以允许各种低层次的基元实现与核心逻辑的配置。 + +由于这种分离的设计和可配置的接口,用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的 +任何地址空间。如果没有提供合适的,用户可以自己实现基元。 + +例如,物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。 +另外,如果某些架构或设备支持特殊的优化访问检查基元,这些基元将很容易被配置。 + + +特定地址空间基元的参考实现 +========================== + +基本访问监测的低级基元被定义为两部分。: + +1. 确定地址空间的监测目标地址范围 +2. 
目标空间中特定地址范围的访问检查。 + +DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。 + + +基于VMA的目标地址范围构造 +------------------------- + +这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间,只是要求用户手动设置监控目标地址范围。 + +在进程的超级巨大的虚拟地址空间中,只有小部分被映射到物理内存并被访问。因此,跟踪未映射的地 +址区域只是一种浪费。然而,由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声,所以严 +格来说,跟踪每一个映射并不是必须的,但在某些情况下甚至会产生很高的开销。也就是说,监测目标 +内部过于巨大的未映射区域应该被移除,以不占用自适应机制的时间。 + +出于这个原因,这个实现将复杂的映射转换为三个不同的区域,覆盖地址空间的每个映射区域。这三个 +区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面 +的mmap()区域之间的间隙,以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙 +在通常的地址空间中是异常巨大的,排除这些间隙就足以做出合理的权衡。下面详细说明了这一点:: + + + + + (small mmap()-ed regions and munmap()-ed regions) + + + + + +基于PTE访问位的访问检查 +----------------------- + +物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中 +找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表,而物理地址的实现则 +是查找与该地址有映射关系的每一个页表。通过这种方式,实现者找到并清除下一个采样目标地址的位, +并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统,即空闲页跟 +踪和回收逻辑。为了避免这种干扰,DAMON使其与空闲页面跟踪相互排斥,并使用 ``PG_idle`` 和 +``PG_young`` 页面标志来解决与回收逻辑的冲突,就像空闲页面跟踪那样。 + + +独立于地址空间的核心机制 +======================== + +下面四个部分分别描述了DAMON的核心机制和五个监测属性,即 ``采样间隔`` 、 ``聚集间隔`` 、 +``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。 + + +访问频率监测 +------------ + +DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置 +``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说,DAMON检查每个 ``采样间隔`` 对每 +个页面的访问,并将结果汇总。换句话说,计算每个页面的访问次数。在每个 ``聚合间隔`` 过 +去后,DAMON调用先前由用户注册的回调函数,以便用户可以阅读聚合的结果,然后再清除这些结 +果。这可以用以下简单的伪代码来描述:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +这种机制的监测开销将随着目标工作负载规模的增长而任意增加。 + + +基于区域的抽样调查 +------------------ + +为了避免开销的无限制增加,DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持 +这个假设(一个区域内的页面具有相同的访问频率),该区域内就只需要检查一个页面。因此,对 +于每个 ``采样间隔`` ,DAMON在每个区域中随机挑选一个页面,等待一个 ``采样间隔`` ,检 +查该页面是否同时被访问,如果被访问则增加该区域的访问频率。因此,监测开销是可以通过设置 +区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。 + +然而,如果假设没有得到保证,这个方案就不能保持输出的质量。 + + +适应性区域调整 +-------------- + +即使最初的监测目标区域被很好地构建以满足假设(同一区域内的页面具有相似的访问频率),数 +据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设,DAMON根据每个 +区域的访问频率自适应地进行合并和拆分。 + +对于每个 ``聚集区间`` ,它比较相邻区域的访问频率,如果频率差异较小,就合并这些区域。 +然后,在它报告并清除每个区域的聚合接入频率后,如果区域总数不超过用户指定的最大区域数, +它将每个区域拆分为两个或三个区域。 + +通过这种方式,DAMON提供了其最佳的质量和最小的开销,同时保持了用户为其权衡设定的界限。 + + +动态目标空间更新处理 +-------------------- + +监测目标地址范围可以动态改变。例如,虚拟内存可以动态地被映射和解映射。物理内存可以被 +热插拔。 + +由于在某些情况下变化可能相当频繁,DAMON允许监控操作检查动态变化,包括内存映射变化, +并仅在用户指定的时间间隔( ``更新间隔`` )中的每个时间段,将其应用于监控操作相关的 +数据结构,如抽象的监控目标内存区。 \ No newline at end of file diff --git a/Documentation/translations/zh_CN/mm/damon/faq.rst b/Documentation/translations/zh_CN/mm/damon/faq.rst new file mode 100644 index 000000000000..de4be417494a --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/faq.rst @@ -0,0 +1,48 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/faq.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +常见问题 +======== + +为什么是一个新的子系统,而不是扩展perf或其他用户空间工具? +========================================================== + +首先,因为它需要尽可能的轻量级,以便可以在线使用,所以应该避免任何不必要的开销,如内核-用户 +空间的上下文切换成本。第二,DAMON的目标是被包括内核在内的其他程序所使用。因此,对特定工具 +(如perf)的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。 + + +“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗? +============================================== + +闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的,尽管它可以 +使用采样来减少开销。另一方面,DAMON是一个更高层次的框架,用于监控各种地址空间。它专注于内 +存管理优化,并提供复杂的精度/开销处理机制。因此,“空闲页面跟踪” 和 “perf mem” 可以提供 +DAMON输出的一个子集,但不能替代DAMON。 + + +DAMON是否只支持虚拟内存? 
+=========================
+
+不,DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始部分,包括监测目标区域的构造和实际的访问检查。通过这种方式,DAMON用户可以用任何访问检查技术来监测任何地址空间。
+
+尽管如此,DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间相关功能的实现,以供参考和方便使用。
+
+
+我可以简单地监测页面的粒度吗?
+==============================
+
+是的,你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。因为监视目标区域的大小被强制为 ``>=page size`` ,所以区域分割不会产生任何影响。
diff --git a/Documentation/translations/zh_CN/mm/damon/index.rst b/Documentation/translations/zh_CN/mm/damon/index.rst
new file mode 100644
index 000000000000..b03bf307204f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/index.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+
+==========================
+DAMON:数据访问监视器
+==========================
+
+DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制(详见 Documentation/translations/zh_CN/mm/damon/design.rst)使其具有:
+
+ - *准确度* (监测输出对DRAM级别的内存管理足够有用;但可能不适合CPU Cache级别),
+ - *轻量级* (监控开销低到可以在线应用),以及
+ - *可扩展* (无论目标工作负载的大小,开销的上限值都在恒定范围内)。
+
+因此,利用这个框架,内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实验性内存管理优化工作可以再次进行。同时,在用户空间,有一些特殊工作负载的用户可以编写个性化的应用程序,以便更好地了解和优化他们的工作负载和系统。
+
+.. toctree::
+   :maxdepth: 2
+
+   faq
+   design
+   api
diff --git a/Documentation/translations/zh_CN/mm/free_page_reporting.rst b/Documentation/translations/zh_CN/mm/free_page_reporting.rst
new file mode 100644
index 000000000000..83b14cce9adf
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/free_page_reporting.rst
@@ -0,0 +1,38 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/free_page_reporting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+==========
+空闲页报告
+==========
+
+空闲页报告是一个API,设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟化的情况下是很有用的,客户机能够使用这些数据来通知管理器它不再使用内存中的某些页面。
+
+对于驱动,通常是气球驱动要使用这个功能,它将分配和初始化一个page_reporting_dev_info结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。假设没有其他页面报告设备已经注册,对page_reporting_register的调用将向报告框架注册页面报告接口。
+
+一旦注册,页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告页面,并在任何足够高阶的页面被释放之后2秒继续报告。
+
+报告的页面将被存储在传递给报告函数的散点列表中,最后一个条目的结束位被设置在条目nent-1中。当页面被报告函数处理时,分配器将无法访问它们。一旦报告函数完成,这些页将被返回到它们所获得的空闲区域。
+
+在移除使用空闲页报告的驱动之前,有必要调用page_reporting_unregister,以移除目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报告通过该接口发出。如果另一个驱动或同一驱动被注册,它就有可能恢复前一个驱动在报告空闲页方面的工作。
+
+
+Alexander Duyck, 2019年12月04日
diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst
new file mode 100644
index 000000000000..5c18ea2be04f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/frontswap.rst
@@ -0,0 +1,196 @@
+:Original: Documentation/mm/frontswap.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+=========
+Frontswap
+=========
+
+Frontswap为交换页提供了一个 "transcendent memory" 的接口。在一些环境中,由于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能节省(提高)。
+
+..
_Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 +储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory +(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 +求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 +调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops +的功能,它提供的功能必须符合某些策略,如下所示: + +一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap +交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 +量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 +内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 +transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 +相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 + +一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 +要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 +经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, +也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 +可以像往常一样被写入交换空间。 + +请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” +的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 +不会从frontswap中获得。 + +如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 +debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): + +``failed_stores`` + 有多少次存储的尝试是失败的 + +``loads`` + 尝试了多少次加载(应该全部成功) + +``succ_stores`` + 有多少次存储的尝试是成功的 + +``invalidates`` + 尝试了多少次作废 + +后台实现可以提供额外的指标。 + +经常问到的问题 +============== + +* 价值在哪里? + +当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 +读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 +能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 +移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 +页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 + +Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 +灵活性: + +在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 +全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 +用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 +工作负载上,则有明显的性能改善(25%以上)。 + +“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory +的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 +统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 +交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 +可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 +有多少内存可用 + +在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 +用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 +是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory +后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, +而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 +的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap +允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), +从而减轻计划外交换可能带来的可怕的性能影响。 + +一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 +内存扩展技术的调查也在进行中。 + +* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? + +如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 +个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 +frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 +是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 +后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 +忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 +的,无论如何使用一小部分的 CPU 都是不相关的。 + +至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 +每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 +页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 +会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 +非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 + +当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 +产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 +来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 + +* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? 
+ +我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 +明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 +供了多少内存是完全动态和随机的。 + +每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 +型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 +换页的尝试。 + +每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 +frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 +间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap +backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 +页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 +据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 +下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 +页面偏移量,否则它就会将数据写入该设备。 + +当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), +检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 +的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, +以便从真正的交换设备上获得这一页的数据。 + +所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 +入都被 “frontswap backend store” 和(可能)“frontswap backend loads” +所取代,这可能会快得多。 + +* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 + 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? + +首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 +结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 +假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 +及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 + +例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend +的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 +先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 +以根据当前的内存限制动态地定义。 + +此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 +块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 +不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 +态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 +复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 +的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” +线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, +KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 + +在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 +例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 +自己的交换。 + +transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 +能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, +frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 +容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 +swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap +就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 +backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 + + +* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, + 难道它不能总是被成功地覆盖吗? + +几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 +缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 +backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap +拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 +换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 + +* 为什么frontswap补丁会创建新的头文件swapfile.h? + +frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 +在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 +个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 + +Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/mm/highmem.rst b/Documentation/translations/zh_CN/mm/highmem.rst new file mode 100644 index 000000000000..81202c65e000 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/highmem.rst @@ -0,0 +1,128 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/highmem.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +========== +高内存处理 +========== + +作者: Peter Zijlstra + +.. contents:: :local: + +高内存是什么? 
+============== + +当物理内存的大小接近或超过虚拟内存的最大大小时,就会使用高内存(highmem)。在这一点上,内 +核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内 +存的临时映射。 + +没有被永久映射覆盖的那部分(物理)内存就是我们所说的 "高内存"。对于这个边界的确切位置,有 +各种架构上的限制。 + +例如,在i386架构中,我们选择将内核映射到每个进程的虚拟空间,这样我们就不必为内核的进入/退 +出付出全部的TLB作废代价。这意味着可用的虚拟内存空间(i386上为4GiB)必须在用户和内核空间之 +间进行划分。 + +使用这种方法的架构的传统分配方式是3:1,3GiB用于用户空间,顶部的1GiB用于内核空间。:: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +这意味着内核在任何时候最多可以映射1GiB的物理内存,但是由于我们需要虚拟地址空间来做其他事 +情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少(通常在~896MiB左右)。 + +其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而,一些硬件(如一些ARM)在使 +用mm上下文标签时,其虚拟空间有限。 + + +临时虚拟映射 +============ + +内核包含几种创建临时映射的方法。: + +* vmap(). 这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization + 来解除映射。 + +* kmap(). 这允许对单个页面进行短期映射。它需要synchronization,但在一定程度上被摊销。 + 当以嵌套方式使用时,它也很容易出现死锁,因此不建议在新代码中使用它。 + +* kmap_atomic(). 这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上, + 它表现得很好,但发布任务因此被要求留在该CPU上直到它完成,以免其他任务取代它的映射。 + + kmap_atomic() 也可以由中断上下文使用,因为它不睡眠,而且调用者可能在调用kunmap_atomic() + 之后才睡眠。 + + 可以假设k[un]map_atomic()不会失败。 + + +使用kmap_atomic +=============== + +何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存(见__GFP_HIGHMEM) +分配的页面的内容时,例如在页缓存中的页面,就会使用它。该API有两个函数,它们的使用方式与 +下面类似:: + + /* 找到感兴趣的页面。 */ + struct page *page = find_get_page(mapping, offset); + + /* 获得对该页内容的访问权。 */ + void *vaddr = kmap_atomic(page); + + /* 对该页的内容做一些处理。 */ + memset(vaddr, 0, PAGE_SIZE); + + /* 解除该页面的映射。 */ + kunmap_atomic(vaddr); + +注意,kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。 + +如果你需要映射两个页面,因为你想从一个页面复制到另一个页面,你需要保持kmap_atomic调用严 +格嵌套,如:: + + vaddr1 = kmap_atomic(page1); + vaddr2 = kmap_atomic(page2); + + memcpy(vaddr1, vaddr2, PAGE_SIZE); + + kunmap_atomic(vaddr2); + kunmap_atomic(vaddr1); + + +临时映射的成本 +============== + +创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。 + +如果CONFIG_HIGHMEM没有被设置,那么内核会尝试用一点计算来创建映射,将页面结构地址转换成 +指向页面内容的指针,而不是去捣鼓映射。在这种情况下,解映射操作可能是一个空操作。 + +如果CONFIG_MMU没有被设置,那么就不可能有临时映射和高内存。在这种情况下,也将使用计算方法。 + + +i386 PAE +======== + +在某些情况下,i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果: + +* Linux需要为系统中的每个页面建立一个页帧结构,而且页帧需要驻在永久映射中,这意味着: + +* 你最多可以有896M/sizeof(struct page)页帧;由于页结构体是32字节的,所以最终会有 + 112G的页;然而,内核需要在内存中存储更多的页帧...... + +* PAE使你的页表变大--这使系统变慢,因为更多的数据需要在TLB填充等方面被访问。一个好处 + 是,PAE有更多的PTE位,可以提供像NX和PAT这样的高级功能。 + +一般的建议是,你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作 +量有用,但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。 diff --git a/Documentation/translations/zh_CN/mm/hmm.rst b/Documentation/translations/zh_CN/mm/hmm.rst new file mode 100644 index 000000000000..5024a8a15516 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hmm.rst @@ -0,0 +1,361 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/hmm.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +================== +异构内存管理 (HMM) +================== + +提供基础设施和帮助程序以将非常规内存(设备内存,如板上 GPU 内存)集成到常规内核路径中,其 +基石是此类内存的专用struct page(请参阅本文档的第 5 至 7 节)。 + +HMM 还为 SVM(共享虚拟内存)提供了可选的帮助程序,即允许设备透明地访问与 CPU 一致的程序 +地址,这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得 +必不可少,其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。 + +本文档分为以下部分:在第一部分中,我揭示了与使用特定于设备的内存分配器相关的问题。在第二 +部分中,我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页 +表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后, +最后一节介绍了一个新的迁移助手,它允许利用设备 DMA 引擎。 + +.. 
contents:: :local: + +使用特定于设备的内存分配器的问题 +================================ + +具有大量板载内存(几 GB)的设备(如 GPU)历来通过专用驱动程序特定 API 管理其内存。这会 +造成设备驱动程序分配和管理的内存与常规应用程序内存(私有匿名、共享内存或常规文件支持内存) +之间的隔断。从这里开始,我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况: +即,设备可以透明地使用任何应用程序内存区域。 + +分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来 +看,程序中的所有内存对象并不平等,这使得依赖于广泛的库的大型程序变得复杂。 + +具体来说,这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存(malloc、mmap +私有、mmap 共享)和通过设备驱动程序 API 分配的内存之间复制对象(这仍然以 mmap 结束, +但是是设备文件)。 + +对于平面数据集(数组、网格、图像……),这并不难实现,但对于复杂数据集(列表、树……), +很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错, +而且由于数据集和地址的重复,程序更难调试。 + +分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据,因此每个库可能 +不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响,并因为各种内存 +拷贝而浪费资源。 + +复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出,并不是一个可行的选择。 +这将导致库入口点的组合爆炸。 + +最后,随着高级语言结构(在 C++ 中,当然也在其他语言中)的进步,编译器现在有可能在没有程 +序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有 +其他模式,使用共享地址空间也更合理。 + + +I/O 总线、设备内存特性 +====================== + +由于一些限制,I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本 +内存访问;甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下,它 +不是缓存一致的。 + +如果我们只考虑 PCIE 总线,那么设备可以访问主内存(通常通过 IOMMU)并与 CPU 缓存一 +致。但是,它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟:CPU +只能访问有限范围的设备内存,而不能对其执行原子操作。因此,从内核的角度来看,设备内存不 +能被视为与常规内存等同。 + +另一个严重的因素是带宽有限(约 32GBytes/s,PCIE 4.0 和 16 通道)。这比最快的 GPU +内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自 +己的内存时高一个数量级。 + +一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制 +(OpenCAPI、CCIX)。它们主要允许 CPU 和设备之间的双向缓存一致性,并允许架构支持的所 +有原子操作。遗憾的是,并非所有平台都遵循这一趋势,并且一些主要架构没有针对这些问题的硬 +件解决方案。 + +因此,为了使共享地址空间有意义,我们不仅必须允许设备访问任何内存,而且还必须允许任何内 +存在设备使用时迁移到设备内存(在迁移时阻止 CPU 访问)。 + + +共享地址空间和迁移 +================== + +HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间,因此对 +于进程地址空间中的任何有效主内存地址,相同的地址指向相同的物理内存。 + +为了实现这一点,HMM 提供了一组帮助程序来填充设备页表,同时跟踪 CPU 页表更新。设备页表 +更新不像 CPU 页表更新那么容易。要更新设备页表,您必须分配一个缓冲区(或使用预先分配的 +缓冲区池)并在其中写入 GPU 特定命令以执行更新(取消映射、缓存失效和刷新等)。这不能通 +过所有设备的通用代码来完成。因此,为什么HMM提供了帮助器,在把硬件的具体细节留给设备驱 +动程序的同时,把一切可以考虑的因素都考虑进去了。 + +HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存,它允许为设备内存的每个页面分配一个 +struct page。这些页面很特殊,因为 CPU 无法映射它们。然而,它们允许使用现有的迁移机 +制将主内存迁移到设备内存,从 CPU 的角度来看,一切看起来都像是换出到磁盘的页面。使用 +struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次,HMM 仅提供帮助程序, +首先为设备内存热插拔新的 ZONE_DEVICE 内存,然后执行迁移。迁移内容和时间的策略决定留 +给设备驱动程序。 + +请注意,任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如,当支持给定CPU +地址 A 的页面从主内存页面迁移到设备页面时,对地址 A 的任何 CPU 访问都会触发缺页异常 +并启动向主内存的迁移。 + +凭借这两个特性,HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步,而且还通 +过迁移设备正在使用的数据集部分来利用设备内存。 + + +地址空间镜像实现和API +===================== + +地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中;HMM 有助于 +保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier +开始:: + + int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops); + +在 ops->invalidate() 回调期间,设备驱动程序必须对范围执行更新操作(将范围标记为只 +读,或完全取消映射等)。设备必须在驱动程序回调返回之前完成更新。 + +当设备驱动程序想要填充一个虚拟地址范围时,它可以使用:: + + int hmm_range_fault(struct hmm_range *range); + +如果请求写访问,它将在丢失或只读条目上触发缺页异常(见下文)。缺页异常使用通用的 mm 缺 +页异常代码路径,就像 CPU 缺页异常一样。 + +这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟 +范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。 + +在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面,以保 +持事物正确同步。使用模式是:: + + int driver_populate_range(...) + { + struct hmm_range range; + ... 
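+        /* interval_sub 是先前通过 mmu_interval_notifier_insert() 注册的区间通知器 */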
+ + range.notifier = &interval_sub; + range.start = ...; + range.end = ...; + range.hmm_pfns = ...; + + if (!mmget_not_zero(interval_sub->notifier.mm)) + return -EFAULT; + + again: + range.notifier_seq = mmu_interval_read_begin(&interval_sub); + mmap_read_lock(mm); + ret = hmm_range_fault(&range); + if (ret) { + mmap_read_unlock(mm); + if (ret == -EBUSY) + goto again; + return ret; + } + mmap_read_unlock(mm); + + take_lock(driver->update); + if (mmu_interval_read_retry(&ni, range.notifier_seq) { + release_lock(driver->update); + goto again; + } + + /* Use pfns array content to update device page table, + * under the update lock */ + + release_lock(driver->update); + return 0; + } + +driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用 +mmu_interval_read_retry() 之前保持,以避免与并发 CPU 页表更新发生任何竞争。 + +利用 default_flags 和 pfn_flags_mask +==================================== + +hmm_range 结构有 2 个字段,default_flags 和 pfn_flags_mask,它们指定整个范围 +的故障或快照策略,而不必为 pfns 数组中的每个条目设置它们。 + +例如,如果设备驱动程序需要至少具有读取权限的范围的页面,它会设置:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = 0; + +并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。 + +现在假设驱动程序想要做同样的事情,除了它想要拥有写权限的范围内的一页。现在驱动程序设 +置:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = HMM_PFN_REQ_WRITE; + range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; + +有了这个,HMM 将在至少读取(即有效)的所有页面中异常,并且对于地址 +== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限,即,如果 +CPU pte 没有设置写权限,那么HMM将调用handle_mm_fault()。 + +hmm_range_fault 完成后,标志位被设置为页表的当前状态,即 HMM_PFN_VALID | 如果页 +面可写,将设置 HMM_PFN_WRITE。 + + +从核心内核的角度表示和管理设备内存 +================================== + +尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存 +的信息,HMM 将自身挂接到 mm 代码的各个位置,以处理对设备内存支持的地址的任何访问。 +事实证明,这最终复制了 struct page 的大部分字段,并且还需要更新许多内核代码路径才 +能理解这种新的内存类型。 + +大多数内核代码路径从不尝试访问页面后面的内存,而只关心struct page的内容。正因为如此, +HMM 切换到直接使用 struct page 用于设备内存,这使得大多数内核代码路径不知道差异。 +我们只需要确保没有人试图从 CPU 端映射这些页面。 + +移入和移出设备内存 +================== + +由于 CPU 无法直接访问设备内存,因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存 +储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和 +migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。 + +在将页面迁移到设备私有内存之前,需要创建特殊的设备私有 ``struct page`` 。这些将用 +作特殊的“交换”页表条目,以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。 + +这些可以通过以下方式分配和释放:: + + struct resource *res; + struct dev_pagemap pagemap; + + res = request_free_mem_region(&iomem_resource, /* number of bytes */, + "name of driver resource"); + pagemap.type = MEMORY_DEVICE_PRIVATE; + pagemap.range.start = res->start; + pagemap.range.end = res->end; + pagemap.nr_range = 1; + pagemap.ops = &device_devmem_ops; + memremap_pages(&pagemap, numa_node_id()); + + memunmap_pages(&pagemap); + release_mem_region(pagemap.range.start, range_len(&pagemap.range)); + +还有devm_request_free_mem_region(), devm_memremap_pages(), +devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``. + +整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration `) , +但这些步骤分为设备驱动程序特定代码和共享公共代码: + +1. ``mmap_read_lock()`` + + 设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup(), + 因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。 + +2. 
``migrate_vma_setup(struct migrate_vma *args)`` + + 设备驱动初始化了 ``struct migrate_vma`` 的字段,并将该指针传递给 + migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。 + 例如,设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存,设置 + ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页 + 面。如果后者被设置, ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备 + 私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前,只有匿名的私有VMA + 范围可以被迁移到系统内存和设备私有内存。 + + migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()`` + 和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表,使 + 其他设备的MMU无效,以便在 ``args->src`` 数组中填写要迁移的PFN。 + ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` , + 其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE, ``owner`` 字段设置为传递给 + migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无 + 效化回调,只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。 + + + 在遍历页表时,一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效 + 的 “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清 + 除它,而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被 + ``lock_page()``锁定,与LRU隔离(如果系统内存和设备私有页不在LRU上),从进 + 程中取消映射,并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup() + 还清除了 ``args->dst`` 数组。 + +3. 设备驱动程序分配目标页面并将源页面复制到目标页面。 + + 驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已 + 设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选 + 择跳过页面迁移。 + + 然后,驱动程序分配一个设备私有 struct page 或一个系统内存页,用 ``lock_page()`` + 锁定该页,并将 ``dst`` 数组条目填入:: + + dst[i] = migrate_pfn(page_to_pfn(dpage)); + + 现在驱动程序知道这个页面正在被迁移,它可以使设备私有 MMU 映射无效并将设备私有 + 内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失 + 效,因此设备驱动程序只需使其自己的 MMU 映射失效。 + + 驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的 + ``struct page`` 面,并将源页面复制到目标设备上,如果指针为 ``NULL`` ,意 + 味着源页面没有被填充到系统内存中,则清除目标设备的私有内存。 + +4. ``migrate_vma_pages()`` + + 这一步是实际“提交”迁移的地方。 + + 如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页,这时新分配的页会被插 + 入到CPU的页表中。如果一个CPU线程在同一页面上发生异常,这可能会失败。然而,页 + 表被锁定,只有一个新页会被插入。如果它失去了竞争,设备驱动将看到 + ``MIGRATE_PFN_MIGRATE`` 位被清除。 + + 如果源页被锁定、隔离等,源 ``struct page`` 信息现在被复制到目标 + ``struct page`` ,最终完成CPU端的迁移。 + +5. 设备驱动为仍在迁移的页面更新设备MMU页表,回滚未迁移的页面。 + + 如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置,设备驱动可以 + 更新设备MMU,如果 ``MIGRATE_PFN_WRITE`` 位被设置,则设置写启用位。 + +6. ``migrate_vma_finalize()`` + + 这一步用新页的页表项替换特殊的迁移页表项,并释放对源和目的 ``struct page`` + 的引用。 + +7. ``mmap_read_unlock()`` + + 现在可以释放锁了。 + +独占访问存储器 +============== + +一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一 +个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU +的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一 +个内存范围不能从用户空间访问。 + +这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会 +导致一个异常,该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已 +经被MMU通知器改变,之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序 +放弃页面锁和页面引用为止,这时页面上的任何CPU异常都可以按所述进行。 + +内存 cgroup (memcg) 和 rss 统计 +=============================== + +目前,设备内存被视为 rss 计数器中的任何常规页面(如果设备页面用于匿名,则为匿名, +如果设备页面用于文件支持页面,则为文件,如果设备页面用于共享内存,则为 shmem)。 +这是为了保持现有应用程序的故意选择,这些应用程序可能在不知情的情况下开始使用设备 +内存,运行不受影响。 + +一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序, +因此不会释放太多系统内存。在决定以不同方式计算设备内存之前,我们希望收集更多关 +于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。 + +对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算,常规 +页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规 +内存不会失败,因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对 +内存资源控制的影响有了更多的了解,我们可能会在后面重新考虑这个选择。 + +请注意,设备内存永远不能由设备驱动程序或通过 GUP 固定,因此此类内存在进程退出时 +总是被释放的。或者在共享内存或文件支持内存的情况下,当删除最后一个引用时。 diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst new file mode 100644 index 000000000000..752e5696cd47 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -0,0 +1,436 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/hugetlbfs_reserv.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +============== +Hugetlbfs 预留 +============== + +概述 +==== + +:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指 +示要使用巨页,这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常 +时没有巨页存在,任务就会被发送一个SIGBUS,并经常不高兴地死去。在加入巨页支 +持后不久,人们决定,在mmap()时检测巨页的短缺情况会更好。这个想法是,如果 +没有足够的巨页来覆盖映射,mmap()将失败。这首先是在mmap()时在代码中做一个 +简单的检查,以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一 +样,代码随着时间的推移而不断发展。然而,基本的想法是在mmap()时 “预留” +巨页,以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核 +中是如何进行巨页预留处理的。 + + +读者 +==== +这个描述主要是针对正在修改hugetlbfs代码的内核开发者。 + + +数据结构 +======== + +resv_huge_pages + 这是一个全局的(per-hstate)预留的巨页的计数。预留的巨页只对预留它们的任 + 务可用。因此,一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。 +Reserve Map + 预留映射由以下结构体描述:: + + struct resv_map { + struct kref refs; + spinlock_t lock; + struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; + }; + + 系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的 + 区域。一个区域被描述为:: + + struct file_region { + struct list_head link; + long from; + long to; + }; + + file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型,在 + reserv_map 中的一个区域可能表示该范围存在预留,或预留不存在。 +Flags for MAP_PRIVATE Reservations + 这些被存储在预留的映射指针的底部。 + + ``#define HPAGE_RESV_OWNER (1UL << 0)`` + 表示该任务是与该映射相关的预留的所有者。 + ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` + 表示最初映射此范围(并创建储备)的任务由于COW失败而从该任务(子任务)中取消映 + 射了一个页面。 +Page Flags + PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在 + “释放巨页” 一节中讨论。 + + +预留映射位置(私有或共享) +========================== + +一个巨页映射或段要么是私有的,要么是共享的。如果是私有的,它通常只对一个地址空间 +(任务)可用。如果是共享的,它可以被映射到多个地址空间(任务)。对于这两种类型的映射, +预留映射的位置和语义是明显不同的。位置的差异是: + +- 对于私有映射,预留映射挂在VMA结构体上。具体来说,就是vma->vm_private_data。这个保 + 留映射是在创建映射(mmap(MAP_PRIVATE))时创建的。 +- 对于共享映射,预留映射挂在inode上。具体来说,就是inode->i_mapping->private_data。 + 由于共享映射总是由hugetlbfs文件系统中的文件支持,hugetlbfs代码确保每个节点包含一个预 + 留映射。因此,预留映射在创建节点时被分配。 + + +创建预留 +======== +当创建一个巨大的有页面支持的共享内存段(shmget(SHM_HUGETLB))或通过mmap(MAP_HUGETLB) +创建一个映射时,就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用:: + + int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) + +hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE +标志。如果指定了NORESERVE,那么这个函数立即返回,因为不需要预留。 + +参数'from'和'to'是映射或基础文件的巨页索引。对于shmget(),'from'总是0,'to'对应于段/映射 +的长度。对于mmap(),offset参数可以用来指定进入底层文件的偏移量。在这种情况下,'from'和'to' +参数已经被这个偏移量所调整。 + +PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。 + +- 对于共享映射,预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时,预留映射不被 + 修改。 +- 对于私有映射,预留映射中没有条目表示相应页面存在预留。随着预留被消耗,条目被添加到预留映射中。 + 因此,预留映射也可用于确定哪些预留已被消耗。 + +对于私有映射,hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外, +HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 + +预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射,这始终是一个值(to - from)。 +然而,对于共享映射来说,一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节, +请参见 :ref:`预留映射的修改 ` 一节。 + +该映射可能与一个子池(subpool)相关联。如果是这样,将查询子池以确保有足够的空间用于映射。子池 +有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 ` +一节。 + +在咨询了预留映射和子池之后,就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查 +并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而,在这 +些函数中,代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话,全局预留计数resv_huge_pages +会被调整,如下所示:: + + if (resv_needed <= (resv_huge_pages - free_huge_pages)) + resv_huge_pages += resv_needed; + +注意,在检查和调整这些计数器时,全局锁hugetlb_lock会被预留。 + +如果有足够的空闲的巨页,并且全局计数resv_huge_pages被调整,那么与映射相关的预留映射被修改以 +反映预留。在共享映射的情况下,将存在一个file_region,包括'from'-'to'范围。对于私有映射, +不对预留映射进行修改,因为没有条目表示存在预留。 + +如果hugetlb_reserve_pages()成功,全局预留数和与映射相关的预留映射将根据需要被修改,以确保 +在'from'-'to'范围内存在预留。 + +消耗预留/分配一个巨页 +=========================== + +当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() +中进行的:: + + struct 
page *alloc_huge_page(struct vm_area_struct *vma,
+                                unsigned long addr, int avoid_reserve)
+
+alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额外拷贝被分配。
+
+调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详细内容,请参见 :ref:`预留映射帮助函数 ` 一节。从vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留,则为0,如果不存在预留,则为1。如果不存在预留,并且有一个与映射相关联的子池,则查询子池以确定它是否包含预留。如果子池包含预留,则可将其中一个用于该分配。然而,在任何情况下,avoid_reserve参数都会优先考虑为分配使用预留。在确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关的参数:
+
+- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。
+- chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0,则表明存在预留(关于可能的问题,请参见 "预留和内存策略" 一节)。如果值为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。
+
+与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面,当该页面从空闲列表中移除时,free_huge_pages的值被递减。如果有一个与该页相关的预留,将进行以下调整::
+
+   SetPagePrivate(page); /* 表示分配这个页面消耗了一个预留,
+                          * 如果遇到错误,以至于必须释放这个页面,预留将被
+                          * 恢复。 */
+   resv_huge_pages--;    /* 减少全局预留计数 */
+
+注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整:SetPagePrivate(page) 和 resv_huge_pages--.
+
+在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页面被释放时,这将被用于子池的计数。
+
+然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建一个新的条目。
+
+在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来识别。如果检测到这种竞争,子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息,请参见 :ref:`预留映射帮助函数 ` 一节。
+
+
+实例化巨页
+==========
+
+在巨页分配之后,页面通常被添加到分配任务的页表中。在此之前,共享映射中的页面被添加到页面缓存中,私有映射中的页面被添加到匿名反向映射中。在这两种情况下,PagePrivate标志被清除。因此,当一个已经实例化的巨页被释放时,不会对全局预留计数(resv_huge_pages)进行调整。
+
+
+释放巨页
+========
+
+巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。
+
+page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置,它表明全局预留计数应该被调整(关于如何设置这些标志的信息,请参见 :ref: `消耗预留/分配一个巨页 ` )。
+
+该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值(不等于传递的1的值),它表明预留与子池相关联,这个新释放的页面必须被用来保持子池预留的数量超过最小值。因此,在这种情况下,全局resv_huge_pages计数器被递增。
+
+如果页面中设置了PagePrivate标志,那么全局resv_huge_pages计数器将永远被递增。
+
+
+子池预留
+========
+
+有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一个hstate中的页面子集,它与一个已挂载的hugetlbfs文件系统相关。
+
+当一个hugetlbfs文件系统被挂载时,可以指定min_size选项,它表示文件系统所需的最小的巨页数量。如果指定了这个选项,与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体hugepage_subpool的min_hpages字段中被跟踪。在挂载时,hugetlb_acct_memory(min_hpages)被调用以预留指定数量的巨页。如果它们不能被预留,挂载就会失败。
+
+当从子池中获取或释放页面时,会调用hugepage_subpool_get/put_pages()函数。hugepage_subpool_get/put_pages被传递给巨页数量,以此来调整子池的 "已用页面" 计数(get为下降,put为上升)。通常情况下,如果子池中没有足够的页面,它们会返回与传递的相同的值或一个错误。
+
+然而,如果预留与子池相关联,可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局池调整的数量。例如,假设一个子池包含3个预留的巨页,有人要求5个。与子池相关的3个预留页可以用来满足部分请求。但是,必须从全局池中获得2个页面。为了向调用者转达这一信息,将返回值2。然后,调用者要负责从全局池中获取另外两个页面。
+
+
+COW和预留
+==========
+
+由于共享映射都指向并使用相同的底层页面,COW最大的预留问题是私有映射。在这种情况下,两个任务可以指向同一个先前分配的页面。一个任务试图写到该页,所以必须分配一个新的页,以便每个任务都指向它自己的页。
+当该页最初被分配时,该页的预留被消耗了。当由于COW而试图分配一个新的页面时,有可能没有空闲的巨 +页,分配会失败。 + +当最初创建私有映射时,通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。 +由于所有者创建了映射,所有者拥有与映射相关的所有预留。因此,当一个写异常发生并且没有可用的页面 +时,对预留的所有者和非所有者采取不同的行动。 + +在发生异常的任务不是所有者的情况下,异常将失败,该任务通常会收到一个SIGBUS。 + +如果所有者是发生异常的任务,我们希望它能够成功,因为它拥有原始的预留。为了达到这个目的,该页被 +从非所有者任务中解映射出来。这样一来,唯一的引用就是来自拥有者的任务。此外,HPAGE_RESV_UNMAPPED +位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常,它可能 +会收到一个SIGBUS。但是,映射/预留的原始拥有者的行为将与预期一致。 + +预留映射的修改 +============== + +以下低级函数用于对预留映射进行修改。通常情况下,这些函数不会被直接调用。而是调用一个预留映射辅 +助函数,该函数调用这些低级函数中的一个。这些低级函数在源代码(mm/hugetlb.c)中得到了相当好的 +记录。这些函数是:: + + long region_chg(struct resv_map *resv, long f, long t); + long region_add(struct resv_map *resv, long f, long t); + void region_abort(struct resv_map *resv, long f, long t); + long region_count(struct resv_map *resv, long f, long t); + +在预留映射上的操作通常涉及两个操作: + +1) region_chg()被调用来检查预留映射,并确定在指定的范围[f, t]内有多少页目前没有被代表。 + + 调用代码执行全局检查和分配,以确定是否有足够的巨页使操作成功。 + +2) + a) 如果操作能够成功,regi_add()将被调用,以实际修改先前传递给regi_chg()的相同范围 + [f, t]的预留映射。 + b) 如果操作不能成功,region_abort被调用,在相同的范围[f, t]内中止操作。 + +注意,这是一个两步的过程, region_add()和 region_abort()在事先调用 region_chg()后保证 +成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作(特别是 region_add())的 +成功。 + +如上所述,region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加 +到映射中的范围内的页数。在大多数情况下, region_add() 的返回值与 region_chg() 的返回值相 +同。然而,在共享映射的情况下,有可能在调用 region_chg() 和 region_add() 之间对预留映射进 +行更改。在这种情况下,regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下,全局计数 +和子池计数很可能是不正确的,需要调整。检查这种情况并进行适当的调整是调用者的责任。 + +函数region_del()被调用以从预留映射中移除区域。 +它通常在以下情况下被调用: + +- 当hugetlbfs文件系统中的一个文件被删除时,该节点将被释放,预留映射也被释放。在释放预留映射 + 之前,所有单独的file_region结构体必须被释放。在这种情况下,region_del的范围是[0, LONG_MAX]。 +- 当一个hugetlbfs文件正在被截断时。在这种情况下,所有在新文件大小之后分配的页面必须被释放。 + 此外,预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下,region_del + 的范围是[new_end_of_file, LONG_MAX]。 +- 当在一个hugetlbfs文件中打洞时。在这种情况下,巨页被一次次从文件的中间移除。当这些页被移除 + 时,region_del()被调用以从预留映射中移除相应的条目。在这种情况下,region_del被传递的范 + 围是[page_idx, page_idx + 1]。 + +在任何情况下,region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下,region_del() +会失败。这只能发生在打洞的情况下,即它必须分割一个现有的file_region条目,而不能分配一个新的 +结构体。在这种错误情况下,region_del()将返回-ENOMEM。这里的问题是,预留映射将显示对该页有 +预留。然而,子池和全局预留计数将不反映该预留。为了处理这种情况,调用函数hugetlb_fix_reserve_counts() +来调整计数器,使其与不能被删除的预留映射条目相对应。 + +region_count()在解除私有巨页映射时被调用。在私有映射中,预留映射中没有条目表明存在一个预留。 +因此,通过计算预留映射中的条目数,我们知道有多少预留被消耗了,有多少预留是未完成的 +(Outstanding = (end - start) - region_count(resv, start, end))。由于映射正在消 +失,子池和全局预留计数被未完成的预留数量所减去。 + +预留映射帮助函数 +================ + +有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣,所以它们只是传入一个 +地址而不是一个范围。此外,它们还传入相关的VMA。从VMA中,可以确定映射的类型(私有或共享)和预留 +映射的位置(inode或VMA)。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而, +它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义,并向调用者隐藏了这个细节:: + + long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +该函数为指定的页面调用 region_chg()。如果不存在预留,则返回1。如果存在预留,则返回0:: + + long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这将调用 region_add(),用于指定的页面。与region_chg和region_add的情况一样,该函数应在 +先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加,它将 +返回1,如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出 +现意外的差异,说明在两次调用之间修改了预留映射:: + + void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样,该函数应在 +先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作:: + + long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这是一个特殊的包装函数,有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数 +中调用。该函数与vma_needs_reservation一起使用,试图将一个预留添加到预留映射中。它考虑到 
+了私有和共享映射的不同预留映射语义。因此,region_add被调用用于共享映射(因为映射中的条目表 +示预留),而region_del被调用用于私有映射(因为映射中没有条目表示预留)。关于在错误路径上需 +要做什么的更多信息,请参见 “错误路径中的预留清理” 。 + + +错误路径中的预留清理 +==================== + +正如在:ref:`预留映射帮助函数` 一节中提到的,预留的修改分两步进行。首 +先,在分配页面之前调用vma_needs_reservation。如果分配成功,则调用vma_commit_reservation。 +如果不是,则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整, +一切都很好。 + +此外,在一个巨页被实例化后,PagePrivate标志被清空,这样,当页面最终被释放时,计数是 +正确的。 + +然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, +页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 +(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 +显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, +并阻止分配一个预先分配的页面。 + +函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的 +是将预留映射恢复到页面分配前的状态。通过这种方式,预留映射的状态将与页面释放后的全局预留计 +数相对应。 + +函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下, +它将简单地清除该页的PagePrivate标志。这样一来,当页面被释放时,全局预留计数将不会被递增。 +然而,预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址,但它不会像最 +初设想的那样使用一个预留页。 + +有一些代码(最明显的是userfaultfd)不能调用restore_reserve_on_error。在这种情况下, +它简单地修改了PagePrivate,以便在释放巨页时不会泄露预留。 + + +预留和内存策略 +============== +当git第一次被用来管理Linux代码时,每个节点的巨页列表就存在于hstate结构中。预留的概念是 +在一段时间后加入的。当预留被添加时,没有尝试将内存策略考虑在内。虽然cpusets与内存策略不 +完全相同,但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作 +用:: + + + /* + * 当cpuset被配置时,它打破了严格的hugetlb页面预留,因为计数是在一个全局变量上完 + * 成的。在有cpuset的情况下,这样的预留完全是垃圾,因为预留没有根据当前cpuset的 + * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时,应用程序仍然有可能 + * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的(或者说太难看了),因 + * 为cpuset太不稳定了,任务或内存节点可以在cpuset之间动态移动。与cpuset共享 + * hugetlb映射的语义变化是不可取的。然而,为了预留一些语义,我们退回到检查当前空闲 + * 页的可用性,作为一种最好的尝试,希望能将cpuset改变语义的影响降到最低。 + */ + +添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败(OOM)。然而,如果一个应用 +程序使用cpusets或内存策略,就不能保证在所需的节点上有巨页可用。即使有足够数量的全局 +预留,也是如此。 + +Hugetlbfs回归测试 +================= + +最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码,请使用 +libhugetlbfs测试套件来检查回归情况。此外,如果你添加了任何新的hugetlb功能,请在 +libhugetlbfs中添加适当的测试。 + +-- +Mike Kravetz,2017年4月7日 diff --git a/Documentation/translations/zh_CN/mm/hwpoison.rst b/Documentation/translations/zh_CN/mm/hwpoison.rst new file mode 100644 index 000000000000..310862edc937 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hwpoison.rst @@ -0,0 +1,166 @@ + +:Original: Documentation/mm/hwpoison.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +hwpoison +======== + +什么是hwpoison? +=============== + + +即将推出的英特尔CPU支持从一些内存错误中恢复( ``MCA恢复`` )。这需要操作系统宣布 +一个页面"poisoned",杀死与之相关的进程,并避免在未来使用它。 + +这个补丁包在虚拟机中实现了必要的(编程)框架。 + +引用概述中的评论:: + + 高级机器的检查与处理。处理方法是损坏的页面被硬件报告,通常是由于2位ECC内 + 存或高速缓存故障。 + + 这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时,当前运行的进程 + 可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理,就可 + 以安全地忽略它. 
而不是用另外一个机器检查去处理它。 + + 处理不同状态的页面缓存页。这里棘手的部分是,相对于其他虚拟内存用户, 我们可以异 + 步访问任何页面。因为内存故障可能随时随地发生,可能违反了他们的一些假设。这就是 + 为什么这段代码必须非常小心。一般来说,它试图使用正常的锁规则,如获得标准锁,即使 + 这意味着错误处理可能需要很长的时间。 + + 这里的一些操作有点低效,并且具有非线性的算法复杂性,因为数据结构没有针对这种情 + 况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的,所 + 以我们希望我们可以摆脱这种情况。 + +该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的 +各种检查组成,用来处理poison的页面。 + +现在主要目标是KVM客户机,但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm +版本。 + +对于KVM的使用,需要一个新的信号类型,这样KVM就可以用适当的地址将机器检查注入到客户 +机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是,所有的应用程序都不要这 +样做,但一些非常专业的应用程序可能会这样做。 + +故障恢复模式 +============ + +有两种(实际上是三种)模式的内存故障恢复可以在。 + +vm.memory_failure_recovery sysctl 置零: + 所有的内存故障都会导致panic。请不要尝试恢复。 + +早期处理 + (可以在全局和每个进程中控制) 一旦检测到错误,立即向应用程序发送SIGBUS这允许 + 应用程序以温和的方式处理内存错误(例如,放弃受影响的对象) 这是KVM qemu使用的 + 模式。 + +推迟处理 + 当应用程序运行到损坏的页面时,发送SIGBUS。这对不知道内存错误的应用程序来说是 + 最好的,默认情况下注意一些页面总是被当作late kill处理。 + +用户控制 +======== + +vm.memory_failure_recovery + 参阅 sysctl.txt + +vm.memory_failure_early_kill + 全局启用early kill + +PR_MCE_KILL + 设置early/late kill mode/revert 到系统默认值。 + + arg1: PR_MCE_KILL_CLEAR: + 恢复到系统默认值 + arg1: PR_MCE_KILL_SET: + arg2定义了线程特定模式 + + PR_MCE_KILL_EARLY: + Early kill + PR_MCE_KILL_LATE: + Late kill + PR_MCE_KILL_DEFAULT + 使用系统全局默认值 + + 注意,如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO),你应该在 + 指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则,SIGBUS将被发送到主线程。 + +PR_MCE_KILL_GET + 返回当前模式 + +测试 +==== + +* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面 + +* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块 + + corrupt-pfn + 在PFN处注入hwpoison故障,并echoed到这个文件。这做了一些早期过滤,以避 + 免在测试套件中损坏非预期页面。 + unpoison-pfn + 在PFN的Software-unpoison页面对应到这个文件。这样,一个页面可以再次被 + 复用。这只对Linux注入的故障起作用,对真正的内存故障不起作用。 + + 注意这些注入接口并不稳定,可能会在不同的内核版本中发生变化 + + corrupt-filter-dev-major, corrupt-filter-dev-minor + 只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通 + 配符值。这应该只用于人工注入的测试。 + + corrupt-filter-memcg + 限制注入到memgroup拥有的页面。由memcg的inode号指定。 + + Example:: + + mkdir /sys/fs/cgroup/mem/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks + + memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages + + corrupt-filter-flags-mask, corrupt-filter-flags-value + 当指定时,只有在((page_flags & mask) == value)的情况下才会poison页面。 + 这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相 + 同。这些标志位在include/linux/kernel-page-flags.h中定义,并在 + Documentation/admin-guide/mm/pagemap.rst中记录。 + +* 架构特定的MCE注入器 + + x86 有 mce-inject, mce-test + + 在mce-test中的一些便携式hwpoison测试程序,见下文。 + +引用 +==== + +http://halobates.de/mce-lc09-2.pdf + 09年LinuxCon的概述演讲 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + 测试套件(在tsrc中的hwpoison特定可移植测试)。 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86特定的注入器 + + +限制 +==== +- 不是所有的页面类型都被支持,而且永远不会。大多数内核内部对象不能被恢 + 复,目前只有LRU页。 + +--- +Andi Kleen, 2009年10月 diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst new file mode 100644 index 000000000000..4c8c6b7b72a3 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/index.rst @@ -0,0 +1,54 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/index.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +================= +Linux内存管理文档 +================= + +这是一个关于Linux内存管理(mm)子系统内部的文档集,其中有不同层次的细节,包括注释 +和邮件列表的回复,用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建 +议,请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。 +对于控制和调整指南,请参阅(Documentation/admin-guide/mm/index)。 +TODO:待引用文档集被翻译完毕后请及时修改此处) + +.. toctree:: + :maxdepth: 1 + + active_mm + balance + damon/index + free_page_reporting + highmem + ksm + frontswap + hmm + hwpoison + hugetlbfs_reserv + memory-model + mmu_notifier + numa + overcommit-accounting + page_frags + page_owner + page_table_check + remap_file_pages + split_page_table_lock + z3fold + zsmalloc + +TODOLIST: +* arch_pgtable_helpers +* free_page_reporting +* hugetlbfs_reserv +* page_migration +* slub +* transhuge +* unevictable-lru +* vmalloced-kernel-stacks diff --git a/Documentation/translations/zh_CN/mm/ksm.rst b/Documentation/translations/zh_CN/mm/ksm.rst new file mode 100644 index 000000000000..d1f82e857ad7 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/ksm.rst @@ -0,0 +1,70 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/ksm.rst + +:翻译: + + 徐鑫 xu xin + +============ +内核同页合并 +============ + +KSM 是一种节省内存的数据去重功能,由CONFIG_KSM=y启用,并在2.6.32版本时被添加 +到Linux内核。详见 ``mm/ksm.c`` 的实现,以及http://lwn.net/Articles/306704和 +https://lwn.net/Articles/330589 + +KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst +文档中有描述。 + +设计 +==== + +概述 +---- + +概述内容请见mm/ksm.c文档中的“DOC: Overview” + +逆映射 +------ +KSM维护着稳定树中的KSM页的逆映射信息。 + +当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时,则代表了 +KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时,这个KSM页 +的 ``page->mapping`` 指向了该稳定树节点。 + +如果共享数超过了阈值,KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多 +个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息,其中 ``page->mapping`` +指向该"副本"。 + +每个链以及链接到该链中的所有"副本"强制不变的是,它们代表了相同的写保护内存 +内容,尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。 + +这样一来,相比与无限的逆映射链表,稳定树的查找计算复杂性不受影响。但在稳定树 +本身中不能有重复的KSM页面内容仍然是强制要求。 + +由 ``max_page_sharing`` 强制决定的数据去重限制是必要的,以此来避免虚拟内存 +rmap链表变得过大。rmap的遍历具有O(N)的复杂度,其中N是共享页面的rmap_项(即 +虚拟映射)的数量,而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。 +因此,这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进 +程在稳定节点"链"上的遍历也是O(N),但这个N是稳定树"副本"的数量,而不是rmap项 +的数量,因此它对ksmd性能没有显著影响。实际上,最佳稳定树"副本"的候选节点将 +保留在"副本"列表的开头。 + +``max_page_sharing`` 的值设置得高了会促使更快的内存合并(因为将有更少的稳定 +树副本排队进入稳定节点chain->hlist)和更高的数据去重系数,但代价是在交换、压 +缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。 + +``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控 +的影响,高比值可能意味着稳定节点dup中存在碎片,这可以通过在ksmd中引入碎片算 +法来解决,该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup,以便释放 +那些仅包含极少rmap项的稳定节点"dup",但这可能会增加ksmd进程的CPU使用率,并可 +能会减慢应用程序在KSM页面上的只读计算。 + +KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本",以便删减过时了的稳定节点。 +这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。 + +参考 +==== +内核代码请见mm/ksm.c。 +涉及的函数(mm_slot ksm_scan stable_node rmap_item)。 diff --git a/Documentation/translations/zh_CN/mm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst new file mode 100644 index 000000000000..77ec149a970c --- /dev/null +++ b/Documentation/translations/zh_CN/mm/memory-model.rst @@ -0,0 +1,135 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/memory-model.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +============ +物理内存模型 +============ + +系统中的物理内存可以用不同的方式进行寻址。最简单的情况是,物理内存从地址0开 +始,跨越一个连续的范围,直到最大的地址。然而,这个范围可能包含CPU无法访问的 +小孔隙。那么,在完全不同的地址可能有几个连续的范围。而且,别忘了NUMA,即不 +同的内存库连接到不同的CPU。 + +Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每 +个架构都定义了它所支持的内存模型,默认的内存模型是什么,以及是否有可能手动 +覆盖该默认值。 + +所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页 +帧的状态。 + +无论选择哪种内存模型,物理页框号(PFN)和相应的 `struct page` 之间都存 +在一对一的映射关系。 + +每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn` +帮助函数,允许从PFN到 `struct page` 的转换,反之亦然。 + +FLATMEM +======= + +最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的 +物理内存。 + +在FLATMEM内存模型中,有一个全局的 `mem_map` 数组来映射整个物理内存。对 +于大多数架构,孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page` +对象从未被完全初始化。 + +为了分配 `mem_map` 数组,架构特定的设置代码应该调用free_area_init()函数。 +然而,在调用memblock_free_all()函数之前,映射数组是不能使用的,该函数 +将所有的内存交给页分配器。 + +一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下,特 +定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。 + +使用FLATMEM,PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET` +是 `mem_map` 数组的一个索引。 + +`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。 + +SPARSEMEM +========= + +SPARSEMEM是Linux中最通用的内存模型,它是唯一支持若干高级功能的内存模型, +如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟 +初始化。 + +SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构 +体表示,它包含 `section_mem_map` ,从逻辑上讲,它是一个指向 `struct page` +阵列的指针。然而,它被存储在一些其他的magic中,以帮助分区管理。区段的大小 +和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量 +来指定的,这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS` +是一个架构所支持的物理地址的实际宽度,而 `SECTION_SIZE_BITS` 是一个任 +意的值。 + +最大的段数表示为 `NR_MEM_SECTIONS` ,定义为 + +.. math:: + + NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} + +`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的 +大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数: + +* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时, `mem_sections` 数组是静态的,有 + `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。 +* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时, `mem_sections` 数组被动态分配。 + 每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象,行数的计算是为了适应所有的 + 内存区。 + +架构设置代码应该调用sparse_init()来初始化内存区和内存映射。 + +通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和 + "sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的 + 值决定。 + +Classic sparse在page->flags中编码了一个页面的段号,并使用PFN的高位来访问映射该页 +框的段。在一个区段内,PFN是指向页数组的索引。 + +Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操 +作。有一个全局的 `struct page *vmemmap` 指针,指向一个虚拟连续的 `struct page` +对象阵列。PFN是该数组的一个索引,`struct page` 从 `vmemmap` 的偏移量是该页的PFN。 + +为了使用vmemmap,一个架构必须保留一个虚拟地址的范围,以映射包含内存映射的物理页,并 +确保 `vmemmap`指向该范围。此外,架构应该实现 :c:func:`vmemmap_populate` 方法, +它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求, +它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。 + +虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分 +配的存储中。这种存储用vmem_altmap结构表示,最终通过一长串的函数调用传递给 +vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和 +:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。 + +ZONE_DEVICE +=========== +`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上,为设备驱动识别的物理地址范 +围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下 +事实有关:这些地址范围的页面对象从未被在线标记过,而且必须对设备进行引用,而不仅仅 +是页面,以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ,通过 :c:func:`devm_memremap_pages` , +为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`, +:c:func:`page_to_pfn`, ,和 :c:func:`get_user_pages` 服务。由于页面引 +用计数永远不会低于1,所以页面永远不会被追踪为空闲内存,页面的 `struct list_head lru` +空间被重新利用,用于向映射该内存的主机设备/驱动程序进行反向引用。 + +虽然 `SPARSEMEM` 将内存作为一个区段的集合,可以选择收集并合成内存块,但 +`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE` +内存从未被在线标记,因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界 +上。这个实现依赖于这种缺乏用户接口的约束,允许子段大小的内存范围被指定给 
+:c:func:`arch_add_memory` ,即内存热插拔的上半部分。子段支持允许2MB作为 +:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。 + +`ZONE_DEVICE` 的用户是: + +* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。 + +* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` , + 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见Documentation/mm/hmm.rst。 + +* p2pdma: 创建 `struct page` 对象,允许PCI/E拓扑结构中的peer设备协调它们之间的 + 直接DMA操作,即绕过主机内存。 diff --git a/Documentation/translations/zh_CN/mm/mmu_notifier.rst b/Documentation/translations/zh_CN/mm/mmu_notifier.rst new file mode 100644 index 000000000000..ce3664d1a410 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/mmu_notifier.rst @@ -0,0 +1,97 @@ +:Original: Documentation/mm/mmu_notifier.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + + +什么时候需要页表锁内通知? +========================== + +当清除一个pte/pmd时,我们可以选择通过在页表锁下(通知版的\*_clear_flush调用 +mmu_notifier_invalidate_range)通知事件。但这种通知并不是在所有情况下都需要的。 + +对于二级TLB(非CPU TLB),如IOMMU TLB或设备TLB(当设备使用类似ATS/PASID的东西让 +IOMMU走CPU页表来访问进程的虚拟地址空间)。只有两种情况需要在清除pte/pmd时在持有页 +表锁的同时通知这些二级TLB: + + A) 在mmu_notifier_invalidate_range_end()之前,支持页的地址被释放。 + B) 一个页表项被更新以指向一个新的页面(COW,零页上的写异常,__replace_page(),...)。 + +情况A很明显,你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。 + +情况B更加微妙。为了正确起见,它需要按照以下序列发生: + + - 上页表锁 + - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify()) + - 设置页表项以指向新页 + +如果在设置新的pte/pmd值之前,清除页表项之后没有进行通知,那么你就会破坏设备的C11或 +C++11等内存模型。 + +考虑以下情况(设备使用类似于ATS/PASID的功能)。 + +两个地址addrA和addrB,这样|addrA - addrB| >= PAGE_SIZE,我们假设它们是COW的 +写保护(B的其他情况也适用)。 + +:: + + [Time N] -------------------------------------------------------------------- + CPU-thread-0 {尝试写到addrA} + CPU-thread-1 {尝试写到addrB} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {读取addrA并填充设备TLB} + DEV-thread-2 {读取addrB并填充设备TLB} + [Time N+1] ------------------------------------------------------------------ + CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} + CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+2] ------------------------------------------------------------------ + CPU-thread-0 {COW_step1: {更新页表以指向addrA的新页}} + CPU-thread-1 {COW_step1: {更新页表以指向addrB的新页}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {写入addrA,这是对新页面的写入} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {} + CPU-thread-3 {写入addrB,这是一个写入新页的过程} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+4] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+5] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {从旧页中读取addrA} + DEV-thread-2 {从新页面读取addrB} + +所以在这里,因为在N+2的时候,清空页表项没有和通知一起作废二级TLB,设备在看到addrA的新值之前 +就看到了addrB的新值。这就破坏了设备的总内存序。 + +当改变一个pte的写保护或指向一个新的具有相同内容的写保护页(KSM)时,将mmu_notifier_invalidate_range +调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程 +在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占,也是如此。 diff --git a/Documentation/translations/zh_CN/mm/numa.rst b/Documentation/translations/zh_CN/mm/numa.rst new file mode 100644 index 
000000000000..b15cfeeb6dfb --- /dev/null +++ b/Documentation/translations/zh_CN/mm/numa.rst @@ -0,0 +1,101 @@ +:Original: Documentation/mm/numa.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +始于1999年11月,作者: + +========================== +何为非统一内存访问(NUMA)? +========================== + +这个问题可以从几个视角来回答:硬件观点和Linux软件视角。 + +从硬件角度看,NUMA系统是一个由多个组件或装配组成的计算机平台,每个组件可能包含0个或更多的CPU、 +本地内存和/或IO总线。为了简洁起见,并将这些物理组件/装配的硬件视角与软件抽象区分开来,我们在 +本文中称这些组件/装配为“单元”。 + +每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能 +不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如,交叉开关或点对点 +链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来,以创建NUMA平台,其中的单元与其 +他单元有多个距离。 + +对于Linux,感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中, +所有的内存都是可见的,并且可以从连接到任何单元的任何CPU中访问,缓存一致性是由处理器缓存和/或 +系统互连在硬件中处理。 + +内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元 +有多远。例如,连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和 +更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的(其他)单元。 + +平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反,这种架构是提供可扩展 +内存带宽的一种手段。然而,为了实现可扩展的内存带宽,系统和应用软件必须安排大部分的内存引用 +[cache misses]到“本地”内存——同一单元的内存,如果有的话——或者到最近的有内存的单元。 + +这就自然而然有了Linux软件对NUMA系统的视角: + +Linux将系统的硬件资源划分为多个软件抽象,称为“节点”。Linux将节点映射到硬件平台的物理单元 +上,对一些架构的细节进行了抽象。与物理单元一样,软件节点可能包含0或更多的CPU、内存和/或IO +总线。同样,对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快 +的访问时间和更高的有效带宽。 + +对于一些架构,如x86,Linux将“隐藏”任何代表没有内存连接的物理单元的节点,并将连接到该单元 +的任何CPU重新分配到代表有内存的单元的节点上。因此,在这些架构上,我们不能假设Linux将所有 +的CPU与一个给定的节点相关联,会看到相同的本地内存访问时间和带宽。 + +此外,对于某些架构,同样以x86为例,Linux支持对额外节点的仿真。对于NUMA仿真,Linux会将现 +有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部 +分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的,当与cpusets一起使用时, +可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst] + +对于每个有内存的节点,Linux构建了一个独立的内存管理子系统,有自己的空闲页列表、使用中页列表、 +使用统计和锁来调解访问。此外,Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE +中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求 +时要访问的区/节点。当一个区没有可用的内存来满足请求时,这种情况被称为“overflow 溢出”或 +“fallback 回退”。 + +由于一些节点包含多个包含不同类型内存的区,Linux必须决定是否对区列表进行排序,使分配回退到不同 +节点上的相同区类型,或同一节点上的不同区类型。这是一个重要的考虑因素,因为有些区,如DMA或DMA32, +代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距 +离排序的远程节点之前,它会尝试回退到同一节点的其他分区。 + +默认情况下,Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说,Linux将试 +图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能 +满足请求,内核将检查所选分区列表中其他节点的区域,寻找列表中第一个能满足请求的区域。 + +本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配 +一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构 +中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥 +远的调度域中。然而,调度器并没有直接考虑到任务的NUMA足迹。因此,在充分不平衡的情况下,任务可 +以在节点之间迁移,远离其初始节点和内核数据结构。 + +系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口,如taskset(1)和numactl(1),以及程 +序接口,如sched_setaffinity(2),来限制任务的迁移,以改善NUMA定位。此外,人们可以使用 +Linux NUMA内存策略修改内核的默认本地分配行为。 [见 +:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. 
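+作为补充示意(非原文内容,这里假定系统装有libnuma库,编译时以 -lnuma 链接),下面的小例子演示了用户空间如何把任务限制在节点0的CPU上运行,并倾向于从该节点分配内存::
+
+   #include <numa.h>    /* libnuma,numactl软件包提供 */
+   #include <stdio.h>
+
+   int main(void)
+   {
+           if (numa_available() < 0) {
+                   fprintf(stderr, "该系统不支持NUMA\n");
+                   return 1;
+           }
+           numa_run_on_node(0);    /* 只在节点0的CPU上运行当前任务 */
+           numa_set_preferred(0);  /* 后续分配倾向于使用节点0的本地内存 */
+
+           void *buf = numa_alloc_onnode(1 << 20, 0);  /* 显式地从节点0分配1MiB */
+           if (buf)
+                   numa_free(buf, 1 << 20);
+           return 0;
+   }
+
+这样,任务的CPU亲和性与内存分配偏好指向同一节点,大多数缺页引起的分配都会落在本地内存上。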
+ +系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点 +的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst] + +在不隐藏无内存节点的架构上,Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无 +内存的节点,“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反,它 +将是内核在建立分区列表时选择的离它最近的有内存的节点。所以,默认情况下,本地分配将由内核提供 +最近的可用内存来完成。这是同一机制的结果,该机制允许这种分配在一个包含内存的节点溢出时回退到 +其他附近的节点。 + +一些内核分配不希望或不能容忍这种分配回退行为。相反,他们想确保他们从指定的节点获得内存,或者 +得到通知说该节点没有空闲内存。例如,当一个子系统分配每个CPU的内存资源时,通常是这种情况。 + +一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的 +节点ID,然后只从返回的节点ID请求内存。当这样的分配失败时,请求的子系统可以恢复到它自己的回退 +路径。板块内核内存分配器就是这样的一个例子。或者,子系统可以选择在分配失败时禁用或不启用自己。 +内核分析子系统就是这样的一个例子。 + +如果架构支持——不隐藏无内存节点,那么连接到无内存节点的CPU将总是产生回退路径的开销,或者一些 +子系统如果试图完全从无内存的节点分配内存,将无法初始化。为了透明地支持这种架构,内核子系统可 +以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样,这是同 +一个节点,默认的本地页分配将从这个节点开始尝试。 diff --git a/Documentation/translations/zh_CN/mm/overcommit-accounting.rst b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst new file mode 100644 index 000000000000..d8452d8b7fbb --- /dev/null +++ b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst @@ -0,0 +1,86 @@ +:Original: Documentation/mm/overcommit-accounting.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + + +============== +超量使用审计 +============== + +Linux内核支持下列超量使用处理模式 + +0 + 启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。 + 它确保严重的疯狂分配失败,同时允许超量使用以减少swap的使用。在这种模式下, + 允许root分配稍多的内存。这是默认的。 +1 + 总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码,只是依赖 + 几乎完全由零页组成的虚拟内存 + +2 + 不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量 + (默认为50%)。根据你使用的数量,在大多数情况下,这意味着一个进程在访问页面时 + 不会被杀死,但会在内存分配上收到相应的错误。 + + 对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说 + 是很有用的。 + +超量使用策略是通过sysctl `vm.overcommit_memory` 设置的。 + +可以通过 `vm.overcommit_ratio` (百分比)或 `vm.overcommit_kbytes` (绝对值) +来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。 + +在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前 +的超量使用和提交量。 + +陷阱 +==== + +C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证,并在接近边缘的地方运行, +你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说,这并 +不重要,但如果你真的非常关心的话,这就是一个值得关注的案例。 + + +在模式2中,MAP_NORESERVE标志被忽略。 + + +它是如何工作的 +============== + +超量使用是基于以下规则 + +对于文件映射 + | SHARED or READ-only - 0 cost (该文件是映射而不是交换) + | PRIVATE WRITABLE - 每个实例的映射大小 + +对于匿名或者 ``/dev/zero`` 映射 + | SHARED - 映射的大小 + | PRIVATE READ-only - 0 cost (但作用不大) + | PRIVATE WRITABLE - 每个实例的映射大小 + +额外的计数 + | 通过mmap制作可写副本的页面 + | 从同一池中提取的shmfs内存 + +状态 +==== + +* 我们核算mmap内存映射 +* 我们核算mprotect在提交中的变化 +* 我们核算mremap的大小变化 +* 我们的审计 brk +* 审计munmap +* 我们在/proc中报告commit 状态 +* 核对并检查分叉的情况 +* 审查堆栈处理/执行中的构建 +* 叙述SHMfs的情况 +* 实现实际限制的执行 + +待续 +==== +* ptrace 页计数(这很难)。 diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst new file mode 100644 index 000000000000..320952ca93af --- /dev/null +++ b/Documentation/translations/zh_CN/mm/page_frags.rst @@ -0,0 +1,38 @@ +:Original: Documentation/mm/page_frag.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +页面片段 +======== + +一个页面片段是一个任意长度的任意偏移的内存区域,它位于一个0或更高阶的复合页面中。 +该页中的多个碎片在该页的引用计数器中被单独计算。 + +page_frag函数,page_frag_alloc和page_frag_free,为页面片段提供了一个简单 +的分配框架。这被网络堆栈和网络设备驱动使用,以提供一个内存的支持区域,作为 +sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 + +为了使用页面片段API,需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点, +并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用, +这在分配时开销可能会很大。然而,由于这种缓存的性质,要求任何对缓存的调用都要受到每 +个CPU的限制,或者每个CPU的限制,并在执行碎片分配时强制禁止中断。 + +网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 +netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache +被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 +它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 +将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 + +许多网络设备驱动程序使用类似的方法来分配页面片段,但页面片段是在环或描述符级别上 
+
+许多网络设备驱动程序使用类似的方法来分配页面片段,但页面片段是在环或描述符级别上
+缓存的。为了处理这些情况,有必要提供一种拆解页面缓存的通用方法。出于这个原因,
+实现了__page_frag_cache_drain。它允许通过一次调用从一个页面释放多个引用。
+这样做的好处是,它允许清理被添加到一个页面的多个引用,以避免每次分配都调用
+get_page。
+
+Alexander Duyck,2016年11月29日。
diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst
new file mode 100644
index 000000000000..03d9e613094a
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_owner.rst
@@ -0,0 +1,116 @@
+:Original: Documentation/mm/page_owner.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+
+================================
+page owner: 跟踪谁分配的每个页面
+================================
+
+概述
+====
+
+page owner是用来追踪每个页面是由谁分配的。它可以用来调试内存泄漏或找到内存占用者。
+当分配发生时,有关分配的信息,如调用堆栈和页面的阶(order),被存储到每个页面的特
+定存储中。当我们需要了解所有页面的状态时,我们可以获得并分析这些信息。
+
+尽管我们已经有了追踪页面分配/释放的tracepoint,但用它来分析每个页面是由谁分配的
+相当复杂。我们需要扩大跟踪缓冲区,以防止在用户空间程序启动前记录被覆盖。而且,启
+动后的程序还要不断地把跟踪缓冲区转储出来供以后分析,相比只把信息保留在内存中,这
+更有可能改变系统的行为,所以不利于调试。
+
+page owner也可以用于各种其他目的。例如,可以通过每个页面的gfp标志信息获得精确的
+碎片统计。如果启用了page owner,该功能就已经实现并激活了。我们非常欢迎其他用途。
+
+page owner在默认情况下是禁用的。所以,如果你想使用它,你需要在你的启动cmdline
+中加入"page_owner=on"。如果内核是带着page owner构建的,但由于没有启用启动
+选项而在运行时被禁用,那么运行时的开销是很小的。如果在运行时被禁用,它不需要内存
+来存储所有者信息,所以没有运行时内存开销。而且,page owner在页面分配器的热路径
+中只插入了两个不太可能被执行(unlikely)的分支,如果不启用,那么分配就会像没有
+page owner的内核一样进行。这两个不太可能的分支应该不会影响到分配的性能,特别是
+在静态键跳转标签修补功能可用的情况下。以下是由于这个功能而导致的内核代码大小的
+变化。
+
+- 没有page owner::
+
+   text    data     bss     dec     hex filename
+  48392    2333     644   51369    c8a9 mm/page_alloc.o
+
+- 有page owner::
+
+   text    data     bss     dec     hex filename
+  48800    2445     644   51889    cab1 mm/page_alloc.o
+   6662     108      29    6799    1a8f mm/page_owner.o
+   1025       8       8    1041     411 mm/page_ext.o
+
+虽然总共增加了8KB的代码,但page_alloc.o增加了520字节,其中不到一半是在hotpath
+中。构建带有page owner的内核,并在需要时打开它,将是调试内核内存问题的最佳选择。
+
+有一个问题是由实现细节引起的。page owner将信息存储到struct page扩展的内存中。
+这个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些,所以,在初始
+化之前,许多页面可能已经被分配了,而它们没有所有者信息。为了解决这个问题,这些早
+期分配的页面在初始化阶段被检查并标记为已分配。虽然这并不意味着它们有正确的所有者
+信息,但至少,我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上,有
+13343个早期分配的页面被捕捉和标记,尽管它们大部分是由struct page扩展功能分配的。
+总之,在这之后,没有任何页面处于未追踪状态。
+
+使用方法
+========
+
+1) 构建用户空间帮助程序::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) 启用page owner:在启动cmdline中添加 "page_owner=on"。
+
+3) 做你想调试的工作。
+
+4) 分析来自page owner的信息::
+
+	cat /sys/kernel/debug/page_owner > page_owner_full.txt
+	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+
+   ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值)::
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+   ``page_owner_sort`` 工具忽略 ``PFN`` 行,将剩余的行放入buf,使用regexp提
+   取页面的阶值,统计buf出现的次数和页数,最后根据参数对这些buf进行排序。
+
+   在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出::
+
+	XXX times, XXX pages:
+	Page allocated via order XXX, ...
+	// Detailed stack
+
+   默认情况下, ``page_owner_sort`` 是根据buf的时间来排序的。如果你想
+   按buf的页数排序,请使用-m参数。详细的参数是:
+
+   基本功能:
+
+	Sort:
+		-a	按内存分配时间排序
+		-m	按总内存排序
+		-p	按pid排序
+		-P	按tgid排序
+		-r	按内存释放时间排序
+		-s	按堆栈跟踪排序
+		-t	按时间排序(默认)
+
+   其它功能:
+
+	Cull:
+		-c	通过比较堆栈跟踪而不是总块来进行剔除。
+
+	Filter:
+		-f	过滤掉内存已被释放的块的信息。
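+
+   例如,要按总内存排序,同时过滤掉已被释放的块,可以组合使用上述参数
+   (示意命令,文件名仅为示例)::
+
+	./page_owner_sort -m -f page_owner_full.txt sorted_page_owner.txt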
diff --git a/Documentation/translations/zh_CN/mm/page_table_check.rst b/Documentation/translations/zh_CN/mm/page_table_check.rst
new file mode 100644
index 000000000000..e8077310a76c
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_table_check.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/page_table_check.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+
+========
+页表检查
+========
+
+概述
+====
+
+页表检查通过确保阻止某些类型的内存损坏来对内核进行加固。
+
+当新的页面可以从用户空间访问时,页表检查在它们的页表项(PTE、PMD等)被添加到页表
+之际执行额外的验证。
+
+在检测到损坏的情况下,内核会崩溃。页表检查有少量的性能和内存开销。因此,它在默认
+情况下是禁用的,但在额外加固的收益超过性能成本的系统上,可以选择启用。另外,由于
+页表检查是同步的,它可以帮助调试双映射内存损坏问题:在错误的映射发生时就使内核崩
+溃,而不是等到内存损坏错误真正造成破坏之后内核才崩溃。
+
+双重映射检测逻辑
+================
+
++-------------------+-------------------+-------------------+------------------+
+| Current Mapping   | New mapping       | Permissions       | Rule             |
++===================+===================+===================+==================+
+| Anonymous         | Anonymous         | Read              | Allow            |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Named             | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Anonymous         | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Named             | Any               | Allow            |
++-------------------+-------------------+-------------------+------------------+
+
+启用页表检查
+============
+
+用以下方法构建内核:
+
+- PAGE_TABLE_CHECK=y
+  注意,它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。
+
+- 使用 "page_table_check=on" 内核参数启动。
+
+可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核,以便在不需要额外内核参数的情况
+下获得页表检查支持。
diff --git a/Documentation/translations/zh_CN/mm/remap_file_pages.rst b/Documentation/translations/zh_CN/mm/remap_file_pages.rst
new file mode 100644
index 000000000000..31e0c54dc36f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/remap_file_pages.rst
@@ -0,0 +1,32 @@
+:Original: Documentation/mm/remap_file_pages.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+
+==============================
+remap_file_pages()系统调用
+==============================
+
+remap_file_pages()系统调用被用来创建一个非线性映射,也就是说,在这个映射中,
+文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好
+处是,前者不需要内核创建额外的VMA(虚拟内存区)数据结构。
+
+支持非线性映射需要在内核虚拟内存子系统中编写大量non-trivial的代码,其中包括热
+路径上的代码。另外,为了使非线性映射工作,内核需要一种方法来区分正常的页表项和
+带有文件偏移的项(pte_file)。内核为达到这个目的在PTE中保留了一个标志位。PTE
+标志位是稀缺资源,特别是在某些CPU架构上。如果能腾出这个标志位用于其他用途就更
+好了。
+
+幸运的是,现实中并没有多少remap_file_pages()的用户。目前只知道有一个企业级的
+RDBMS实现在32位系统上使用这个系统调用,来映射比32位虚拟地址空间线性尺寸更大的
+文件。由于64位系统的广泛使用,这种使用场景已经不重要了。
+
+该系统调用已被废弃,现在用一个仿真来代替它。仿真会创建新的VMA,而不是非线性映
+射。对于remap_file_pages()的少数用户来说,它的工作速度会变慢,但ABI被保留了
+下来。
+
+仿真的一个副作用(除了性能之外)是,由于额外的VMA,用户可以更容易达到
+vm.max_map_count的限制。关于该限制的更多细节,请参见DEFAULT_MAX_MAP_COUNT
+的注释。
diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
new file mode 100644
index 000000000000..4fb7aa666037
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
@@ -0,0 +1,96 @@
+:Original: Documentation/mm/split_page_table_lock.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+
+=================================
+分页表锁(split page table lock)
+=================================
+
+最初,mm->page_table_lock自旋锁保护着mm_struct的所有页表。但是这种方法导致
+多线程应用程序的缺页异常可扩展性差,因为对这把锁的争夺很激烈。为了提高可扩展性,
+我们引入了分页表锁。
+
+有了分页表锁,我们就有了单独的每表锁,用来串行化对这张表的访问。目前,我们对PTE
+表和PMD表使用分页表锁。对更高层级页表的访问则仍由mm->page_table_lock保护。
+
+有一些辅助函数用来锁定/解锁一张表,还有其他的访问器函数:
+
+ - pte_offset_map_lock()
+	映射pte并获取PTE表锁,返回所取锁的指针;
+ - pte_unmap_unlock()
+	解锁和解映射PTE表;
+ - pte_alloc_map_lock()
+	如果需要的话,分配PTE表并获取锁;返回所取锁的指针,若分配失败则返回
+	NULL;
+ - pte_lockptr()
+	返回指向PTE表锁的指针;
+ - pmd_lock()
+	取得PMD表锁,返回所取锁的指针;
+ - pmd_lockptr()
+	返回指向PMD表锁的指针。
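+
+下面是一个典型的用法示意(仅为说明,并非内核源码或原文档中的例子;假设调用者已
+经找到了相应的pmd,函数名和变量名均为假设)::
+
+	#include <linux/mm.h>
+
+	static void example_update_pte(struct mm_struct *mm, pmd_t *pmd,
+				       unsigned long addr)
+	{
+		spinlock_t *ptl;
+		pte_t *pte;
+
+		/* 映射PTE表,并同时取得对应的分页表锁 */
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+
+		/* ……在持锁状态下检查或修改 *pte …… */
+
+		/* 解锁并解除PTE表的映射 */
+		pte_unmap_unlock(pte, ptl);
+	}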
+
+如果CONFIG_SPLIT_PTLOCK_CPUS(通常为4)小于或等于NR_CPUS,则在编译时启用
+PTE表的分页表锁。如果分页表锁被禁用,所有的表都由mm->page_table_lock来保护。
+
+如果PTE表启用了分页表锁,并且架构支持,那么PMD表的分页表锁也会被启用(见
+下文)。
+
+Hugetlb 和分页表锁
+==================
+
+Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页表锁,而不对PUD使用。
+
+Hugetlb特定的辅助函数:
+
+ - huge_pte_lock()
+	对PMD_SIZE的页面取pmd分页表锁,否则取mm->page_table_lock;
+ - huge_pte_lockptr()
+	返回指向表锁的指针。
+
+架构对分页表锁的支持
+====================
+
+没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor()
+和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。
+
+确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页
+面。这个字段与page->ptl共享存储。
+
+PMD分页表锁只有在页表层级超过两级时才有意义。
+
+启用PMD分页表锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调
+用pgtable_pmd_page_dtor()。
+
+分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb()
+中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE会在pgd_alloc()中预先
+分配一些PMD。
+
+一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
+
+注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败——必
+须正确处理。
+
+page->ptl
+=========
+
+page->ptl用于访问分页表锁,其中'page'指包含该表的页面的struct page。它
+与page->private(以及union中的其他几个字段)共享存储。
+
+为了避免增加struct page的大小并获得最佳性能,我们使用了一个技巧:
+
+ - 如果spinlock_t能装入long,我们就把page->ptl直接用作spinlock,这样我们
+   就可以避免间接访问并节省一个缓存行。
+ - 如果spinlock_t的大小大于long的大小,我们就把page->ptl用作spinlock_t
+   的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
+   情况下使用分页表锁,但由于间接访问而多花一个缓存行。
+
+PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t
+分配在pgtable_pmd_page_ctor()中。
+
+请不要直接访问page->ptl——请使用适当的辅助函数。
diff --git a/Documentation/translations/zh_CN/mm/z3fold.rst b/Documentation/translations/zh_CN/mm/z3fold.rst
new file mode 100644
index 000000000000..9569a6d88270
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/z3fold.rst
@@ -0,0 +1,31 @@
+:Original: Documentation/mm/z3fold.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+
+======
+z3fold
+======
+
+z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。
+它是zbud的衍生物,在保持其前辈的简单性和确定性的同时,允许更高的压缩率。
+
+z3fold和zbud的主要区别是:
+
+* 与zbud不同的是,z3fold允许最大为PAGE_SIZE的分配;
+* z3fold在其页面中最多可以容纳3个压缩页面;
+* z3fold本身没有导出任何API,因此打算通过zpool的API来使用。
+
+为了保持确定性和简单性,z3fold,就像zbud一样,总是在每页存储整数个压缩页,但是
+它最多可以存储3页,不像zbud最多只能存储2页。因此压缩率达到2.7倍左右,而zbud的
+压缩率是1.7倍左右。
+
+与zbud不同(但与zsmalloc类似),z3fold_alloc()不返回一个可解引用的指针。相反,
+它返回一个无符号长整型的句柄,其中编码了被分配对象的实际位置。
+
+在保持有效压缩率接近zsmalloc的同时,z3fold不依赖于MMU,并提供更可预测的回收行
+为,这使得它更适合于小型且对响应要求高的系统。
diff --git a/Documentation/translations/zh_CN/mm/zsmalloc.rst b/Documentation/translations/zh_CN/mm/zsmalloc.rst
new file mode 100644
index 000000000000..b5596ea08ae4
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/zsmalloc.rst
@@ -0,0 +1,78 @@
+:Original: Documentation/mm/zsmalloc.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si
+
+:校译:
+
+========
+zsmalloc
+========
+
+这个分配器是为与zram一起使用而设计的。因此,该分配器应该在低内存条件下工作良好。特别是,
+它从不尝试higher order页面的分配——这种分配在内存压力下很可能会失败。另一方面,如果我们
+只是使用单个(0-order)页面,它将遭受非常高的碎片化——任何大小为PAGE_SIZE/2或更大的对象
+将占据整个页面。这是其前身(xvmalloc)的主要问题之一。
+
+为了克服这些问题,zsmalloc分配了一组0-order页面,并使用各种"struct page"字段将它
+们链接起来。这些链接的页面作为一个单一的higher order页面,即一个对象可以跨越0-order
+页面的边界。代码将这些链接的页面作为一个实体,称为zspage。
+
+为了简单起见,zsmalloc只能分配大小不超过PAGE_SIZE的对象,因为这满足了所有当前用户的
+要求(在最坏的情况下,页面是不可压缩的,因此以"原样"即未压缩的形式存储)。对于大于这
+个大小的分配请求,会返回失败(见zs_malloc)。
+
+此外,zs_malloc()并不返回一个可解引用的指针。相反,它返回一个不透明的句柄(无符号长
+整型),其中编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspage的
+永久映射,因为这在32位系统上会有问题——内核空间中可用于映射的VA区域非常小。因此,在使
+用分配的内存之前,对象必须使用zs_map_object()进行映射以获得一个可用的指针,随后使用
+zs_unmap_object()解除映射。
+
+stat
+====
+
+通过CONFIG_ZSMALLOC_STAT,我们可以通过 ``/sys/kernel/debug/zsmalloc/``
+看到zsmalloc的内部信息。下面是一个统计输出的例子::
+
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
+    ...
+    ...
+     9   176           0            1          186        129          8                4
+    10   192           1            0         2880       2872        135                3
+    11   208           0            1          819        795         42                2
+    12   224           0            1          219        159         12                4
+    ...
+    ...
+
+
+class
+	索引
+size
+	zspage存储的对象大小
+almost_empty
+	ZS_ALMOST_EMPTY zspage的数量(见下文)
+almost_full
+	ZS_ALMOST_FULL zspage的数量(见下文)
+obj_allocated
+	已分配对象的数量
+obj_used
+	分配给用户的对象的数量
+pages_used
+	为该类分配的页数
+pages_per_zspage
+	组成一个zspage的0-order页面的数量
+
+当n <= N / f时,我们将一个zspage分配到ZS_ALMOST_EMPTY fullness组,其中
+
+* n = 已分配对象的数量
+* N = zspage可以存储的对象总数
+* f = fullness_threshold_frac(即,目前是4)
+
+同样地,我们将zspage分配到:
+
+* 当 n > N / f 时,为 ZS_ALMOST_FULL
+* 当 n == 0 时,为 ZS_EMPTY
+* 当 n == N 时,为 ZS_FULL
diff --git a/Documentation/translations/zh_CN/vm/active_mm.rst b/Documentation/translations/zh_CN/vm/active_mm.rst
deleted file mode 100644
index 366609ea4f37..000000000000
--- a/Documentation/translations/zh_CN/vm/active_mm.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/active_mm.rst
-
-:翻译:
-
- 司延腾 Yanteng Si
-
-:校译:
-
-
-=========
-Active MM
-=========
-
-这是一封linux之父回复开发者的一封邮件,所以翻译时我尽量保持邮件格式的完整。
-
-::
-
- List: linux-kernel
- Subject: Re: active_mm
- From: Linus Torvalds
- Date: 1999-07-30 21:36:24
-
- 因为我并不经常写解释,所以已经抄送到linux-kernel邮件列表,而当我做这些,
- 且更多的人在阅读它们时,我觉得棒极了。
-
- 1999年7月30日 星期五, David Mosberger 写道:
- >
- > 是否有一个简短的描述,说明task_struct中的
- > "mm" 和 "active_mm"应该如何使用? (如果
- > 这个问题在邮件列表中讨论过,我表示歉意--我刚
- > 刚度假回来,有一段时间没能关注linux-kernel了)。
-
- 基本上,新的设定是:
-
- - 我们有“真实地址空间”和“匿名地址空间”。区别在于,匿名地址空间根本不关心用
-   户级页表,所以当我们做上下文切换到匿名地址空间时,我们只是让以前的地址空间
-   处于活动状态。
-
-   一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线
-   程基本上都属于这一类,但即使是“真正的”线程也可以暂时说在一定时间内它们不
-   会对用户空间感兴趣,调度器不妨试着避免在切换VM状态上浪费时间。目前只有老
-   式的bdflush sync能做到这一点。
-
- - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说,tsk->mm将是NULL,
-   其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。
-
- - 然而,我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此,我们
-   有 “tsk->active_mm”,它显示了当前活动的地址空间是什么。
-
-   规则是,对于一个有真实地址空间的进程(即tsk->mm是 non-NULL),active_mm
-   显然必须与真实的mm相同。
-
-   对于一个匿名进程,tsk->mm == NULL,而tsk->active_mm是匿名进程运行时
-   “借用”的mm。当匿名进程被调度走时,借用的地址空间被返回并清除。
-
- 为了支持所有这些,“struct mm_struct”现在有两个计数器:一个是 “mm_users”
- 计数器,即有多少 “真正的地址空间用户”,另一个是 “mm_count”计数器,即 “lazy”
- 用户(即匿名用户)的数量,如果有任何真正的用户,则加1。
-
- 通常情况下,至少有一个真正的用户,但也可能是真正的用户在另一个CPU上退出,而
- 一个lazy的用户仍在活动,所以你实际上得到的情况是,你有一个地址空间 **只**
- 被lazy的用户使用。这通常是一个短暂的生命周期状态,因为一旦这个线程被安排给一
- 个真正的线程,这个 “僵尸” mm就会被释放,因为 “mm_count”变成了零。
-
- 另外,一个新的规则是,**没有人** 再把 “init_mm” 作为一个真正的MM了。
- “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”,事实上,它主
- 要是在启动时使用,当时还没有真正的VM被创建。因此,用来检查的代码
-
- if (current->mm == &init_mm)
-
- 一般来说,应该用
-
- if (!current->mm)
-
- 取代上面的写法(这更有意义--测试基本上是 “我们是否有一个用户环境”,并且通常
- 由缺页异常处理程序和类似的东西来完成)。
-
- 总之,我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1,因为它稍微改
- 变了接口以适配alpha(谁会想到呢,但alpha体系结构上下文切换代码实际上最终是
- 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的,alpha的PALcode将两者
- 连接起来,你需要同时切换两者)。
-
- (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/translations/zh_CN/vm/balance.rst b/Documentation/translations/zh_CN/vm/balance.rst
deleted file mode 100644
index e98a47ef24a8..000000000000
--- a/Documentation/translations/zh_CN/vm/balance.rst
+++ /dev/null
@@ -1,81 +0,0 @@
-.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/balance.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -内存平衡 -======== - -2000年1月开始,作者:Kanoj Sarcar - -对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配,需要进行 -内存平衡。 - -调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个 -原因可能是,调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退 -选项的机会主义高阶分配请求中。在这种情况下,调用者可能也希望避免唤醒kswapd。 - -__GFP_IO分配请求是为了防止文件系统死锁。 - -在没有非睡眠分配请求的情况下,做平衡似乎是有害的。页面回收可以被懒散地启动,也就是 -说,只有在需要的时候(也就是区域的空闲内存为0),而不是让它成为一个主动的过程。 - -也就是说,内核应该尝试从直接映射池中满足对直接映射页的请求,而不是回退到dma池中, -这样就可以保持dma池为dma请求(不管是不是原子的)所填充。类似的争论也适用于高内存 -和直接映射的页面。相反,如果有很多空闲的dma页,最好是通过从dma池中分配一个来满足 -常规的内存请求,而不是产生常规区域平衡的开销。 - -在2.2中,只有当空闲页总数低于总内存的1/64时,才会启动内存平衡/页面回收。如果dma -和常规内存的比例合适,即使dma区完全空了,也很可能不会进行平衡。2.2已经在不同内存 -大小的生产机器上运行,即使有这个问题存在,似乎也做得不错。在2.3中,由于HIGHMEM的 -存在,这个问题变得更加严重。 - -在2.3中,区域平衡可以用两种方式之一来完成:根据区域的大小(可能是低级区域的大小), -我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是,在平衡的时 -候,我们不需要看低级区的大小,坏的方面是,我们可能会因为忽略低级区可能较低的使用率 -而做过于频繁的平衡。另外,只要对分配程序稍作修改,就有可能将memclass()宏简化为一 -个简单的等式。 - -另一个可能的解决方案是,我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其 -低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题,并尽可能地保持了与2.2行为 -的接近。另外,平衡算法在各种架构上的工作方式也是一样的,这些架构有不同数量和类型的 -内存区。如果我们想变得更花哨一点,我们可以在未来为不同区域的自由页面分配不同的权重。 - -请注意,如果普通区的大小与dma区相比是巨大的,那么在决定是否平衡普通区的时候,考虑 -空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。 - -所附的补丁实现了第二个解决方案。它还 “修复”了两个问题:首先,在低内存条件下,kswapd -被唤醒,就像2.2中的非睡眠分配。第二,HIGHMEM区也被平衡了,以便给replace_with_highmem() -一个争取获得HIGHMEM页的机会,同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM -页不会被泄露(例如,在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下)。 - -kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的,可能 -是因为所有的分配请求都来自中断上下文,而所有的进程上下文都在睡眠。对于2.3, -kswapd并不真正需要平衡高内存区,因为中断上下文并不请求高内存页。kswapd看zone -结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。 - -如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力,而该区的内存压力 -已经低于其水位,则会进行偷取。 - -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: -这些是每个区的字段,用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时, -hysteric 的字段low_on_memory被设置。这个字段会一直被设置,直到空闲页数变成水位 -[WMARK_HIGH]。当low_on_memory被设置时,页面分配请求将尝试释放该区域的一些页面(如果 -请求中设置了GFP_WAIT)。与此相反的是,决定唤醒kswapd以释放一些区的页。这个决定不是基于 -hysteresis 的,而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行;在这种情况下, -zone_wake_kswapd也被设置。 - - -我所听到的(超棒的)想法: - -1. 动态经历应该影响平衡:可以跟踪一个区的失败请求的数量,并反馈到平衡方案中(jalvo@mbay.net)。 - -2. 实现一个类似于replace_with_highmem()的replace_with_regular(),以保留dma页面。 - (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/translations/zh_CN/vm/damon/api.rst b/Documentation/translations/zh_CN/vm/damon/api.rst deleted file mode 100644 index 21143eea4ebe..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/api.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/api.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======= -API参考 -======= - -内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` , -它位于源代码树的include/linux/。 - -结构体 -====== - -该API在以下内核代码中: - -include/linux/damon.h - - -函数 -==== - -该API在以下内核代码中: - -mm/damon/core.c diff --git a/Documentation/translations/zh_CN/vm/damon/design.rst b/Documentation/translations/zh_CN/vm/damon/design.rst deleted file mode 100644 index 46128b77c2b3..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/design.rst +++ /dev/null @@ -1,140 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/design.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -==== -设计 -==== - -可配置的层 -========== - -DAMON提供了数据访问监控功能,同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间 -并为之优化的基元。另一方面,作为DAMON的核心,准确性和开销的权衡机制是在纯逻辑空间中。DAMON -将这两部分分离在不同的层中,并定义了它的接口,以允许各种低层次的基元实现与核心逻辑的配置。 - -由于这种分离的设计和可配置的接口,用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的 -任何地址空间。如果没有提供合适的,用户可以自己实现基元。 - -例如,物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。 -另外,如果某些架构或设备支持特殊的优化访问检查基元,这些基元将很容易被配置。 - - -特定地址空间基元的参考实现 -========================== - -基本访问监测的低级基元被定义为两部分。: - -1. 确定地址空间的监测目标地址范围 -2. 目标空间中特定地址范围的访问检查。 - -DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。 - - -基于VMA的目标地址范围构造 -------------------------- - -这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间,只是要求用户手动设置监控目标地址范围。 - -在进程的超级巨大的虚拟地址空间中,只有小部分被映射到物理内存并被访问。因此,跟踪未映射的地 -址区域只是一种浪费。然而,由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声,所以严 -格来说,跟踪每一个映射并不是必须的,但在某些情况下甚至会产生很高的开销。也就是说,监测目标 -内部过于巨大的未映射区域应该被移除,以不占用自适应机制的时间。 - -出于这个原因,这个实现将复杂的映射转换为三个不同的区域,覆盖地址空间的每个映射区域。这三个 -区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面 -的mmap()区域之间的间隙,以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙 -在通常的地址空间中是异常巨大的,排除这些间隙就足以做出合理的权衡。下面详细说明了这一点:: - - - - - (small mmap()-ed regions and munmap()-ed regions) - - - - - -基于PTE访问位的访问检查 ------------------------ - -物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中 -找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表,而物理地址的实现则 -是查找与该地址有映射关系的每一个页表。通过这种方式,实现者找到并清除下一个采样目标地址的位, -并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统,即空闲页跟 -踪和回收逻辑。为了避免这种干扰,DAMON使其与空闲页面跟踪相互排斥,并使用 ``PG_idle`` 和 -``PG_young`` 页面标志来解决与回收逻辑的冲突,就像空闲页面跟踪那样。 - - -独立于地址空间的核心机制 -======================== - -下面四个部分分别描述了DAMON的核心机制和五个监测属性,即 ``采样间隔`` 、 ``聚集间隔`` 、 -``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。 - - -访问频率监测 ------------- - -DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置 -``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说,DAMON检查每个 ``采样间隔`` 对每 -个页面的访问,并将结果汇总。换句话说,计算每个页面的访问次数。在每个 ``聚合间隔`` 过 -去后,DAMON调用先前由用户注册的回调函数,以便用户可以阅读聚合的结果,然后再清除这些结 -果。这可以用以下简单的伪代码来描述:: - - while monitoring_on: - for page in monitoring_target: - if accessed(page): - nr_accesses[page] += 1 - if time() % aggregation_interval == 0: - for callback in user_registered_callbacks: - callback(monitoring_target, nr_accesses) - for page in monitoring_target: - nr_accesses[page] = 0 - sleep(sampling interval) - -这种机制的监测开销将随着目标工作负载规模的增长而任意增加。 - - -基于区域的抽样调查 ------------------- - -为了避免开销的无限制增加,DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持 -这个假设(一个区域内的页面具有相同的访问频率),该区域内就只需要检查一个页面。因此,对 -于每个 ``采样间隔`` ,DAMON在每个区域中随机挑选一个页面,等待一个 ``采样间隔`` ,检 -查该页面是否同时被访问,如果被访问则增加该区域的访问频率。因此,监测开销是可以通过设置 -区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。 - -然而,如果假设没有得到保证,这个方案就不能保持输出的质量。 - - -适应性区域调整 --------------- - -即使最初的监测目标区域被很好地构建以满足假设(同一区域内的页面具有相似的访问频率),数 -据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设,DAMON根据每个 -区域的访问频率自适应地进行合并和拆分。 - -对于每个 ``聚集区间`` ,它比较相邻区域的访问频率,如果频率差异较小,就合并这些区域。 -然后,在它报告并清除每个区域的聚合接入频率后,如果区域总数不超过用户指定的最大区域数, -它将每个区域拆分为两个或三个区域。 - -通过这种方式,DAMON提供了其最佳的质量和最小的开销,同时保持了用户为其权衡设定的界限。 - - -动态目标空间更新处理 --------------------- - -监测目标地址范围可以动态改变。例如,虚拟内存可以动态地被映射和解映射。物理内存可以被 -热插拔。 - -由于在某些情况下变化可能相当频繁,DAMON允许监控操作检查动态变化,包括内存映射变化, -并仅在用户指定的时间间隔( ``更新间隔`` )中的每个时间段,将其应用于监控操作相关的 -数据结构,如抽象的监控目标内存区。 \ No newline at end of file diff --git a/Documentation/translations/zh_CN/vm/damon/faq.rst b/Documentation/translations/zh_CN/vm/damon/faq.rst deleted file mode 100644 index 07b4ac19407d..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/faq.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/faq.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -常见问题 -======== - -为什么是一个新的子系统,而不是扩展perf或其他用户空间工具? 
-========================================================== - -首先,因为它需要尽可能的轻量级,以便可以在线使用,所以应该避免任何不必要的开销,如内核-用户 -空间的上下文切换成本。第二,DAMON的目标是被包括内核在内的其他程序所使用。因此,对特定工具 -(如perf)的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。 - - -“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗? -============================================== - -闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的,尽管它可以 -使用采样来减少开销。另一方面,DAMON是一个更高层次的框架,用于监控各种地址空间。它专注于内 -存管理优化,并提供复杂的精度/开销处理机制。因此,“空闲页面跟踪” 和 “perf mem” 可以提供 -DAMON输出的一个子集,但不能替代DAMON。 - - -DAMON是否只支持虚拟内存? -========================= - -不,DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始 -部分,包括监测目标区域的构造和实际的访问检查。通过这种方式,DAMON用户可以用任何访问检查技 -术来监测任何地址空间。 - -尽管如此,DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间 -相关功能的实现,以供参考和方便使用。 - - -我可以简单地监测页面的粒度吗? -============================== - -是的,你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。 -因为监视目标区域的大小被强制为 ``>=page size`` ,所以区域分割不会产生任何影响。 diff --git a/Documentation/translations/zh_CN/vm/damon/index.rst b/Documentation/translations/zh_CN/vm/damon/index.rst deleted file mode 100644 index 84d36d90c9b0..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/index.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/index.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -========================== -DAMON:数据访问监视器 -========================== - -DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为 -(该核心机制详见(Documentation/translations/zh_CN/vm/damon/design.rst)) - - - *准确度* (监测输出对DRAM级别的内存管理足够有用;但可能不适合CPU Cache级别), - - *轻量级* (监控开销低到可以在线应用),以及 - - *可扩展* (无论目标工作负载的大小,开销的上限值都在恒定范围内)。 - -因此,利用这个框架,内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实 -验性内存管理优化工作可以再次进行。同时,在用户空间,有一些特殊工作负载的用户可以编写 -个性化的应用程序,以便更好地了解和优化他们的工作负载和系统。 - -.. toctree:: - :maxdepth: 2 - - faq - design - api - diff --git a/Documentation/translations/zh_CN/vm/free_page_reporting.rst b/Documentation/translations/zh_CN/vm/free_page_reporting.rst deleted file mode 100644 index 31d6c34b956b..000000000000 --- a/Documentation/translations/zh_CN/vm/free_page_reporting.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/_free_page_reporting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========== -空闲页报告 -========== - -空闲页报告是一个API,设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟 -化的情况下是很有用的,客户机能够使用这些数据来通知管理器它不再使用内存中的某些页 -面。 - -对于驱动,通常是气球驱动要使用这个功能,它将分配和初始化一个page_reporting_dev_info -结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必 -须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。 -假设没有其他页面报告设备已经注册, 对page_reporting_register的调用将向报告框 -架注册页面报告接口。 - -一旦注册,页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告 -页面,并在任何足够高的页面被释放之后2秒继续报告。 - -报告的页面将被存储在传递给报告函数的散列表中,最后一个条目的结束位被设置在条目 -nent-1中。 当页面被报告函数处理时,分配器将无法访问它们。一旦报告函数完成,这些 -页将被返回到它们所获得的自由区域。 - -在移除使用空闲页报告的驱动之前,有必要调用page_reporting_unregister,以移除 -目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报 -告通过该接口发出。如果另一个驱动或同一驱动被注册,它就有可能恢复前一个驱动在报告 -空闲页方面的工作。 - - -Alexander Duyck, 2019年12月04日 diff --git a/Documentation/translations/zh_CN/vm/frontswap.rst b/Documentation/translations/zh_CN/vm/frontswap.rst deleted file mode 100644 index 3eb07870e2ef..000000000000 --- a/Documentation/translations/zh_CN/vm/frontswap.rst +++ /dev/null @@ -1,196 +0,0 @@ -:Original: Documentation/vm/_free_page_reporting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========= -Frontswap -========= - -Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由 -于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能 -节省(提高)。 - -.. 
_Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 -储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory -(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 -求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 -调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops -的功能,它提供的功能必须符合某些策略,如下所示: - -一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap -交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 -量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 -内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 -transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 -相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 - -一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 -要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 -经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, -也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 -可以像往常一样被写入交换空间。 - -请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” -的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 -不会从frontswap中获得。 - -如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 -debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): - -``failed_stores`` - 有多少次存储的尝试是失败的 - -``loads`` - 尝试了多少次加载(应该全部成功) - -``succ_stores`` - 有多少次存储的尝试是成功的 - -``invalidates`` - 尝试了多少次作废 - -后台实现可以提供额外的指标。 - -经常问到的问题 -============== - -* 价值在哪里? - -当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 -读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 -能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 -移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 -页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 - -Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 -灵活性: - -在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 -全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 -用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 -工作负载上,则有明显的性能改善(25%以上)。 - -“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory -的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 -统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 -交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 -可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 -有多少内存可用 - -在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 -用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 -是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory -后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, -而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 -的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap -允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), -从而减轻计划外交换可能带来的可怕的性能影响。 - -一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 -内存扩展技术的调查也在进行中。 - -* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? - -如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 -个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 -frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 -是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 -后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 -忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 -的,无论如何使用一小部分的 CPU 都是不相关的。 - -至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 -每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 -页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 -会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 -非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 - -当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 -产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 -来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 - -* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? 
- -我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 -明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 -供了多少内存是完全动态和随机的。 - -每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 -型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 -换页的尝试。 - -每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 -frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 -间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap -backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 -页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 -据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 -下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 -页面偏移量,否则它就会将数据写入该设备。 - -当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), -检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 -的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, -以便从真正的交换设备上获得这一页的数据。 - -所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 -入都被 “frontswap backend store” 和(可能)“frontswap backend loads” -所取代,这可能会快得多。 - -* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 - 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? - -首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 -结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 -假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 -及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 - -例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend -的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 -先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 -以根据当前的内存限制动态地定义。 - -此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 -块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 -不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 -态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 -复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 -的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” -线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, -KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 - -在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 -例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 -自己的交换。 - -transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 -能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, -frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 -容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 -swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap -就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 -backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 - - -* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, - 难道它不能总是被成功地覆盖吗? - -几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 -缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 -backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap -拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 -换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 - -* 为什么frontswap补丁会创建新的头文件swapfile.h? - -frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 -在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 -个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 - -Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/vm/highmem.rst b/Documentation/translations/zh_CN/vm/highmem.rst deleted file mode 100644 index 018838e58c3e..000000000000 --- a/Documentation/translations/zh_CN/vm/highmem.rst +++ /dev/null @@ -1,128 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/highmem.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========== -高内存处理 -========== - -作者: Peter Zijlstra - -.. contents:: :local: - -高内存是什么? 
-============== - -当物理内存的大小接近或超过虚拟内存的最大大小时,就会使用高内存(highmem)。在这一点上,内 -核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内 -存的临时映射。 - -没有被永久映射覆盖的那部分(物理)内存就是我们所说的 "高内存"。对于这个边界的确切位置,有 -各种架构上的限制。 - -例如,在i386架构中,我们选择将内核映射到每个进程的虚拟空间,这样我们就不必为内核的进入/退 -出付出全部的TLB作废代价。这意味着可用的虚拟内存空间(i386上为4GiB)必须在用户和内核空间之 -间进行划分。 - -使用这种方法的架构的传统分配方式是3:1,3GiB用于用户空间,顶部的1GiB用于内核空间。:: - - +--------+ 0xffffffff - | Kernel | - +--------+ 0xc0000000 - | | - | User | - | | - +--------+ 0x00000000 - -这意味着内核在任何时候最多可以映射1GiB的物理内存,但是由于我们需要虚拟地址空间来做其他事 -情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少(通常在~896MiB左右)。 - -其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而,一些硬件(如一些ARM)在使 -用mm上下文标签时,其虚拟空间有限。 - - -临时虚拟映射 -============ - -内核包含几种创建临时映射的方法。: - -* vmap(). 这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization - 来解除映射。 - -* kmap(). 这允许对单个页面进行短期映射。它需要synchronization,但在一定程度上被摊销。 - 当以嵌套方式使用时,它也很容易出现死锁,因此不建议在新代码中使用它。 - -* kmap_atomic(). 这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上, - 它表现得很好,但发布任务因此被要求留在该CPU上直到它完成,以免其他任务取代它的映射。 - - kmap_atomic() 也可以由中断上下文使用,因为它不睡眠,而且调用者可能在调用kunmap_atomic() - 之后才睡眠。 - - 可以假设k[un]map_atomic()不会失败。 - - -使用kmap_atomic -=============== - -何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存(见__GFP_HIGHMEM) -分配的页面的内容时,例如在页缓存中的页面,就会使用它。该API有两个函数,它们的使用方式与 -下面类似:: - - /* 找到感兴趣的页面。 */ - struct page *page = find_get_page(mapping, offset); - - /* 获得对该页内容的访问权。 */ - void *vaddr = kmap_atomic(page); - - /* 对该页的内容做一些处理。 */ - memset(vaddr, 0, PAGE_SIZE); - - /* 解除该页面的映射。 */ - kunmap_atomic(vaddr); - -注意,kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。 - -如果你需要映射两个页面,因为你想从一个页面复制到另一个页面,你需要保持kmap_atomic调用严 -格嵌套,如:: - - vaddr1 = kmap_atomic(page1); - vaddr2 = kmap_atomic(page2); - - memcpy(vaddr1, vaddr2, PAGE_SIZE); - - kunmap_atomic(vaddr2); - kunmap_atomic(vaddr1); - - -临时映射的成本 -============== - -创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。 - -如果CONFIG_HIGHMEM没有被设置,那么内核会尝试用一点计算来创建映射,将页面结构地址转换成 -指向页面内容的指针,而不是去捣鼓映射。在这种情况下,解映射操作可能是一个空操作。 - -如果CONFIG_MMU没有被设置,那么就不可能有临时映射和高内存。在这种情况下,也将使用计算方法。 - - -i386 PAE -======== - -在某些情况下,i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果: - -* Linux需要为系统中的每个页面建立一个页帧结构,而且页帧需要驻在永久映射中,这意味着: - -* 你最多可以有896M/sizeof(struct page)页帧;由于页结构体是32字节的,所以最终会有 - 112G的页;然而,内核需要在内存中存储更多的页帧...... - -* PAE使你的页表变大--这使系统变慢,因为更多的数据需要在TLB填充等方面被访问。一个好处 - 是,PAE有更多的PTE位,可以提供像NX和PAT这样的高级功能。 - -一般的建议是,你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作 -量有用,但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。 diff --git a/Documentation/translations/zh_CN/vm/hmm.rst b/Documentation/translations/zh_CN/vm/hmm.rst deleted file mode 100644 index 2379df95aa58..000000000000 --- a/Documentation/translations/zh_CN/vm/hmm.rst +++ /dev/null @@ -1,361 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/hmm.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -================== -异构内存管理 (HMM) -================== - -提供基础设施和帮助程序以将非常规内存(设备内存,如板上 GPU 内存)集成到常规内核路径中,其 -基石是此类内存的专用struct page(请参阅本文档的第 5 至 7 节)。 - -HMM 还为 SVM(共享虚拟内存)提供了可选的帮助程序,即允许设备透明地访问与 CPU 一致的程序 -地址,这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得 -必不可少,其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。 - -本文档分为以下部分:在第一部分中,我揭示了与使用特定于设备的内存分配器相关的问题。在第二 -部分中,我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页 -表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后, -最后一节介绍了一个新的迁移助手,它允许利用设备 DMA 引擎。 - -.. 
contents:: :local: - -使用特定于设备的内存分配器的问题 -================================ - -具有大量板载内存(几 GB)的设备(如 GPU)历来通过专用驱动程序特定 API 管理其内存。这会 -造成设备驱动程序分配和管理的内存与常规应用程序内存(私有匿名、共享内存或常规文件支持内存) -之间的隔断。从这里开始,我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况: -即,设备可以透明地使用任何应用程序内存区域。 - -分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来 -看,程序中的所有内存对象并不平等,这使得依赖于广泛的库的大型程序变得复杂。 - -具体来说,这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存(malloc、mmap -私有、mmap 共享)和通过设备驱动程序 API 分配的内存之间复制对象(这仍然以 mmap 结束, -但是是设备文件)。 - -对于平面数据集(数组、网格、图像……),这并不难实现,但对于复杂数据集(列表、树……), -很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错, -而且由于数据集和地址的重复,程序更难调试。 - -分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据,因此每个库可能 -不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响,并因为各种内存 -拷贝而浪费资源。 - -复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出,并不是一个可行的选择。 -这将导致库入口点的组合爆炸。 - -最后,随着高级语言结构(在 C++ 中,当然也在其他语言中)的进步,编译器现在有可能在没有程 -序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有 -其他模式,使用共享地址空间也更合理。 - - -I/O 总线、设备内存特性 -====================== - -由于一些限制,I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本 -内存访问;甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下,它 -不是缓存一致的。 - -如果我们只考虑 PCIE 总线,那么设备可以访问主内存(通常通过 IOMMU)并与 CPU 缓存一 -致。但是,它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟:CPU -只能访问有限范围的设备内存,而不能对其执行原子操作。因此,从内核的角度来看,设备内存不 -能被视为与常规内存等同。 - -另一个严重的因素是带宽有限(约 32GBytes/s,PCIE 4.0 和 16 通道)。这比最快的 GPU -内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自 -己的内存时高一个数量级。 - -一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制 -(OpenCAPI、CCIX)。它们主要允许 CPU 和设备之间的双向缓存一致性,并允许架构支持的所 -有原子操作。遗憾的是,并非所有平台都遵循这一趋势,并且一些主要架构没有针对这些问题的硬 -件解决方案。 - -因此,为了使共享地址空间有意义,我们不仅必须允许设备访问任何内存,而且还必须允许任何内 -存在设备使用时迁移到设备内存(在迁移时阻止 CPU 访问)。 - - -共享地址空间和迁移 -================== - -HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间,因此对 -于进程地址空间中的任何有效主内存地址,相同的地址指向相同的物理内存。 - -为了实现这一点,HMM 提供了一组帮助程序来填充设备页表,同时跟踪 CPU 页表更新。设备页表 -更新不像 CPU 页表更新那么容易。要更新设备页表,您必须分配一个缓冲区(或使用预先分配的 -缓冲区池)并在其中写入 GPU 特定命令以执行更新(取消映射、缓存失效和刷新等)。这不能通 -过所有设备的通用代码来完成。因此,为什么HMM提供了帮助器,在把硬件的具体细节留给设备驱 -动程序的同时,把一切可以考虑的因素都考虑进去了。 - -HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存,它允许为设备内存的每个页面分配一个 -struct page。这些页面很特殊,因为 CPU 无法映射它们。然而,它们允许使用现有的迁移机 -制将主内存迁移到设备内存,从 CPU 的角度来看,一切看起来都像是换出到磁盘的页面。使用 -struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次,HMM 仅提供帮助程序, -首先为设备内存热插拔新的 ZONE_DEVICE 内存,然后执行迁移。迁移内容和时间的策略决定留 -给设备驱动程序。 - -请注意,任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如,当支持给定CPU -地址 A 的页面从主内存页面迁移到设备页面时,对地址 A 的任何 CPU 访问都会触发缺页异常 -并启动向主内存的迁移。 - -凭借这两个特性,HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步,而且还通 -过迁移设备正在使用的数据集部分来利用设备内存。 - - -地址空间镜像实现和API -===================== - -地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中;HMM 有助于 -保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier -开始:: - - int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, - struct mm_struct *mm, unsigned long start, - unsigned long length, - const struct mmu_interval_notifier_ops *ops); - -在 ops->invalidate() 回调期间,设备驱动程序必须对范围执行更新操作(将范围标记为只 -读,或完全取消映射等)。设备必须在驱动程序回调返回之前完成更新。 - -当设备驱动程序想要填充一个虚拟地址范围时,它可以使用:: - - int hmm_range_fault(struct hmm_range *range); - -如果请求写访问,它将在丢失或只读条目上触发缺页异常(见下文)。缺页异常使用通用的 mm 缺 -页异常代码路径,就像 CPU 缺页异常一样。 - -这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟 -范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。 - -在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面,以保 -持事物正确同步。使用模式是:: - - int driver_populate_range(...) - { - struct hmm_range range; - ... 
- - range.notifier = &interval_sub; - range.start = ...; - range.end = ...; - range.hmm_pfns = ...; - - if (!mmget_not_zero(interval_sub->notifier.mm)) - return -EFAULT; - - again: - range.notifier_seq = mmu_interval_read_begin(&interval_sub); - mmap_read_lock(mm); - ret = hmm_range_fault(&range); - if (ret) { - mmap_read_unlock(mm); - if (ret == -EBUSY) - goto again; - return ret; - } - mmap_read_unlock(mm); - - take_lock(driver->update); - if (mmu_interval_read_retry(&ni, range.notifier_seq) { - release_lock(driver->update); - goto again; - } - - /* Use pfns array content to update device page table, - * under the update lock */ - - release_lock(driver->update); - return 0; - } - -driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用 -mmu_interval_read_retry() 之前保持,以避免与并发 CPU 页表更新发生任何竞争。 - -利用 default_flags 和 pfn_flags_mask -==================================== - -hmm_range 结构有 2 个字段,default_flags 和 pfn_flags_mask,它们指定整个范围 -的故障或快照策略,而不必为 pfns 数组中的每个条目设置它们。 - -例如,如果设备驱动程序需要至少具有读取权限的范围的页面,它会设置:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = 0; - -并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。 - -现在假设驱动程序想要做同样的事情,除了它想要拥有写权限的范围内的一页。现在驱动程序设 -置:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = HMM_PFN_REQ_WRITE; - range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; - -有了这个,HMM 将在至少读取(即有效)的所有页面中异常,并且对于地址 -== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限,即,如果 -CPU pte 没有设置写权限,那么HMM将调用handle_mm_fault()。 - -hmm_range_fault 完成后,标志位被设置为页表的当前状态,即 HMM_PFN_VALID | 如果页 -面可写,将设置 HMM_PFN_WRITE。 - - -从核心内核的角度表示和管理设备内存 -================================== - -尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存 -的信息,HMM 将自身挂接到 mm 代码的各个位置,以处理对设备内存支持的地址的任何访问。 -事实证明,这最终复制了 struct page 的大部分字段,并且还需要更新许多内核代码路径才 -能理解这种新的内存类型。 - -大多数内核代码路径从不尝试访问页面后面的内存,而只关心struct page的内容。正因为如此, -HMM 切换到直接使用 struct page 用于设备内存,这使得大多数内核代码路径不知道差异。 -我们只需要确保没有人试图从 CPU 端映射这些页面。 - -移入和移出设备内存 -================== - -由于 CPU 无法直接访问设备内存,因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存 -储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和 -migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。 - -在将页面迁移到设备私有内存之前,需要创建特殊的设备私有 ``struct page`` 。这些将用 -作特殊的“交换”页表条目,以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。 - -这些可以通过以下方式分配和释放:: - - struct resource *res; - struct dev_pagemap pagemap; - - res = request_free_mem_region(&iomem_resource, /* number of bytes */, - "name of driver resource"); - pagemap.type = MEMORY_DEVICE_PRIVATE; - pagemap.range.start = res->start; - pagemap.range.end = res->end; - pagemap.nr_range = 1; - pagemap.ops = &device_devmem_ops; - memremap_pages(&pagemap, numa_node_id()); - - memunmap_pages(&pagemap); - release_mem_region(pagemap.range.start, range_len(&pagemap.range)); - -还有devm_request_free_mem_region(), devm_memremap_pages(), -devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``. - -整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration `) , -但这些步骤分为设备驱动程序特定代码和共享公共代码: - -1. ``mmap_read_lock()`` - - 设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup(), - 因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。 - -2. 
``migrate_vma_setup(struct migrate_vma *args)`` - - 设备驱动初始化了 ``struct migrate_vma`` 的字段,并将该指针传递给 - migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。 - 例如,设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存,设置 - ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页 - 面。如果后者被设置, ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备 - 私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前,只有匿名的私有VMA - 范围可以被迁移到系统内存和设备私有内存。 - - migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()`` - 和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表,使 - 其他设备的MMU无效,以便在 ``args->src`` 数组中填写要迁移的PFN。 - ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` , - 其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE, ``owner`` 字段设置为传递给 - migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无 - 效化回调,只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。 - - - 在遍历页表时,一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效 - 的 “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清 - 除它,而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被 - ``lock_page()``锁定,与LRU隔离(如果系统内存和设备私有页不在LRU上),从进 - 程中取消映射,并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup() - 还清除了 ``args->dst`` 数组。 - -3. 设备驱动程序分配目标页面并将源页面复制到目标页面。 - - 驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已 - 设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选 - 择跳过页面迁移。 - - 然后,驱动程序分配一个设备私有 struct page 或一个系统内存页,用 ``lock_page()`` - 锁定该页,并将 ``dst`` 数组条目填入:: - - dst[i] = migrate_pfn(page_to_pfn(dpage)); - - 现在驱动程序知道这个页面正在被迁移,它可以使设备私有 MMU 映射无效并将设备私有 - 内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失 - 效,因此设备驱动程序只需使其自己的 MMU 映射失效。 - - 驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的 - ``struct page`` 面,并将源页面复制到目标设备上,如果指针为 ``NULL`` ,意 - 味着源页面没有被填充到系统内存中,则清除目标设备的私有内存。 - -4. ``migrate_vma_pages()`` - - 这一步是实际“提交”迁移的地方。 - - 如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页,这时新分配的页会被插 - 入到CPU的页表中。如果一个CPU线程在同一页面上发生异常,这可能会失败。然而,页 - 表被锁定,只有一个新页会被插入。如果它失去了竞争,设备驱动将看到 - ``MIGRATE_PFN_MIGRATE`` 位被清除。 - - 如果源页被锁定、隔离等,源 ``struct page`` 信息现在被复制到目标 - ``struct page`` ,最终完成CPU端的迁移。 - -5. 设备驱动为仍在迁移的页面更新设备MMU页表,回滚未迁移的页面。 - - 如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置,设备驱动可以 - 更新设备MMU,如果 ``MIGRATE_PFN_WRITE`` 位被设置,则设置写启用位。 - -6. ``migrate_vma_finalize()`` - - 这一步用新页的页表项替换特殊的迁移页表项,并释放对源和目的 ``struct page`` - 的引用。 - -7. ``mmap_read_unlock()`` - - 现在可以释放锁了。 - -独占访问存储器 -============== - -一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一 -个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU -的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一 -个内存范围不能从用户空间访问。 - -这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会 -导致一个异常,该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已 -经被MMU通知器改变,之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序 -放弃页面锁和页面引用为止,这时页面上的任何CPU异常都可以按所述进行。 - -内存 cgroup (memcg) 和 rss 统计 -=============================== - -目前,设备内存被视为 rss 计数器中的任何常规页面(如果设备页面用于匿名,则为匿名, -如果设备页面用于文件支持页面,则为文件,如果设备页面用于共享内存,则为 shmem)。 -这是为了保持现有应用程序的故意选择,这些应用程序可能在不知情的情况下开始使用设备 -内存,运行不受影响。 - -一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序, -因此不会释放太多系统内存。在决定以不同方式计算设备内存之前,我们希望收集更多关 -于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。 - -对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算,常规 -页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规 -内存不会失败,因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对 -内存资源控制的影响有了更多的了解,我们可能会在后面重新考虑这个选择。 - -请注意,设备内存永远不能由设备驱动程序或通过 GUP 固定,因此此类内存在进程退出时 -总是被释放的。或者在共享内存或文件支持内存的情况下,当删除最后一个引用时。 diff --git a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst deleted file mode 100644 index c6d471ce2131..000000000000 --- a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst +++ /dev/null @@ -1,436 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/hugetlbfs_reserv.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -============== -Hugetlbfs 预留 -============== - -概述 -==== - -:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指 -示要使用巨页,这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常 -时没有巨页存在,任务就会被发送一个SIGBUS,并经常不高兴地死去。在加入巨页支 -持后不久,人们决定,在mmap()时检测巨页的短缺情况会更好。这个想法是,如果 -没有足够的巨页来覆盖映射,mmap()将失败。这首先是在mmap()时在代码中做一个 -简单的检查,以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一 -样,代码随着时间的推移而不断发展。然而,基本的想法是在mmap()时 “预留” -巨页,以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核 -中是如何进行巨页预留处理的。 - - -读者 -==== -这个描述主要是针对正在修改hugetlbfs代码的内核开发者。 - - -数据结构 -======== - -resv_huge_pages - 这是一个全局的(per-hstate)预留的巨页的计数。预留的巨页只对预留它们的任 - 务可用。因此,一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。 -Reserve Map - 预留映射由以下结构体描述:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - 系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的 - 区域。一个区域被描述为:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型,在 - reserv_map 中的一个区域可能表示该范围存在预留,或预留不存在。 -Flags for MAP_PRIVATE Reservations - 这些被存储在预留的映射指针的底部。 - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - 表示该任务是与该映射相关的预留的所有者。 - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - 表示最初映射此范围(并创建储备)的任务由于COW失败而从该任务(子任务)中取消映 - 射了一个页面。 -Page Flags - PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在 - “释放巨页” 一节中讨论。 - - -预留映射位置(私有或共享) -========================== - -一个巨页映射或段要么是私有的,要么是共享的。如果是私有的,它通常只对一个地址空间 -(任务)可用。如果是共享的,它可以被映射到多个地址空间(任务)。对于这两种类型的映射, -预留映射的位置和语义是明显不同的。位置的差异是: - -- 对于私有映射,预留映射挂在VMA结构体上。具体来说,就是vma->vm_private_data。这个保 - 留映射是在创建映射(mmap(MAP_PRIVATE))时创建的。 -- 对于共享映射,预留映射挂在inode上。具体来说,就是inode->i_mapping->private_data。 - 由于共享映射总是由hugetlbfs文件系统中的文件支持,hugetlbfs代码确保每个节点包含一个预 - 留映射。因此,预留映射在创建节点时被分配。 - - -创建预留 -======== -当创建一个巨大的有页面支持的共享内存段(shmget(SHM_HUGETLB))或通过mmap(MAP_HUGETLB) -创建一个映射时,就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用:: - - int hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) - -hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE -标志。如果指定了NORESERVE,那么这个函数立即返回,因为不需要预留。 - -参数'from'和'to'是映射或基础文件的巨页索引。对于shmget(),'from'总是0,'to'对应于段/映射 -的长度。对于mmap(),offset参数可以用来指定进入底层文件的偏移量。在这种情况下,'from'和'to' -参数已经被这个偏移量所调整。 - -PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。 - -- 对于共享映射,预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时,预留映射不被 - 修改。 -- 对于私有映射,预留映射中没有条目表示相应页面存在预留。随着预留被消耗,条目被添加到预留映射中。 - 因此,预留映射也可用于确定哪些预留已被消耗。 - -对于私有映射,hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外, -HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 - -预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射,这始终是一个值(to - from)。 -然而,对于共享映射来说,一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节, -请参见 :ref:`预留映射的修改 ` 一节。 - -该映射可能与一个子池(subpool)相关联。如果是这样,将查询子池以确保有足够的空间用于映射。子池 -有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 ` -一节。 - -在咨询了预留映射和子池之后,就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查 -并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而,在这 -些函数中,代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话,全局预留计数resv_huge_pages -会被调整,如下所示:: - - if (resv_needed <= (resv_huge_pages - free_huge_pages)) - resv_huge_pages += resv_needed; - -注意,在检查和调整这些计数器时,全局锁hugetlb_lock会被预留。 - -如果有足够的空闲的巨页,并且全局计数resv_huge_pages被调整,那么与映射相关的预留映射被修改以 -反映预留。在共享映射的情况下,将存在一个file_region,包括'from'-'to'范围。对于私有映射, -不对预留映射进行修改,因为没有条目表示存在预留。 - -如果hugetlb_reserve_pages()成功,全局预留数和与映射相关的预留映射将根据需要被修改,以确保 -在'from'-'to'范围内存在预留。 - -消耗预留/分配一个巨页 -=========================== - -当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() -中进行的:: - - struct 
page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) - -alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 -此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 -预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 -外拷贝被分配。 - - -调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详 -细内容,请参见 :ref:`预留映射帮助函数 ` 一节。从 -vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留,则为0,如果不存在预留,则为1。 -如果不存在预留,并且有一个与映射相关联的子池,则查询子池以确定它是否包含预留。如果子池包含预留, -则可将其中一个用于该分配。然而,在任何情况下,avoid_reserve参数都会优先考虑为分配使用预留。在 -确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 -的参数: - -- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 -- chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, - 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 - 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 - -与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面,当该页面从空闲列表中移除时, -free_huge_pages的值被递减。如果有一个与该页相关的预留,将进行以下调整:: - - SetPagePrivate(page); /* 表示分配这个页面消耗了一个预留, - * 如果遇到错误,以至于必须释放这个页面,预留将被 - * 恢复。 */ - resv_huge_pages--; /* 减少全局预留计数 */ - -注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 -的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: -SetPagePrivate(page) 和 resv_huge_pages--. - -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 -面被释放时,这将被用于子池的计数。 - -然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 -到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 -已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建一 -个新的条目。 - -注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 -的剩余巨页和过度分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整。 -SetPagePrivate(page)和resv_huge_pages-。 - -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 -面被释放时,这将被用于子池的计数。 - -然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 -到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 -已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 -一个新的条目。 - -在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 -vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 -享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 -这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 -识别。如果检测到这种竞争,子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息,请 -参见 :ref:`预留映射帮助函数 ` 一节。 - - -实例化巨页 -========== - -在巨页分配之后,页面通常被添加到分配任务的页表中。在此之前,共享映射中的页面被添加到页面缓 -存中,私有映射中的页面被添加到匿名反向映射中。在这两种情况下,PagePrivate标志被清除。因此, -当一个已经实例化的巨页被释放时,不会对全局预留计数(resv_huge_pages)进行调整。 - - -释放巨页 -======== - -巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 -递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保 -留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。 - -page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置,它表明全局预留计数 -应该被调整(关于如何设置这些标志的信息,请参见 -:ref: `消耗预留/分配一个巨页 ` )。 - - -该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值(不等于 -传递的1的值),它表明预留与子池相关联,这个新释放的页面必须被用来保持子池预留的数量超过最小值。 -因此,在这种情况下,全局resv_huge_pages计数器被递增。 - -如果页面中设置了PagePrivate标志,那么全局resv_huge_pages计数器将永远被递增。 - -子池预留 -======== - -有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一 -个hstate中的页面子集,它与一个已挂载的hugetlbfs文件系统相关 - -当一个hugetlbfs文件系统被挂载时,可以指定min_size选项,它表示文件系统所需的最小的巨页数量。 -如果指定了这个选项,与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体 -hugepage_subpool的min_hpages字段中被跟踪。在挂载时,hugetlb_acct_memory(min_hpages) -被调用以预留指定数量的巨页。如果它们不能被预留,挂载就会失败。 - -当从子池中获取或释放页面时,会调用hugepage_subpool_get/put_pages()函数。 -hugepage_subpool_get/put_pages被传递给巨页数量,以此来调整子池的 “已用页面” 计数 -(get为下降,put为上升)。通常情况下,如果子池中没有足够的页面,它们会返回与传递的相同的值或 -一个错误。 - -然而,如果预留与子池相关联,可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局 -池调整的数量。例如,假设一个子池包含3个预留的巨页,有人要求5个。与子池相关的3个预留页可以用来 -满足部分请求。但是,必须从全局池中获得2个页面。为了向调用者转达这一信息,将返回值2。然后,调用 -者要负责从全局池中获取另外两个页面。 - - -COW和预留 -========== - -由于共享映射都指向并使用相同的底层页面,COW最大的预留问题是私有映射。在这种情况下,两个任务可 -以指向同一个先前分配的页面。一个任务试图写到该页,所以必须分配一个新的页,以便每个任务都指向它 -自己的页。 - 
-当该页最初被分配时,该页的预留被消耗了。当由于COW而试图分配一个新的页面时,有可能没有空闲的巨 -页,分配会失败。 - -当最初创建私有映射时,通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。 -由于所有者创建了映射,所有者拥有与映射相关的所有预留。因此,当一个写异常发生并且没有可用的页面 -时,对预留的所有者和非所有者采取不同的行动。 - -在发生异常的任务不是所有者的情况下,异常将失败,该任务通常会收到一个SIGBUS。 - -如果所有者是发生异常的任务,我们希望它能够成功,因为它拥有原始的预留。为了达到这个目的,该页被 -从非所有者任务中解映射出来。这样一来,唯一的引用就是来自拥有者的任务。此外,HPAGE_RESV_UNMAPPED -位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常,它可能 -会收到一个SIGBUS。但是,映射/预留的原始拥有者的行为将与预期一致。 - -预留映射的修改 -============== - -以下低级函数用于对预留映射进行修改。通常情况下,这些函数不会被直接调用。而是调用一个预留映射辅 -助函数,该函数调用这些低级函数中的一个。这些低级函数在源代码(mm/hugetlb.c)中得到了相当好的 -记录。这些函数是:: - - long region_chg(struct resv_map *resv, long f, long t); - long region_add(struct resv_map *resv, long f, long t); - void region_abort(struct resv_map *resv, long f, long t); - long region_count(struct resv_map *resv, long f, long t); - -在预留映射上的操作通常涉及两个操作: - -1) region_chg()被调用来检查预留映射,并确定在指定的范围[f, t]内有多少页目前没有被代表。 - - 调用代码执行全局检查和分配,以确定是否有足够的巨页使操作成功。 - -2) - a) 如果操作能够成功,regi_add()将被调用,以实际修改先前传递给regi_chg()的相同范围 - [f, t]的预留映射。 - b) 如果操作不能成功,region_abort被调用,在相同的范围[f, t]内中止操作。 - -注意,这是一个两步的过程, region_add()和 region_abort()在事先调用 region_chg()后保证 -成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作(特别是 region_add())的 -成功。 - -如上所述,region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加 -到映射中的范围内的页数。在大多数情况下, region_add() 的返回值与 region_chg() 的返回值相 -同。然而,在共享映射的情况下,有可能在调用 region_chg() 和 region_add() 之间对预留映射进 -行更改。在这种情况下,regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下,全局计数 -和子池计数很可能是不正确的,需要调整。检查这种情况并进行适当的调整是调用者的责任。 - -函数region_del()被调用以从预留映射中移除区域。 -它通常在以下情况下被调用: - -- 当hugetlbfs文件系统中的一个文件被删除时,该节点将被释放,预留映射也被释放。在释放预留映射 - 之前,所有单独的file_region结构体必须被释放。在这种情况下,region_del的范围是[0, LONG_MAX]。 -- 当一个hugetlbfs文件正在被截断时。在这种情况下,所有在新文件大小之后分配的页面必须被释放。 - 此外,预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下,region_del - 的范围是[new_end_of_file, LONG_MAX]。 -- 当在一个hugetlbfs文件中打洞时。在这种情况下,巨页被一次次从文件的中间移除。当这些页被移除 - 时,region_del()被调用以从预留映射中移除相应的条目。在这种情况下,region_del被传递的范 - 围是[page_idx, page_idx + 1]。 - -在任何情况下,region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下,region_del() -会失败。这只能发生在打洞的情况下,即它必须分割一个现有的file_region条目,而不能分配一个新的 -结构体。在这种错误情况下,region_del()将返回-ENOMEM。这里的问题是,预留映射将显示对该页有 -预留。然而,子池和全局预留计数将不反映该预留。为了处理这种情况,调用函数hugetlb_fix_reserve_counts() -来调整计数器,使其与不能被删除的预留映射条目相对应。 - -region_count()在解除私有巨页映射时被调用。在私有映射中,预留映射中没有条目表明存在一个预留。 -因此,通过计算预留映射中的条目数,我们知道有多少预留被消耗了,有多少预留是未完成的 -(Outstanding = (end - start) - region_count(resv, start, end))。由于映射正在消 -失,子池和全局预留计数被未完成的预留数量所减去。 - -预留映射帮助函数 -================ - -有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣,所以它们只是传入一个 -地址而不是一个范围。此外,它们还传入相关的VMA。从VMA中,可以确定映射的类型(私有或共享)和预留 -映射的位置(inode或VMA)。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而, -它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义,并向调用者隐藏了这个细节:: - - long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -该函数为指定的页面调用 region_chg()。如果不存在预留,则返回1。如果存在预留,则返回0:: - - long vma_commit_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这将调用 region_add(),用于指定的页面。与region_chg和region_add的情况一样,该函数应在 -先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加,它将 -返回1,如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出 -现意外的差异,说明在两次调用之间修改了预留映射:: - - void vma_end_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样,该函数应在 -先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作:: - - long vma_add_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这是一个特殊的包装函数,有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数 -中调用。该函数与vma_needs_reservation一起使用,试图将一个预留添加到预留映射中。它考虑到 
-了私有和共享映射的不同预留映射语义。因此,region_add被调用用于共享映射(因为映射中的条目表 -示预留),而region_del被调用用于私有映射(因为映射中没有条目表示预留)。关于在错误路径上需 -要做什么的更多信息,请参见 “错误路径中的预留清理” 。 - - -错误路径中的预留清理 -==================== - -正如在:ref:`预留映射帮助函数` 一节中提到的,预留的修改分两步进行。首 -先,在分配页面之前调用vma_needs_reservation。如果分配成功,则调用vma_commit_reservation。 -如果不是,则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整, -一切都很好。 - -此外,在一个巨页被实例化后,PagePrivate标志被清空,这样,当页面最终被释放时,计数是 -正确的。 - -然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, -页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 -(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 -显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, -并阻止分配一个预先分配的页面。 - -函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的 -是将预留映射恢复到页面分配前的状态。通过这种方式,预留映射的状态将与页面释放后的全局预留计 -数相对应。 - -函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下, -它将简单地清除该页的PagePrivate标志。这样一来,当页面被释放时,全局预留计数将不会被递增。 -然而,预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址,但它不会像最 -初设想的那样使用一个预留页。 - -有一些代码(最明显的是userfaultfd)不能调用restore_reserve_on_error。在这种情况下, -它简单地修改了PagePrivate,以便在释放巨页时不会泄露预留。 - - -预留和内存策略 -============== -当git第一次被用来管理Linux代码时,每个节点的巨页列表就存在于hstate结构中。预留的概念是 -在一段时间后加入的。当预留被添加时,没有尝试将内存策略考虑在内。虽然cpusets与内存策略不 -完全相同,但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作 -用:: - - - /* - * 当cpuset被配置时,它打破了严格的hugetlb页面预留,因为计数是在一个全局变量上完 - * 成的。在有cpuset的情况下,这样的预留完全是垃圾,因为预留没有根据当前cpuset的 - * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时,应用程序仍然有可能 - * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的(或者说太难看了),因 - * 为cpuset太不稳定了,任务或内存节点可以在cpuset之间动态移动。与cpuset共享 - * hugetlb映射的语义变化是不可取的。然而,为了预留一些语义,我们退回到检查当前空闲 - * 页的可用性,作为一种最好的尝试,希望能将cpuset改变语义的影响降到最低。 - */ - -添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败(OOM)。然而,如果一个应用 -程序使用cpusets或内存策略,就不能保证在所需的节点上有巨页可用。即使有足够数量的全局 -预留,也是如此。 - -Hugetlbfs回归测试 -================= - -最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码,请使用 -libhugetlbfs测试套件来检查回归情况。此外,如果你添加了任何新的hugetlb功能,请在 -libhugetlbfs中添加适当的测试。 - --- -Mike Kravetz,2017年4月7日 diff --git a/Documentation/translations/zh_CN/vm/hwpoison.rst b/Documentation/translations/zh_CN/vm/hwpoison.rst deleted file mode 100644 index c6e1e7bdb05b..000000000000 --- a/Documentation/translations/zh_CN/vm/hwpoison.rst +++ /dev/null @@ -1,166 +0,0 @@ - -:Original: Documentation/vm/hwpoison.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -hwpoison -======== - -什么是hwpoison? -=============== - - -即将推出的英特尔CPU支持从一些内存错误中恢复( ``MCA恢复`` )。这需要操作系统宣布 -一个页面"poisoned",杀死与之相关的进程,并避免在未来使用它。 - -这个补丁包在虚拟机中实现了必要的(编程)框架。 - -引用概述中的评论:: - - 高级机器的检查与处理。处理方法是损坏的页面被硬件报告,通常是由于2位ECC内 - 存或高速缓存故障。 - - 这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时,当前运行的进程 - 可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理,就可 - 以安全地忽略它. 
而不是用另外一个机器检查去处理它。 - - 处理不同状态的页面缓存页。这里棘手的部分是,相对于其他虚拟内存用户, 我们可以异 - 步访问任何页面。因为内存故障可能随时随地发生,可能违反了他们的一些假设。这就是 - 为什么这段代码必须非常小心。一般来说,它试图使用正常的锁规则,如获得标准锁,即使 - 这意味着错误处理可能需要很长的时间。 - - 这里的一些操作有点低效,并且具有非线性的算法复杂性,因为数据结构没有针对这种情 - 况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的,所 - 以我们希望我们可以摆脱这种情况。 - -该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的 -各种检查组成,用来处理poison的页面。 - -现在主要目标是KVM客户机,但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm -版本。 - -对于KVM的使用,需要一个新的信号类型,这样KVM就可以用适当的地址将机器检查注入到客户 -机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是,所有的应用程序都不要这 -样做,但一些非常专业的应用程序可能会这样做。 - -故障恢复模式 -============ - -有两种(实际上是三种)模式的内存故障恢复可以在。 - -vm.memory_failure_recovery sysctl 置零: - 所有的内存故障都会导致panic。请不要尝试恢复。 - -早期处理 - (可以在全局和每个进程中控制) 一旦检测到错误,立即向应用程序发送SIGBUS这允许 - 应用程序以温和的方式处理内存错误(例如,放弃受影响的对象) 这是KVM qemu使用的 - 模式。 - -推迟处理 - 当应用程序运行到损坏的页面时,发送SIGBUS。这对不知道内存错误的应用程序来说是 - 最好的,默认情况下注意一些页面总是被当作late kill处理。 - -用户控制 -======== - -vm.memory_failure_recovery - 参阅 sysctl.txt - -vm.memory_failure_early_kill - 全局启用early kill - -PR_MCE_KILL - 设置early/late kill mode/revert 到系统默认值。 - - arg1: PR_MCE_KILL_CLEAR: - 恢复到系统默认值 - arg1: PR_MCE_KILL_SET: - arg2定义了线程特定模式 - - PR_MCE_KILL_EARLY: - Early kill - PR_MCE_KILL_LATE: - Late kill - PR_MCE_KILL_DEFAULT - 使用系统全局默认值 - - 注意,如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO),你应该在 - 指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则,SIGBUS将被发送到主线程。 - -PR_MCE_KILL_GET - 返回当前模式 - -测试 -==== - -* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面 - -* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块 - - corrupt-pfn - 在PFN处注入hwpoison故障,并echoed到这个文件。这做了一些早期过滤,以避 - 免在测试套件中损坏非预期页面。 - unpoison-pfn - 在PFN的Software-unpoison页面对应到这个文件。这样,一个页面可以再次被 - 复用。这只对Linux注入的故障起作用,对真正的内存故障不起作用。 - - 注意这些注入接口并不稳定,可能会在不同的内核版本中发生变化 - - corrupt-filter-dev-major, corrupt-filter-dev-minor - 只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通 - 配符值。这应该只用于人工注入的测试。 - - corrupt-filter-memcg - 限制注入到memgroup拥有的页面。由memcg的inode号指定。 - - Example:: - - mkdir /sys/fs/cgroup/mem/hwpoison - - usemem -m 100 -s 1000 & - echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks - - memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') - echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg - - page-types -p `pidof init` --hwpoison # shall do nothing - page-types -p `pidof usemem` --hwpoison # poison its pages - - corrupt-filter-flags-mask, corrupt-filter-flags-value - 当指定时,只有在((page_flags & mask) == value)的情况下才会poison页面。 - 这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相 - 同。这些标志位在include/linux/kernel-page-flags.h中定义,并在 - Documentation/admin-guide/mm/pagemap.rst中记录。 - -* 架构特定的MCE注入器 - - x86 有 mce-inject, mce-test - - 在mce-test中的一些便携式hwpoison测试程序,见下文。 - -引用 -==== - -http://halobates.de/mce-lc09-2.pdf - 09年LinuxCon的概述演讲 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git - 测试套件(在tsrc中的hwpoison特定可移植测试)。 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git - x86特定的注入器 - - -限制 -==== -- 不是所有的页面类型都被支持,而且永远不会。大多数内核内部对象不能被恢 - 复,目前只有LRU页。 - ---- -Andi Kleen, 2009年10月 diff --git a/Documentation/translations/zh_CN/vm/index.rst b/Documentation/translations/zh_CN/vm/index.rst deleted file mode 100644 index a1c6d529b6ff..000000000000 --- a/Documentation/translations/zh_CN/vm/index.rst +++ /dev/null @@ -1,54 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/index.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -================= -Linux内存管理文档 -================= - -这是一个关于Linux内存管理(mm)子系统内部的文档集,其中有不同层次的细节,包括注释 -和邮件列表的回复,用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建 -议,请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。 -对于控制和调整指南,请参阅(Documentation/admin-guide/mm/index)。 -TODO:待引用文档集被翻译完毕后请及时修改此处) - -.. toctree:: - :maxdepth: 1 - - active_mm - balance - damon/index - free_page_reporting - highmem - ksm - frontswap - hmm - hwpoison - hugetlbfs_reserv - memory-model - mmu_notifier - numa - overcommit-accounting - page_frags - page_owner - page_table_check - remap_file_pages - split_page_table_lock - z3fold - zsmalloc - -TODOLIST: -* arch_pgtable_helpers -* free_page_reporting -* hugetlbfs_reserv -* page_migration -* slub -* transhuge -* unevictable-lru -* vmalloced-kernel-stacks diff --git a/Documentation/translations/zh_CN/vm/ksm.rst b/Documentation/translations/zh_CN/vm/ksm.rst deleted file mode 100644 index 83b0c73984da..000000000000 --- a/Documentation/translations/zh_CN/vm/ksm.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/ksm.rst - -:翻译: - - 徐鑫 xu xin - -============ -内核同页合并 -============ - -KSM 是一种节省内存的数据去重功能,由CONFIG_KSM=y启用,并在2.6.32版本时被添加 -到Linux内核。详见 ``mm/ksm.c`` 的实现,以及http://lwn.net/Articles/306704和 -https://lwn.net/Articles/330589 - -KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst -文档中有描述。 - -设计 -==== - -概述 ----- - -概述内容请见mm/ksm.c文档中的“DOC: Overview” - -逆映射 ------- -KSM维护着稳定树中的KSM页的逆映射信息。 - -当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时,则代表了 -KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时,这个KSM页 -的 ``page->mapping`` 指向了该稳定树节点。 - -如果共享数超过了阈值,KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多 -个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息,其中 ``page->mapping`` -指向该"副本"。 - -每个链以及链接到该链中的所有"副本"强制不变的是,它们代表了相同的写保护内存 -内容,尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。 - -这样一来,相比与无限的逆映射链表,稳定树的查找计算复杂性不受影响。但在稳定树 -本身中不能有重复的KSM页面内容仍然是强制要求。 - -由 ``max_page_sharing`` 强制决定的数据去重限制是必要的,以此来避免虚拟内存 -rmap链表变得过大。rmap的遍历具有O(N)的复杂度,其中N是共享页面的rmap_项(即 -虚拟映射)的数量,而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。 -因此,这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进 -程在稳定节点"链"上的遍历也是O(N),但这个N是稳定树"副本"的数量,而不是rmap项 -的数量,因此它对ksmd性能没有显著影响。实际上,最佳稳定树"副本"的候选节点将 -保留在"副本"列表的开头。 - -``max_page_sharing`` 的值设置得高了会促使更快的内存合并(因为将有更少的稳定 -树副本排队进入稳定节点chain->hlist)和更高的数据去重系数,但代价是在交换、压 -缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。 - -``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控 -的影响,高比值可能意味着稳定节点dup中存在碎片,这可以通过在ksmd中引入碎片算 -法来解决,该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup,以便释放 -那些仅包含极少rmap项的稳定节点"dup",但这可能会增加ksmd进程的CPU使用率,并可 -能会减慢应用程序在KSM页面上的只读计算。 - -KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本",以便删减过时了的稳定节点。 -这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。 - -参考 -==== -内核代码请见mm/ksm.c。 -涉及的函数(mm_slot ksm_scan stable_node rmap_item)。 diff --git a/Documentation/translations/zh_CN/vm/memory-model.rst b/Documentation/translations/zh_CN/vm/memory-model.rst deleted file mode 100644 index 013e30c88d72..000000000000 --- a/Documentation/translations/zh_CN/vm/memory-model.rst +++ /dev/null @@ -1,135 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/memory-model.rst
-
-:翻译:
-
- 司延腾 Yanteng Si
-
-:校译:
-
-
-============
-物理内存模型
-============
-
-系统中的物理内存可以用不同的方式进行寻址。最简单的情况是,物理内存从地址0开
-始,跨越一个连续的范围,直到最大的地址。然而,这个范围可能包含CPU无法访问的
-小孔洞。那么,在完全不同的地址可能有几个连续的范围。而且,别忘了NUMA,即不
-同的内存库连接到不同的CPU。
-
-Linux使用两种内存模型中的一种对这种多样性进行抽象:FLATMEM和SPARSEMEM。每
-个架构都定义了它所支持的内存模型,默认的内存模型是什么,以及是否有可能手动
-覆盖该默认值。
-
-所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页
-帧的状态。
-
-无论选择哪种内存模型,物理页框号(PFN)和相应的 `struct page` 之间都存
-在一对一的映射关系。
-
-每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn`
-帮助函数,允许从PFN到 `struct page` 的转换,反之亦然。
-
-FLATMEM
-=======
-
-最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的
-物理内存。
-
-在FLATMEM内存模型中,有一个全局的 `mem_map` 数组来映射整个物理内存。对
-于大多数架构,孔洞在 `mem_map` 数组中也有条目,但与孔洞相对应的 `struct page`
-对象从未被完全初始化。
-
-为了分配 `mem_map` 数组,架构特定的设置代码应该调用free_area_init()函数。
-然而,在调用memblock_free_all()函数把所有内存交给页分配器之前,这个映射
-数组是不能使用的。
-
-一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下,特
-定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔洞。
-
-使用FLATMEM,PFN和 `struct page` 之间的转换是直接的: `PFN - ARCH_PFN_OFFSET`
-就是 `mem_map` 数组的索引。
-
-`ARCH_PFN_OFFSET` 定义了物理内存起始地址不为0的系统的第一个页框号。
-
-SPARSEMEM
-=========
-
-SPARSEMEM是Linux中最通用的内存模型,它是唯一支持若干高级功能的内存模型,
-如物理内存的热插拔、非易失性内存设备的替代内存映射(alternative memory map)
-和较大系统上内存映射的延迟初始化。
-
-SPARSEMEM模型将物理内存呈现为区段(section)的集合。一个区段用mem_section结构
-体表示,它包含 `section_mem_map` ;从逻辑上讲,这是一个指向 `struct page`
-数组的指针,不过其中还编码了一些辅助区段管理的其他信息。区段的大小
-和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量
-来指定的,这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS`
-是一个架构所支持的物理地址的实际宽度,而 `SECTION_SIZE_BITS` 是一个任
-意的值。
-
-最大的区段数表示为 `NR_MEM_SECTIONS` ,定义为
-
-.. math::
-
-   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
-
-`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的
-大小和位置取决于 `CONFIG_SPARSEMEM_EXTREME` 和可能的最大区段数:
-
-* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时, `mem_sections` 数组是静态的,有
-  `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。
-* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时, `mem_sections` 数组被动态分配。
-  每一行包含 `PAGE_SIZE` 大小的 `mem_section` 对象,行数按能容纳所有内
-  存区段来计算。
-
-架构设置代码应该调用sparse_init()来初始化内存区段和内存映射。
-
-通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和
-"sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的
-值决定。
-
-Classic sparse在page->flags中编码了一个页面的区段号,并使用PFN的高位来访问映射该页
-框的区段。在一个区段内,PFN是指向页数组的索引。
-
-Sparse vmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操
-作。有一个全局的 `struct page *vmemmap` 指针,指向一个虚拟连续的 `struct page`
-对象数组。PFN是该数组的一个索引, `struct page` 相对 `vmemmap` 的偏移量就是该页的PFN。
-
-为了使用vmemmap,一个架构必须保留一个虚拟地址的范围,以映射包含内存映射的物理页,并
-确保 `vmemmap` 指向该范围。此外,架构应该实现 :c:func:`vmemmap_populate` 方法,
-它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求,
-它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。
-
-虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分
-配的存储中。这种存储用vmem_altmap结构表示,最终通过一长串的函数调用传递给
-vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和
-:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。
-
-ZONE_DEVICE
-===========
-`ZONE_DEVICE` 设施建立在 `SPARSEMEM_VMEMMAP` 之上,为设备驱动识别的物理地址范
-围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下
-事实有关:这些地址范围的页面对象从未被标记为在线,而且必须对设备进行引用,而不仅仅
-是页面,以保持内存被"锁定"以便使用。 `ZONE_DEVICE` ,通过 :c:func:`devm_memremap_pages` ,
-为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`、
-:c:func:`page_to_pfn` 和 :c:func:`get_user_pages` 服务。由于页面引
-用计数永远不会低于1,所以页面永远不会被追踪为空闲内存,页面的 `struct list_head lru`
-空间被重新利用,用于向映射该内存的主机设备/驱动程序进行反向引用。
-
-虽然 `SPARSEMEM` 将内存作为一个区段的集合,可以选择收集并合成内存块,但
-`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE`
-内存从未被标记为在线,因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界
-上。这个实现依赖于这种缺乏用户接口的约束,允许子段大小的内存范围被指定给
-:c:func:`arch_add_memory` ,即内存热插拔的上半部分。子段支持允许2MB作为 -:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。 - -`ZONE_DEVICE` 的用户是: - -* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。 - -* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` , - 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见/vm/hmm.rst。 - -* p2pdma: 创建 `struct page` 对象,允许PCI/E拓扑结构中的peer设备协调它们之间的 - 直接DMA操作,即绕过主机内存。 diff --git a/Documentation/translations/zh_CN/vm/mmu_notifier.rst b/Documentation/translations/zh_CN/vm/mmu_notifier.rst deleted file mode 100644 index b29a37b33628..000000000000 --- a/Documentation/translations/zh_CN/vm/mmu_notifier.rst +++ /dev/null @@ -1,97 +0,0 @@ -:Original: Documentation/vm/mmu_notifier.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - - -什么时候需要页表锁内通知? -========================== - -当清除一个pte/pmd时,我们可以选择通过在页表锁下(通知版的\*_clear_flush调用 -mmu_notifier_invalidate_range)通知事件。但这种通知并不是在所有情况下都需要的。 - -对于二级TLB(非CPU TLB),如IOMMU TLB或设备TLB(当设备使用类似ATS/PASID的东西让 -IOMMU走CPU页表来访问进程的虚拟地址空间)。只有两种情况需要在清除pte/pmd时在持有页 -表锁的同时通知这些二级TLB: - - A) 在mmu_notifier_invalidate_range_end()之前,支持页的地址被释放。 - B) 一个页表项被更新以指向一个新的页面(COW,零页上的写异常,__replace_page(),...)。 - -情况A很明显,你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。 - -情况B更加微妙。为了正确起见,它需要按照以下序列发生: - - - 上页表锁 - - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify()) - - 设置页表项以指向新页 - -如果在设置新的pte/pmd值之前,清除页表项之后没有进行通知,那么你就会破坏设备的C11或 -C++11等内存模型。 - -考虑以下情况(设备使用类似于ATS/PASID的功能)。 - -两个地址addrA和addrB,这样|addrA - addrB| >= PAGE_SIZE,我们假设它们是COW的 -写保护(B的其他情况也适用)。 - -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {尝试写到addrA} - CPU-thread-1 {尝试写到addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {读取addrA并填充设备TLB} - DEV-thread-2 {读取addrB并填充设备TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] ------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {更新页表以指向addrA的新页}} - CPU-thread-1 {COW_step1: {更新页表以指向addrB的新页}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {写入addrA,这是对新页面的写入} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {写入addrB,这是一个写入新页的过程} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {从旧页中读取addrA} - DEV-thread-2 {从新页面读取addrB} - -所以在这里,因为在N+2的时候,清空页表项没有和通知一起作废二级TLB,设备在看到addrA的新值之前 -就看到了addrB的新值。这就破坏了设备的总内存序。 - -当改变一个pte的写保护或指向一个新的具有相同内容的写保护页(KSM)时,将mmu_notifier_invalidate_range -调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程 -在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占,也是如此。 diff --git a/Documentation/translations/zh_CN/vm/numa.rst b/Documentation/translations/zh_CN/vm/numa.rst deleted file mode 100644 index 
6af412b924ad..000000000000 --- a/Documentation/translations/zh_CN/vm/numa.rst +++ /dev/null @@ -1,101 +0,0 @@ -:Original: Documentation/vm/numa.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -始于1999年11月,作者: - -========================== -何为非统一内存访问(NUMA)? -========================== - -这个问题可以从几个视角来回答:硬件观点和Linux软件视角。 - -从硬件角度看,NUMA系统是一个由多个组件或装配组成的计算机平台,每个组件可能包含0个或更多的CPU、 -本地内存和/或IO总线。为了简洁起见,并将这些物理组件/装配的硬件视角与软件抽象区分开来,我们在 -本文中称这些组件/装配为“单元”。 - -每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能 -不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如,交叉开关或点对点 -链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来,以创建NUMA平台,其中的单元与其 -他单元有多个距离。 - -对于Linux,感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中, -所有的内存都是可见的,并且可以从连接到任何单元的任何CPU中访问,缓存一致性是由处理器缓存和/或 -系统互连在硬件中处理。 - -内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元 -有多远。例如,连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和 -更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的(其他)单元。 - -平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反,这种架构是提供可扩展 -内存带宽的一种手段。然而,为了实现可扩展的内存带宽,系统和应用软件必须安排大部分的内存引用 -[cache misses]到“本地”内存——同一单元的内存,如果有的话——或者到最近的有内存的单元。 - -这就自然而然有了Linux软件对NUMA系统的视角: - -Linux将系统的硬件资源划分为多个软件抽象,称为“节点”。Linux将节点映射到硬件平台的物理单元 -上,对一些架构的细节进行了抽象。与物理单元一样,软件节点可能包含0或更多的CPU、内存和/或IO -总线。同样,对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快 -的访问时间和更高的有效带宽。 - -对于一些架构,如x86,Linux将“隐藏”任何代表没有内存连接的物理单元的节点,并将连接到该单元 -的任何CPU重新分配到代表有内存的单元的节点上。因此,在这些架构上,我们不能假设Linux将所有 -的CPU与一个给定的节点相关联,会看到相同的本地内存访问时间和带宽。 - -此外,对于某些架构,同样以x86为例,Linux支持对额外节点的仿真。对于NUMA仿真,Linux会将现 -有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部 -分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的,当与cpusets一起使用时, -可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst] - -对于每个有内存的节点,Linux构建了一个独立的内存管理子系统,有自己的空闲页列表、使用中页列表、 -使用统计和锁来调解访问。此外,Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE -中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求 -时要访问的区/节点。当一个区没有可用的内存来满足请求时,这种情况被称为“overflow 溢出”或 -“fallback 回退”。 - -由于一些节点包含多个包含不同类型内存的区,Linux必须决定是否对区列表进行排序,使分配回退到不同 -节点上的相同区类型,或同一节点上的不同区类型。这是一个重要的考虑因素,因为有些区,如DMA或DMA32, -代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距 -离排序的远程节点之前,它会尝试回退到同一节点的其他分区。 - -默认情况下,Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说,Linux将试 -图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能 -满足请求,内核将检查所选分区列表中其他节点的区域,寻找列表中第一个能满足请求的区域。 - -本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配 -一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构 -中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥 -远的调度域中。然而,调度器并没有直接考虑到任务的NUMA足迹。因此,在充分不平衡的情况下,任务可 -以在节点之间迁移,远离其初始节点和内核数据结构。 - -系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口,如taskset(1)和numactl(1),以及程 -序接口,如sched_setaffinity(2),来限制任务的迁移,以改善NUMA定位。此外,人们可以使用 -Linux NUMA内存策略修改内核的默认本地分配行为。 [见 -:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. 
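下面用一个极简的用户空间例子来说明上一段提到的CPU亲和性接口(这是补充性的示意代码,
不属于原文,只演示 sched_setaffinity(2) 的基本用法)::

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t mask;

            CPU_ZERO(&mask);        /* 清空亲和性掩码 */
            CPU_SET(0, &mask);      /* 只允许在 CPU 0 上运行 */

            /* pid 为 0 表示当前任务 */
            if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
                    perror("sched_setaffinity");
                    return 1;
            }
            return 0;
    }

把任务固定在某个CPU上之后,内核默认的"本地分配"策略就会倾向于从该CPU所在节点分配
内存;若要进一步控制内存放置,可以配合上文提到的NUMA内存策略接口(如numactl或
set_mempolicy(2))使用。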
- -系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点 -的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst] - -在不隐藏无内存节点的架构上,Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无 -内存的节点,“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反,它 -将是内核在建立分区列表时选择的离它最近的有内存的节点。所以,默认情况下,本地分配将由内核提供 -最近的可用内存来完成。这是同一机制的结果,该机制允许这种分配在一个包含内存的节点溢出时回退到 -其他附近的节点。 - -一些内核分配不希望或不能容忍这种分配回退行为。相反,他们想确保他们从指定的节点获得内存,或者 -得到通知说该节点没有空闲内存。例如,当一个子系统分配每个CPU的内存资源时,通常是这种情况。 - -一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的 -节点ID,然后只从返回的节点ID请求内存。当这样的分配失败时,请求的子系统可以恢复到它自己的回退 -路径。板块内核内存分配器就是这样的一个例子。或者,子系统可以选择在分配失败时禁用或不启用自己。 -内核分析子系统就是这样的一个例子。 - -如果架构支持——不隐藏无内存节点,那么连接到无内存节点的CPU将总是产生回退路径的开销,或者一些 -子系统如果试图完全从无内存的节点分配内存,将无法初始化。为了透明地支持这种架构,内核子系统可 -以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样,这是同 -一个节点,默认的本地页分配将从这个节点开始尝试。 diff --git a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst b/Documentation/translations/zh_CN/vm/overcommit-accounting.rst deleted file mode 100644 index 8765cb118f24..000000000000 --- a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst +++ /dev/null @@ -1,86 +0,0 @@ -:Original: Documentation/vm/overcommit-accounting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - - -============== -超量使用审计 -============== - -Linux内核支持下列超量使用处理模式 - -0 - 启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。 - 它确保严重的疯狂分配失败,同时允许超量使用以减少swap的使用。在这种模式下, - 允许root分配稍多的内存。这是默认的。 -1 - 总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码,只是依赖 - 几乎完全由零页组成的虚拟内存 - -2 - 不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量 - (默认为50%)。根据你使用的数量,在大多数情况下,这意味着一个进程在访问页面时 - 不会被杀死,但会在内存分配上收到相应的错误。 - - 对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说 - 是很有用的。 - -超量使用策略是通过sysctl `vm.overcommit_memory` 设置的。 - -可以通过 `vm.overcommit_ratio` (百分比)或 `vm.overcommit_kbytes` (绝对值) -来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。 - -在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前 -的超量使用和提交量。 - -陷阱 -==== - -C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证,并在接近边缘的地方运行, -你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说,这并 -不重要,但如果你真的非常关心的话,这就是一个值得关注的案例。 - - -在模式2中,MAP_NORESERVE标志被忽略。 - - -它是如何工作的 -============== - -超量使用是基于以下规则 - -对于文件映射 - | SHARED or READ-only - 0 cost (该文件是映射而不是交换) - | PRIVATE WRITABLE - 每个实例的映射大小 - -对于匿名或者 ``/dev/zero`` 映射 - | SHARED - 映射的大小 - | PRIVATE READ-only - 0 cost (但作用不大) - | PRIVATE WRITABLE - 每个实例的映射大小 - -额外的计数 - | 通过mmap制作可写副本的页面 - | 从同一池中提取的shmfs内存 - -状态 -==== - -* 我们核算mmap内存映射 -* 我们核算mprotect在提交中的变化 -* 我们核算mremap的大小变化 -* 我们的审计 brk -* 审计munmap -* 我们在/proc中报告commit 状态 -* 核对并检查分叉的情况 -* 审查堆栈处理/执行中的构建 -* 叙述SHMfs的情况 -* 实现实际限制的执行 - -待续 -==== -* ptrace 页计数(这很难)。 diff --git a/Documentation/translations/zh_CN/vm/page_frags.rst b/Documentation/translations/zh_CN/vm/page_frags.rst deleted file mode 100644 index ad27fed33634..000000000000 --- a/Documentation/translations/zh_CN/vm/page_frags.rst +++ /dev/null @@ -1,38 +0,0 @@ -:Original: Documentation/vm/page_frag.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -页面片段 -======== - -一个页面片段是一个任意长度的任意偏移的内存区域,它位于一个0或更高阶的复合页面中。 -该页中的多个碎片在该页的引用计数器中被单独计算。 - -page_frag函数,page_frag_alloc和page_frag_free,为页面片段提供了一个简单 -的分配框架。这被网络堆栈和网络设备驱动使用,以提供一个内存的支持区域,作为 -sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 - -为了使用页面片段API,需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点, -并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用, -这在分配时开销可能会很大。然而,由于这种缓存的性质,要求任何对缓存的调用都要受到每 -个CPU的限制,或者每个CPU的限制,并在执行碎片分配时强制禁止中断。 - -网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 -netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache -被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 -它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 -将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 - 
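下面给出一个最小化的示意,演示上文所述"netdev"风格的页面片段分配:先禁用中断,
再从每CPU缓存中切出一个片段。这是假设性的示例代码,不属于原文,
demo_frag_cache/demo_alloc_frag 均为虚构名字,函数签名以实际内核版本为准::

    #include <linux/gfp.h>
    #include <linux/irqflags.h>
    #include <linux/percpu.h>

    /* 每个CPU一份的片段缓存,作用类似正文中的 netdev_alloc_cache */
    static DEFINE_PER_CPU(struct page_frag_cache, demo_frag_cache);

    static void *demo_alloc_frag(unsigned int fragsz)
    {
            struct page_frag_cache *nc;
            unsigned long flags;
            void *data;

            local_irq_save(flags);  /* "netdev"风格:任意上下文均可调用 */
            nc = this_cpu_ptr(&demo_frag_cache);
            data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
            local_irq_restore(flags);

            return data;            /* 使用完毕后以 page_frag_free(data) 释放 */
    }
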
-许多网络设备驱动程序使用类似的方法来分配页面片段,但页面片段是在环或描述符级别上 -缓存的。为了实现这些情况,有必要提供一种拆解页面缓存的通用方法。出于这个原因, -__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。 -这样做的好处是,它允许清理被添加到一个页面的多个引用,以避免每次分配都调用 -get_page。 - -Alexander Duyck,2016年11月29日。 diff --git a/Documentation/translations/zh_CN/vm/page_owner.rst b/Documentation/translations/zh_CN/vm/page_owner.rst deleted file mode 100644 index 9e951fabba9d..000000000000 --- a/Documentation/translations/zh_CN/vm/page_owner.rst +++ /dev/null @@ -1,116 +0,0 @@ -:Original: Documentation/vm/page_owner.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -================================ -page owner: 跟踪谁分配的每个页面 -================================ - -概述 -==== - -page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。 -当分配发生时,有关分配的信息,如调用堆栈和页面的顺序被存储到每个页面的特定存储中。 -当我们需要了解所有页面的状态时,我们可以获得并分析这些信息。 - -尽管我们已经有了追踪页面分配/释放的tracepoint,但用它来分析谁分配的每个页面是 -相当复杂的。我们需要扩大跟踪缓冲区,以防止在用户空间程序启动前出现重叠。而且,启 -动的程序会不断地将跟踪缓冲区转出,供以后分析,这将会改变系统的行为,会产生更多的 -可能性,而不是仅仅保留在内存中,所以不利于调试。 - -页面所有者也可以用于各种目的。例如,可以通过每个页面的gfp标志信息获得精确的碎片 -统计。如果启用了page owner,它就已经实现并激活了。我们非常欢迎其他用途。 - -page owner在默认情况下是禁用的。所以,如果你想使用它,你需要在你的启动cmdline -中加入"page_owner=on"。如果内核是用page owner构建的,并且由于没有启用启动 -选项而在运行时禁用page owner,那么运行时的开销是很小的。如果在运行时禁用,它不 -需要内存来存储所有者信息,所以没有运行时内存开销。而且,页面所有者在页面分配器的 -热路径中只插入了两个不可能的分支,如果不启用,那么分配就会像没有页面所有者的内核 -一样进行。这两个不可能的分支应该不会影响到分配的性能,特别是在静态键跳转标签修补 -功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。 - -- 没有page owner:: - - text data bss dec hex filename - 48392 2333 644 51369 c8a9 mm/page_alloc.o - -- 有page owner:: - - text data bss dec hex filename - 48800 2445 644 51889 cab1 mm/page_alloc.o - 6662 108 29 6799 1a8f mm/page_owner.o - 1025 8 8 1041 411 mm/page_ext.o - -虽然总共增加了8KB的代码,但page_alloc.o增加了520字节,其中不到一半是在hotpath -中。构建带有page owner的内核,并在需要时打开它,将是调试内核内存问题的最佳选择。 - -有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这 -个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些,所以,在初始化 -之前,许多页面可以被分配,但它们没有所有者信息。为了解决这个问题,这些早期分配的 -页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息,但至 -少,我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上,有13343 -个早期分配的页面被捕捉和标记,尽管它们大部分是由结构页扩展功能分配的。总之,在这 -之后,没有任何页面处于未追踪状态。 - -使用方法 -======== - -1) 构建用户空间的帮助:: - - cd tools/vm - make page_owner_sort - -2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. - -3) 做你想调试的工作。 - -4) 分析来自页面所有者的信息:: - - cat /sys/kernel/debug/page_owner > page_owner_full.txt - ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - - ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值):: - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - ``page_owner_sort`` 工具忽略了 ``PFN`` 行,将剩余的行放在buf中,使用regexp提 - 取页序值,计算buf的次数和页数,最后根据参数进行排序。 - - 在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出:: - - XXX times, XXX pages: - Page allocated via order XXX, ... - // Detailed stack - - 默认情况下, ``page_owner_sort`` 是根据buf的时间来排序的。如果你想 - 按buf的页数排序,请使用-m参数。详细的参数是: - - 基本函数: - - Sort: - -a 按内存分配时间排序 - -m 按总内存排序 - -p 按pid排序。 - -P 按tgid排序。 - -r 按内存释放时间排序。 - -s 按堆栈跟踪排序。 - -t 按时间排序(默认)。 - - 其它函数: - - Cull: - -c 通过比较堆栈跟踪而不是总块来进行剔除。 - - Filter: - -f 过滤掉内存已被释放的块的信息。 diff --git a/Documentation/translations/zh_CN/vm/page_table_check.rst b/Documentation/translations/zh_CN/vm/page_table_check.rst deleted file mode 100644 index a29fc1b360e6..000000000000 --- a/Documentation/translations/zh_CN/vm/page_table_check.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/page_table_check.rst
-
-:翻译:
-
- 司延腾 Yanteng Si
-
-:校译:
-
-
-========
-页表检查
-========
-
-概述
-====
-
-页表检查通过确保阻止某些类型的内存损坏,来对内核进行加固。
-
-当新的页面将变得可以从用户空间访问、其页表项(PTE、PMD等)被加入页表时,页表检查
-会执行额外的验证。
-
-一旦检测到损坏,内核就会主动崩溃。页表检查有少量的性能和内存开销。因此,它在默认情况下是禁用
-的,但是在额外的加固收益超过性能成本的系统上,可以选择启用。另外,由于页表检查是同步的,它可以帮助调
-试双映射内存损坏问题:在错误的映射发生时就让内核崩溃,而不是等内存损坏的后果出现之后。
-
-双重映射检测逻辑
-================
-
-+-------------------+-------------------+-------------------+------------------+
-| Current Mapping   | New mapping       | Permissions       | Rule             |
-+===================+===================+===================+==================+
-| Anonymous         | Anonymous         | Read              | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Named             | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Anonymous         | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Named             | Any               | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-
-启用页表检查
-============
-
-用以下方法构建内核:
-
-- PAGE_TABLE_CHECK=y
-  注意,它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。
-
-- 使用 "page_table_check=on" 内核参数启动。
-
-可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核,以便在没有额外内核参数的情况
-下也启用页表检查。 diff --git a/Documentation/translations/zh_CN/vm/remap_file_pages.rst b/Documentation/translations/zh_CN/vm/remap_file_pages.rst deleted file mode 100644 index af6b7e28af23..000000000000 --- a/Documentation/translations/zh_CN/vm/remap_file_pages.rst +++ /dev/null @@ -1,32 +0,0 @@
-:Original: Documentation/vm/remap_file_pages.rst
-
-:翻译:
-
- 司延腾 Yanteng Si
-
-:校译:
-
-
-==============================
-remap_file_pages()系统调用
-==============================
-
-remap_file_pages()系统调用被用来创建一个非线性映射,也就是说,在这个映射中,
-文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好
-处是,前者不需要内核创建额外的VMA(虚拟内存区)数据结构。
-
-支持非线性映射需要在内核虚拟内存子系统中编写大量并不简单(non-trivial)的代码,
-其中包括热路径。另外,为了使非线性映射工作,内核需要一种方法来区分正常的页表项
-和带有文件偏移的项(pte_file)。内核为此在PTE中保留了一个标志。PTE标志是稀缺资
-源,特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。
-
-幸运的是,现实中并没有多少remap_file_pages()的用户。只知道有一个企业的RDBMS
-实现在32位系统上使用这个系统调用,来映射比32位虚拟地址空间线性尺寸更大的文件。
-由于64位系统的广泛使用,这种使用情况已经不重要了。
-
-这个系统调用已被废弃,取而代之的是一个仿真实现。仿真会创建新的VMA,而不是非线性映射。
-对于remap_file_pages()的少数用户来说,它的工作速度会变慢,但ABI被保留了下来。
-
-仿真的一个副作用(除了性能之外)是,由于额外的VMA,用户可以更容易达到
-vm.max_map_count的限制。关于该限制的更多细节,请参见DEFAULT_MAX_MAP_COUNT
-的注释。 diff --git a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst b/Documentation/translations/zh_CN/vm/split_page_table_lock.rst deleted file mode 100644 index 50694d97c426..000000000000 --- a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst +++ /dev/null @@ -1,96 +0,0 @@
-:Original: Documentation/vm/split_page_table_lock.rst
-
-:翻译:
-
- 司延腾 Yanteng Si
-
-:校译:
-
-
-=================================
-分页表锁(split page table lock)
-=================================
-
-最初,mm->page_table_lock这个自旋锁保护着mm_struct的所有页表。但是这种方
-法导致多线程应用程序的缺页异常可扩展性很差,因为对这把锁的争夺很激烈。为了提高可扩
-展性,我们引入了分页表锁。
-
-有了分页表锁,每张页表都有单独的锁,用来串行化对该表的访问。目前,我们对PTE和
-PMD表使用分页表锁,对更高层页表的访问仍由mm->page_table_lock保护。
-
-下面是用于锁定/解锁页表的辅助函数和其他访问器:
-
- - pte_offset_map_lock()
-	映射pte并获取PTE表锁,返回所取锁的指针;
- - pte_unmap_unlock()
-	解锁和解映射PTE表;
- - pte_alloc_map_lock()
-	如果需要就分配PTE表并获取锁,返回所取锁的指针;如果分配失败,则返回NULL;
- - pte_lockptr()
-	返回指向PTE表锁的指针;
- - pmd_lock()
-	取得PMD表锁,返回所取锁的指针;
- - pmd_lockptr()
-	返回指向PMD表锁的指针;
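下面是一个使用上述辅助函数的最小示意(假设性的例子,不属于原文;
demo_pte_is_present 是虚构的名字,并假设调用者已经保证对应的PMD条目有效且
不会在检查期间消失)::

    /* 在PTE分页表锁的保护下,检查某个地址的PTE是否present */
    static bool demo_pte_is_present(struct mm_struct *mm, pmd_t *pmd,
                                    unsigned long addr)
    {
            spinlock_t *ptl;
            pte_t *pte;
            bool ret;

            /* 映射PTE表并取得对应的分页表锁 */
            pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
            ret = pte_present(*pte);
            /* 解锁并解除映射,与上面的调用严格配对 */
            pte_unmap_unlock(pte, ptl);

            return ret;
    }

真实代码通常还要先处理透明巨页(pmd_trans_huge())等情况,这里为了突出加锁/
解锁的配对而将其省略。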
-
-如果CONFIG_SPLIT_PTLOCK_CPUS(通常为4)小于或等于NR_CPUS,则在编译
-时启用PTE表的分页表锁。如果分页表锁被禁用,所有的页表都由mm->page_table_lock
-来保护。
-
-如果PTE表启用了分页表锁,并且架构支持,那么PMD表的分页表锁也会被启用(见
-下文)。
-
-Hugetlb 和分页表锁
-==================
-
-Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页表锁,而不对PUD使用。
-
-Hugetlb特定的辅助函数:
-
- - huge_pte_lock()
-	对PMD_SIZE的页面采用PMD分页表锁,否则使用mm->page_table_lock;
- - huge_pte_lockptr()
-	返回指向表锁的指针。
-
-架构对分页表锁的支持
-====================
-
-没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor()
-和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。
-
-确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页
-面。这个字段与page->ptl共享存储。
-
-PMD分页表锁只有在你有两个以上的页表级别时才有意义。
-
-启用PMD分页表锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调
-用pgtable_pmd_page_dtor()。
-
-分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb()
-中,但要确保覆盖所有的PMD表分配/释放路径:例如X86_PAE会在pgd_alloc()中预先
-分配一些PMD。
-
-一切就绪后,你就可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
-
-注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败,这必
-须被正确处理。
-
-page->ptl
-=========
-
-page->ptl用于访问分页表锁,其中'page'是包含该表的页面的struct page。它
-与page->private(以及union中的其他几个字段)共享存储。
-
-为了避免增加struct page的大小并获得最佳性能,我们使用了一个技巧:
-
- - 如果spinlock_t能装进一个long,我们就把page->ptl本身用作spinlock,这样
-   就可以避免间接访问并节省一个缓存行。
- - 如果spinlock_t的大小大于long的大小,我们就把page->ptl用作指向spinlock_t
-   的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
-   情况下使用分页表锁,但因间接访问而多占用一个缓存行。
-
-PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t
-分配在pgtable_pmd_page_ctor()中。
-
-请不要直接访问page->ptl,请使用适当的辅助函数。 diff --git a/Documentation/translations/zh_CN/vm/z3fold.rst b/Documentation/translations/zh_CN/vm/z3fold.rst deleted file mode 100644 index 57204aa08caa..000000000000 --- a/Documentation/translations/zh_CN/vm/z3fold.rst +++ /dev/null @@ -1,31 +0,0 @@
-:Original: Documentation/vm/z3fold.rst
-
-:翻译:
-
- 司延腾 Yanteng Si
-
-:校译:
-
-
-======
-z3fold
-======
-
-z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。
-它是zbud的衍生物,允许更高的压缩率,并保持了其前辈的简单性和确定性。
-
-z3fold和zbud的主要区别是:
-
-* 与zbud不同的是,z3fold允许最大为PAGE_SIZE的分配。
-* z3fold在其页面中最多可以容纳3个压缩页面。
-* z3fold本身没有导出任何API,因此打算通过zpool的API来使用。
-
-为了保持确定性和简单性,z3fold和zbud一样,总是在每页存储整数个压缩页,但是
-它最多可以存储3页,不像zbud最多只能存储2页。因此压缩率可以达到2.7倍左右,而zbud的压缩
-率是1.7倍左右。
-
-与zbud不同(但与zsmalloc相同),z3fold_alloc()不返回一个可解引用的指针。相反,它
-返回一个无符号长整型(unsigned long)句柄,其中编码了被分配对象的实际位置。
-
-在有效压缩率保持接近zsmalloc的同时,z3fold不依赖于MMU的启用,并提供更可预测的回收行
-为,这使得它更适合于小型和反应迅速的系统。 diff --git a/Documentation/translations/zh_CN/vm/zsmalloc.rst b/Documentation/translations/zh_CN/vm/zsmalloc.rst deleted file mode 100644 index 29e9c70a8eb6..000000000000 --- a/Documentation/translations/zh_CN/vm/zsmalloc.rst +++ /dev/null @@ -1,78 +0,0 @@
-:Original: Documentation/vm/zs_malloc.rst
-
-:翻译:
-
- 司延腾 Yanteng Si
-
-:校译:
-
-========
-zsmalloc
-========
-
-这个分配器是为与zram一起使用而设计的。因此,该分配器应该在低内存条件下工作良好。特别是,
-它从不尝试分配higher order(高阶)页面,因为这类分配在内存压力下很可能失败。另一方面,如果我们只
-是使用单个(0阶)页面,它将遭受非常高的碎片化:任何大小为PAGE_SIZE/2或更大的对象将
-占据整个页面。这是其前身(xvmalloc)的主要问题之一。
-
-为了克服这些问题,zsmalloc分配了一批0阶页面,并使用各种"struct page"字段将它
-们链接起来。这些链接起来的页面如同一个单一的高阶页面,即一个对象可以跨越0阶
-页面的边界。代码将这些链接的页面作为一个实体,称为zspage。
-
-为了简单起见,zsmalloc只能分配大小不超过PAGE_SIZE的对象,因为这满足了所有当前用户的
-要求(在最坏的情况下,页面是不可压缩的,因此以"原样"即未压缩的形式存储)。对于大于这
-个大小的分配请求,会返回失败(见zs_malloc)。
-
-此外,zs_malloc()并不返回一个可解引用的指针。相反,它返回一个不透明的句柄
-(unsigned long),其中编码了被分配对象的实际位置。这种间接性的原因是zsmalloc
-并不保持zspage的永久映射,因为那在32位系统上会导致问题:内核空间可用于映射的
-VA区域非常小。因此,在使用分配的内存之前,对象必须先使用zs_map_object()进行
-映射以获得一个可用的指针,随后使用zs_unmap_object()解除映射。
-
-stat
-====
-
-通过CONFIG_ZSMALLOC_STAT,我们可以通过 ``/sys/kernel/debug/zsmalloc/``
-看到zsmalloc的内部信息。下面是一个统计输出的例子::
-
- # cat /sys/kernel/debug/zsmalloc/zram0/classes
-
- class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
-    ...
- ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 - ... - ... - - -class - 索引 -size - zspage存储对象大小 -almost_empty - ZS_ALMOST_EMPTY zspage的数量(见下文)。 -almost_full - ZS_ALMOST_FULL zspage的数量(见下图) -obj_allocated - 已分配对象的数量 -obj_used - 分配给用户的对象的数量 -pages_used - 为该类分配的页数 -pages_per_zspage - 组成一个zspage的0-order页面的数量 - -当n <= N / f时,我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组,其中 - -* n = 已分配对象的数量 -* N = zspage可以存储的对象总数 -* f = fullness_threshold_frac(即,目前是4个) - -同样地,我们将zspage分配给: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N diff --git a/Documentation/translations/zh_TW/index.rst b/Documentation/translations/zh_TW/index.rst index e1ce9d8c06f8..e97d7d578751 100644 --- a/Documentation/translations/zh_TW/index.rst +++ b/Documentation/translations/zh_TW/index.rst @@ -128,7 +128,7 @@ TODOList: * security/index * sound/index * crypto/index -* vm/index +* mm/index * bpf/index * usb/index * PCI/index diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore deleted file mode 100644 index bc74f5643008..000000000000 --- a/Documentation/vm/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -page-types -slabinfo diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst deleted file mode 100644 index 6f8269c284ed..000000000000 --- a/Documentation/vm/active_mm.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _active_mm: - -========= -Active MM -========= - -:: - - List: linux-kernel - Subject: Re: active_mm - From: Linus Torvalds - Date: 1999-07-30 21:36:24 - - Cc'd to linux-kernel, because I don't write explanations all that often, - and when I do I feel better about more people reading them. - - On Fri, 30 Jul 1999, David Mosberger wrote: - > - > Is there a brief description someplace on how "mm" vs. "active_mm" in - > the task_struct are supposed to be used? (My apologies if this was - > discussed on the mailing lists---I just returned from vacation and - > wasn't able to follow linux-kernel for a while). - - Basically, the new setup is: - - - we have "real address spaces" and "anonymous address spaces". The - difference is that an anonymous address space doesn't care about the - user-level page tables at all, so when we do a context switch into an - anonymous address space we just leave the previous address space - active. - - The obvious use for a "anonymous address space" is any thread that - doesn't need any user mappings - all kernel threads basically fall into - this category, but even "real" threads can temporarily say that for - some amount of time they are not going to be interested in user space, - and that the scheduler might as well try to avoid wasting time on - switching the VM state around. Currently only the old-style bdflush - sync does that. - - - "tsk->mm" points to the "real address space". For an anonymous process, - tsk->mm will be NULL, for the logical reason that an anonymous process - really doesn't _have_ a real address space at all. - - - however, we obviously need to keep track of which address space we - "stole" for such an anonymous user. For that, we have "tsk->active_mm", - which shows what the currently active address space is. - - The rule is that for a process with a real address space (ie tsk->mm is - non-NULL) the active_mm obviously always has to be the same as the real - one. - - For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the - "borrowed" mm while the anonymous process is running. 
When the - anonymous process gets scheduled away, the borrowed address space is - returned and cleared. - - To support all that, the "struct mm_struct" now has two counters: a - "mm_users" counter that is how many "real address space users" there are, - and a "mm_count" counter that is the number of "lazy" users (ie anonymous - users) plus one if there are any real users. - - Usually there is at least one real user, but it could be that the real - user exited on another CPU while a lazy user was still active, so you do - actually get cases where you have a address space that is _only_ used by - lazy users. That is often a short-lived state, because once that thread - gets scheduled away in favour of a real thread, the "zombie" mm gets - released because "mm_count" becomes zero. - - Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any - more. "init_mm" should be considered just a "lazy context when no other - context is available", and in fact it is mainly used just at bootup when - no real VM has yet been created. So code that used to check - - if (current->mm == &init_mm) - - should generally just do - - if (!current->mm) - - instead (which makes more sense anyway - the test is basically one of "do - we have a user context", and is generally done by the page fault handler - and things like that). - - Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, - because it slightly changes the interfaces to accommodate the alpha (who - would have thought it, but the alpha actually ends up having one of the - ugliest context switch codes - unlike the other architectures where the MM - and register state is separate, the alpha PALcode joins the two, and you - need to switch both together). - - (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst deleted file mode 100644 index cbaee9e59241..000000000000 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ /dev/null @@ -1,260 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _arch_page_table_helpers: - -=============================== -Architecture Page Table Helpers -=============================== - -Generic MM expects architectures (with MMU) to provide helpers to create, access -and modify page table entries at various level for different memory functions. -These page table helpers need to conform to a common semantics across platforms. -Following tables describe the expected semantics which can also be tested during -boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug -test need to be in sync. 
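As a minimal illustration of what "conforming to a common semantics" means, the
sketch below asserts the expected behaviour of the young-bit helpers from the
tables that follow. It is written in the spirit of the CONFIG_DEBUG_VM_PGTABLE
boot-time tests, not copied from them, and demo_pte_young_checks is a made-up
name::

	static void demo_pte_young_checks(unsigned long pfn, pgprot_t prot)
	{
		pte_t pte = pfn_pte(pfn, prot);

		/* pte_mkyoung() must yield a PTE that pte_young() sees as young */
		WARN_ON(!pte_young(pte_mkyoung(pte)));

		/* pte_mkold() must yield a PTE that pte_young() sees as old */
		WARN_ON(pte_young(pte_mkold(pte)));
	}

Equivalent assertions can be written for the dirty, write and special bits
listed below.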
- - -PTE Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pte_same | Tests whether both PTE entries are the same | -+---------------------------+--------------------------------------------------+ -| pte_bad | Tests a non-table mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_present | Tests a valid mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_young | Tests a young PTE | -+---------------------------+--------------------------------------------------+ -| pte_dirty | Tests a dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_write | Tests a writable PTE | -+---------------------------+--------------------------------------------------+ -| pte_special | Tests a special PTE | -+---------------------------+--------------------------------------------------+ -| pte_protnone | Tests a PROT_NONE PTE | -+---------------------------+--------------------------------------------------+ -| pte_devmap | Tests a ZONE_DEVICE mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_soft_dirty | Tests a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_soft_dirty | Tests a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkyoung | Creates a young PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkold | Creates an old PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkdirty | Creates a dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkclean | Creates a clean PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkwrite | Creates a writable PTE | -+---------------------------+--------------------------------------------------+ -| pte_wrprotect | Creates a write protected PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkspecial | Creates a special PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkdevmap | Creates a ZONE_DEVICE mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mksoft_dirty | Creates a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_clear_soft_dirty | Clears a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_mksoft_dirty | Creates a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_clear_soft_dirty | Clears a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mknotpresent | Invalidates a mapped PTE | -+---------------------------+--------------------------------------------------+ -| ptep_clear | Clears a PTE | -+---------------------------+--------------------------------------------------+ -| ptep_get_and_clear | Clears and returns PTE | -+---------------------------+--------------------------------------------------+ -| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | 
-+---------------------------+--------------------------------------------------+ -| ptep_test_and_clear_young | Clears young from a PTE | -+---------------------------+--------------------------------------------------+ -| ptep_set_wrprotect | Converts into a write protected PTE | -+---------------------------+--------------------------------------------------+ -| ptep_set_access_flags | Converts into a more permissive PTE | -+---------------------------+--------------------------------------------------+ - - -PMD Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pmd_same | Tests whether both PMD entries are the same | -+---------------------------+--------------------------------------------------+ -| pmd_bad | Tests a non-table mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_leaf | Tests a leaf mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_huge | Tests a HugeTLB mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | -+---------------------------+--------------------------------------------------+ -| pmd_present | Tests a valid mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_young | Tests a young PMD | -+---------------------------+--------------------------------------------------+ -| pmd_dirty | Tests a dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_write | Tests a writable PMD | -+---------------------------+--------------------------------------------------+ -| pmd_special | Tests a special PMD | -+---------------------------+--------------------------------------------------+ -| pmd_protnone | Tests a PROT_NONE PMD | -+---------------------------+--------------------------------------------------+ -| pmd_devmap | Tests a ZONE_DEVICE mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_soft_dirty | Tests a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_soft_dirty | Tests a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkyoung | Creates a young PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkold | Creates an old PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkdirty | Creates a dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkclean | Creates a clean PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkwrite | Creates a writable PMD | -+---------------------------+--------------------------------------------------+ -| pmd_wrprotect | Creates a write protected PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkspecial | Creates a special PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkdevmap | Creates a ZONE_DEVICE mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mksoft_dirty | Creates a soft dirty PMD | 
-+---------------------------+--------------------------------------------------+ -| pmd_clear_soft_dirty | Clears a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_mksoft_dirty | Creates a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkinvalid | Invalidates a mapped PMD [1] | -+---------------------------+--------------------------------------------------+ -| pmd_set_huge | Creates a PMD huge mapping | -+---------------------------+--------------------------------------------------+ -| pmd_clear_huge | Clears a PMD huge mapping | -+---------------------------+--------------------------------------------------+ -| pmdp_get_and_clear | Clears a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_get_and_clear_full | Clears a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_test_and_clear_young | Clears young from a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_set_wrprotect | Converts into a write protected PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_set_access_flags | Converts into a more permissive PMD | -+---------------------------+--------------------------------------------------+ - - -PUD Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pud_same | Tests whether both PUD entries are the same | -+---------------------------+--------------------------------------------------+ -| pud_bad | Tests a non-table mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_leaf | Tests a leaf mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_huge | Tests a HugeTLB mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_trans_huge | Tests a Transparent Huge Page (THP) at PUD | -+---------------------------+--------------------------------------------------+ -| pud_present | Tests a valid mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_young | Tests a young PUD | -+---------------------------+--------------------------------------------------+ -| pud_dirty | Tests a dirty PUD | -+---------------------------+--------------------------------------------------+ -| pud_write | Tests a writable PUD | -+---------------------------+--------------------------------------------------+ -| pud_devmap | Tests a ZONE_DEVICE mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkyoung | Creates a young PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkold | Creates an old PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkdirty | Creates a dirty PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkclean | Creates a clean PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkwrite | Creates a writable PUD | 
-+---------------------------+--------------------------------------------------+ -| pud_wrprotect | Creates a write protected PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkinvalid | Invalidates a mapped PUD [1] | -+---------------------------+--------------------------------------------------+ -| pud_set_huge | Creates a PUD huge mapping | -+---------------------------+--------------------------------------------------+ -| pud_clear_huge | Clears a PUD huge mapping | -+---------------------------+--------------------------------------------------+ -| pudp_get_and_clear | Clears a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_get_and_clear_full | Clears a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_test_and_clear_young | Clears young from a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_set_wrprotect | Converts into a write protected PUD | -+---------------------------+--------------------------------------------------+ -| pudp_set_access_flags | Converts into a more permissive PUD | -+---------------------------+--------------------------------------------------+ - - -HugeTLB Page Table Helpers -========================== - -+---------------------------+--------------------------------------------------+ -| pte_huge | Tests a HugeTLB | -+---------------------------+--------------------------------------------------+ -| pte_mkhuge | Creates a HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_dirty | Tests a dirty HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_write | Tests a writable HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_mkdirty | Creates a dirty HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_mkwrite | Creates a writable HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_wrprotect | Creates a write protected HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_get_and_clear | Clears a HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_set_wrprotect | Converts into a write protected HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB | -+---------------------------+--------------------------------------------------+ - - -SWAP Page Table Helpers -======================== - -+---------------------------+--------------------------------------------------+ -| __pte_to_swp_entry | Creates a swapped entry (arch) from a mapped PTE | -+---------------------------+--------------------------------------------------+ -| __swp_to_pte_entry | Creates a mapped PTE from a swapped entry (arch) | -+---------------------------+--------------------------------------------------+ -| __pmd_to_swp_entry | Creates a swapped entry (arch) from a mapped PMD | -+---------------------------+--------------------------------------------------+ -| __swp_to_pmd_entry | Creates a mapped PMD from a 
swapped entry (arch) | -+---------------------------+--------------------------------------------------+ -| is_migration_entry | Tests a migration (read or write) swapped entry | -+-------------------------------+----------------------------------------------+ -| is_writable_migration_entry | Tests a write migration swapped entry | -+-------------------------------+----------------------------------------------+ -| make_readable_migration_entry | Creates a read migration swapped entry | -+-------------------------------+----------------------------------------------+ -| make_writable_migration_entry | Creates a write migration swapped entry | -+-------------------------------+----------------------------------------------+ - -[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst deleted file mode 100644 index 6a1fadf3e173..000000000000 --- a/Documentation/vm/balance.rst +++ /dev/null @@ -1,102 +0,0 @@ -.. _balance: - -================ -Memory Balancing -================ - -Started Jan 2000 by Kanoj Sarcar - -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as -well as for non __GFP_IO allocations. - -The first reason why a caller may avoid reclaim is that the caller can not -sleep due to holding a spinlock or is in interrupt context. The second may -be that the caller is willing to fail the allocation without incurring the -overhead of page reclaim. This may happen for opportunistic high-order -allocation requests that have order-0 fallback options. In such cases, -the caller may also wish to avoid waking kswapd. - -__GFP_IO allocation requests are made to prevent file system deadlocks. - -In the absence of non sleepable allocation requests, it seems detrimental -to be doing balancing. Page reclamation can be kicked off lazily, that -is, only when needed (aka zone free memory is 0), instead of making it -a proactive process. - -That being said, the kernel should try to fulfill requests for direct -mapped pages from the direct mapped pool, instead of falling back on -the dma pool, so as to keep the dma pool filled for dma requests (atomic -or not). A similar argument applies to highmem and direct mapped pages. -OTOH, if there is a lot of free dma pages, it is preferable to satisfy -regular memory requests by allocating one from the dma pool, instead -of incurring the overhead of regular zone balancing. - -In 2.2, memory balancing/page reclamation would kick off only when the -_total_ number of free pages fell below 1/64 th of total memory. With the -right ratio of dma and regular memory, it is quite possible that balancing -would not be done even when the dma zone was completely empty. 2.2 has -been running production machines of varying memory sizes, and seems to be -doing fine even with the presence of this problem. In 2.3, due to -HIGHMEM, this problem is aggravated. - -In 2.3, zone balancing can be done in one of two ways: depending on the -zone size (and possibly of the size of lower class zones), we can decide -at init time how many free pages we should aim for while balancing any -zone. The good part is, while balancing, we do not need to look at sizes -of lower class zones, the bad part is, we might do too frequent balancing -due to ignoring possibly lower usage in the lower class zones. Also, -with a slight change in the allocation routine, it is possible to reduce -the memclass() macro to be a simple equality. 
- -Another possible solution is that we balance only when the free memory -of a zone _and_ all its lower class zones falls below 1/64th of the -total memory in the zone and its lower class zones. This fixes the 2.2 -balancing problem, and stays as close to 2.2 behavior as possible. Also, -the balancing algorithm works the same way on the various architectures, -which have different numbers and types of zones. If we wanted to get -fancy, we could assign different weights to free pages in different -zones in the future. - -Note that if the size of the regular zone is huge compared to dma zone, -it becomes less significant to consider the free dma pages while -deciding whether to balance the regular zone. The first solution -becomes more attractive then. - -The appended patch implements the second solution. It also "fixes" two -problems: first, kswapd is woken up as in 2.2 on low memory conditions -for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, -so as to give a fighting chance for replace_with_highmem() to get a -HIGHMEM page, as well as to ensure that HIGHMEM allocations do not -fall back into regular zone. This also makes sure that HIGHMEM pages -are not leaked (for example, in situations where a HIGHMEM page is in -the swapcache but is not being used by anyone) - -kswapd also needs to know about the zones it should balance. kswapd is -primarily needed in a situation where balancing can not be done, -probably because all allocation requests are coming from intr context -and all process contexts are sleeping. For 2.3, kswapd does not really -need to balance the highmem zone, since intr context does not request -highmem pages. kswapd looks at the zone_wake_kswapd field in the zone -structure to decide whether a zone needs balancing. - -Page stealing from process memory and shm is done if stealing the page would -alleviate memory pressure on any zone in the page's node that has fallen below -its watermark. - -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These -are per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below watermark[WMARK_MIN], the hysteric field -low_on_memory gets set. This stays set till the number of free pages becomes -watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will -try to free some pages in the zone (providing GFP_WAIT is set in the request). -Orthogonal to this, is the decision to poke kswapd to free some zone pages. -That decision is not hysteresis based, and is done when the number of free -pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. - - -(Good) Ideas that I have heard: - -1. Dynamic experience should influence balancing: number of failed requests - for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) -2. Implement a replace_with_highmem()-like replace_with_regular() to preserve - dma pages. (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/vm/bootmem.rst b/Documentation/vm/bootmem.rst deleted file mode 100644 index eb2b31eedfa1..000000000000 --- a/Documentation/vm/bootmem.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=========== -Boot Memory -=========== diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst deleted file mode 100644 index 08f34df45523..000000000000 --- a/Documentation/vm/damon/api.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -============= -API Reference -============= - -Kernel space programs can use every feature of DAMON using below APIs. All you -need to do is including ``damon.h``, which is located in ``include/linux/`` of -the source tree. - -Structures -========== - -.. kernel-doc:: include/linux/damon.h - - -Functions -========= - -.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst deleted file mode 100644 index 0cff6fac6b7e..000000000000 --- a/Documentation/vm/damon/design.rst +++ /dev/null @@ -1,176 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====== -Design -====== - -Configurable Layers -=================== - -DAMON provides data access monitoring functionality while making the accuracy -and the overhead controllable. The fundamental access monitorings require -primitives that dependent on and optimized for the target address space. On -the other hand, the accuracy and overhead tradeoff mechanism, which is the core -of DAMON, is in the pure logic space. DAMON separates the two parts in -different layers and defines its interface to allow various low level -primitives implementations configurable with the core logic. We call the low -level primitives implementations monitoring operations. - -Due to this separated design and the configurable interface, users can extend -DAMON for any address space by configuring the core logics with appropriate -monitoring operations. If appropriate one is not provided, users can implement -the operations on their own. - -For example, physical memory, virtual memory, swap space, those for specific -processes, NUMA nodes, files, and backing memory devices would be supportable. -Also, if some architectures or devices support special optimized access check -primitives, those will be easily configurable. - - -Reference Implementations of Address Space Specific Monitoring Operations -========================================================================= - -The monitoring operations are defined in two parts: - -1. Identification of the monitoring target address range for the address space. -2. Access check of specific address range in the target space. - -DAMON currently provides the implementations of the operations for the physical -and virtual address spaces. Below two subsections describe how those work. - - -VMA-based Target Address Range Construction -------------------------------------------- - -This is only for the virtual address space monitoring operations -implementation. That for the physical address space simply asks users to -manually set the monitoring target address ranges. - -Only small parts in the super-huge virtual address space of the processes are -mapped to the physical memory and accessed. Thus, tracking the unmapped -address regions is just wasteful. However, because DAMON can deal with some -level of noise using the adaptive regions adjustment mechanism, tracking every -mapping is not strictly required but could even incur a high overhead in some -cases. That said, too huge unmapped areas inside the monitoring target should -be removed to not take the time for the adaptive mechanism. - -For the reason, this implementation converts the complex mappings to three -distinct regions that cover every mapped area of the address space. The two -gaps between the three regions are the two biggest unmapped areas in the given -address space. 
The two biggest unmapped areas would be the gap between the
-heap and the uppermost mmap()-ed region, and the gap between the lowermost
-mmap()-ed region and the stack in most of the cases. Because these gaps are
-exceptionally huge in usual address spaces, excluding these will be sufficient
-to make a reasonable trade-off. Below shows this in detail::
-
-    <heap>
-    <BIG UNMAPPED REGION 1>
-    <uppermost mmap()-ed region>
-    (small mmap()-ed regions and munmap()-ed regions)
-    <lowermost mmap()-ed region>
-    <BIG UNMAPPED REGION 2>
-    <stack>
-
-
-PTE Accessed-bit Based Access Check
------------------------------------
-
-Both of the implementations for physical and virtual address spaces use the
-PTE Accessed-bit for basic access checks. The only difference is the way of
-finding the relevant PTE Accessed bit(s) from the address. While the
-implementation for the virtual address walks the page table for the target task
-of the address, the implementation for the physical address walks every page
-table having a mapping to the address. In this way, the implementations find
-and clear the bit(s) for the next sampling target address and check whether
-the bit(s) are set again after one sampling period. This could disturb other
-kernel subsystems using the Accessed bits, namely Idle page tracking and the
-reclaim logic. DAMON does nothing to avoid disturbing Idle page tracking, so
-handling the interference is the responsibility of sysadmins. However, it
-solves the conflict with the reclaim logic using ``PG_idle`` and ``PG_young``
-page flags, as Idle page tracking does.
-
-
-Address Space Independent Core Mechanisms
-=========================================
-
-Below four sections describe each of the DAMON core mechanisms and the five
-monitoring attributes, ``sampling interval``, ``aggregation interval``,
-``update interval``, ``minimum number of regions``, and ``maximum number of
-regions``.
-
-
-Access Frequency Monitoring
----------------------------
-
-The output of DAMON describes which pages are accessed how frequently for a
-given duration. The resolution of the access frequency is controlled by
-setting ``sampling interval`` and ``aggregation interval``. In detail, DAMON
-checks access to each page per ``sampling interval`` and aggregates the
-results. In other words, it counts the number of accesses to each page. After
-each ``aggregation interval`` passes, DAMON calls callback functions that were
-previously registered by users so that users can read the aggregated results
-and then clears the results. This can be described in the below simple
-pseudo-code::
-
-    while monitoring_on:
-        for page in monitoring_target:
-            if accessed(page):
-                nr_accesses[page] += 1
-        if time() % aggregation_interval == 0:
-            for callback in user_registered_callbacks:
-                callback(monitoring_target, nr_accesses)
-            for page in monitoring_target:
-                nr_accesses[page] = 0
-        sleep(sampling interval)
-
-The monitoring overhead of this mechanism will arbitrarily increase as the
-size of the target workload grows.
-
-
-Region Based Sampling
----------------------
-
-To avoid the unbounded increase of the overhead, DAMON groups adjacent pages
-that are assumed to have the same access frequencies into a region. As long
-as the assumption (pages in a region have the same access frequencies) is
-kept, only one page in the region is required to be checked. Thus, for each
-``sampling interval``, DAMON randomly picks one page in each region, waits for
-one ``sampling interval``, checks whether the page is accessed meanwhile, and
-increases the access frequency of the region if so. Therefore, the monitoring
-overhead is controllable by setting the number of regions.
DAMON allows users
-to set the minimum and the maximum number of regions for the trade-off.
-
-This scheme, however, cannot preserve the quality of the output if the
-assumption is not guaranteed.
-
-
-Adaptive Regions Adjustment
----------------------------
-
-Even if the initial monitoring target regions are somehow well constructed to
-fulfill the assumption (pages in the same region have similar access
-frequencies), the data access pattern can be dynamically changed. This will
-result in low monitoring quality. To keep the assumption as much as possible,
-DAMON adaptively merges and splits each region based on their access
-frequency.
-
-For each ``aggregation interval``, it compares the access frequencies of
-adjacent regions and merges those if the frequency difference is small. Then,
-after it reports and clears the aggregated access frequency of each region, it
-splits each region into two or three regions if the total number of regions
-will not exceed the user-specified maximum number of regions after the split.
-
-In this way, DAMON provides its best-effort quality and minimal overhead while
-keeping the bounds users set for their trade-off.
-
-
-Dynamic Target Space Updates Handling
--------------------------------------
-
-The monitoring target address range could be dynamically changed. For example,
-virtual memory could be dynamically mapped and unmapped. Physical memory could
-be hot-plugged.
-
-As the changes could be quite frequent in some cases, DAMON allows the
-monitoring operations to check dynamic changes, including memory mapping
-changes, and applies them to the monitoring operations-related data
-structures, such as the abstracted monitoring target memory area, only once
-per user-specified time interval (``update interval``).
diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst
deleted file mode 100644
index dde7e2414ee6..000000000000
--- a/Documentation/vm/damon/faq.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========================
-Frequently Asked Questions
-==========================
-
-Why a new subsystem, instead of extending perf or other user space tools?
-==========================================================================
-
-First, because it needs to be as lightweight as possible so that it can be
-used online, any unnecessary overhead such as kernel - user space context
-switching cost should be avoided. Second, DAMON aims to be used by other
-programs including the kernel. Therefore, having a dependency on specific
-tools like perf is not desirable. These are the two biggest reasons why DAMON
-is implemented in the kernel space.
-
-
-Can 'idle pages tracking' or 'perf mem' substitute DAMON?
-=========================================================
-
-Idle page tracking is a low level primitive for access check of the physical
-address space. 'perf mem' is similar, though it can use sampling to minimize
-the overhead. On the other hand, DAMON is a higher-level framework for the
-monitoring of various address spaces. It is focused on memory management
-optimization and provides sophisticated accuracy/overhead handling mechanisms.
-Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of
-DAMON's output, but cannot substitute DAMON.
-
-
-Does DAMON support virtual memory only?
-=======================================
-
-No. The core of DAMON is address space independent.
The address space
-specific monitoring operations, including monitoring target region
-construction and actual access checks, can be implemented and configured on
-the DAMON core by the users. In this way, DAMON users can monitor any address
-space with any access check technique.
-
-Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
-implementations of the address space dependent functions for the virtual memory
-and the physical memory by default, for a reference and convenient use.
-
-
-Can I simply monitor page granularity?
-======================================
-
-Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the
-working set size divided by the page size. Because the monitoring target
-regions size is forced to be ``>=page size``, the region split will have no
-effect.
diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst
deleted file mode 100644
index 48c0bbff98b2..000000000000
--- a/Documentation/vm/damon/index.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========================
-DAMON: Data Access MONitor
-==========================
-
-DAMON is a data access monitoring framework subsystem for the Linux kernel.
-The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it
-
- - *accurate* (the monitoring output is useful enough for DRAM level memory
-   management; it might not be appropriate for CPU Cache levels, though),
- - *light-weight* (the monitoring overhead is low enough to be applied online),
-   and
- - *scalable* (the upper-bound of the overhead is in constant range regardless
-   of the size of target workloads).
-
-Using this framework, therefore, the kernel's memory management mechanisms can
-make advanced decisions. Experimental memory management optimization works
-that incur high data access monitoring overhead could be implemented again.
-In user space, meanwhile, users who have some special workloads can write
-personalized applications for better understanding and optimizations of their
-workloads and systems.
-
-.. toctree::
-   :maxdepth: 2
-
-   faq
-   design
-   api
diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst
deleted file mode 100644
index 8c05e62d8b2b..000000000000
--- a/Documentation/vm/free_page_reporting.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-.. _free_page_reporting:
-
-=====================
-Free Page Reporting
-=====================
-
-Free page reporting is an API by which a device can register to receive
-lists of pages that are currently unused by the system. This is useful in
-the case of virtualization where a guest is then able to use this data to
-notify the hypervisor that it is no longer using certain pages in memory.
-
-For the driver, typically a balloon driver, to use this functionality
-it will allocate and initialize a page_reporting_dev_info structure. The
-field within the structure it will populate is the "report" function
-pointer used to process the scatterlist. It must also guarantee that it can
-handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
-call to the function. A call to page_reporting_register will register the
-page reporting interface with the reporting framework assuming no other
-page reporting devices are already registered.
-
-Once registered the page reporting API will begin reporting batches of
-pages to the driver.
The API will start reporting pages 2 seconds after -the interface is registered and will continue to do so 2 seconds after any -page of a sufficiently high order is freed. - -Pages reported will be stored in the scatterlist passed to the reporting -function with the final entry having the end bit set in entry nent - 1. -While pages are being processed by the report function they will not be -accessible to the allocator. Once the report function has been completed -the pages will be returned to the free area from which they were obtained. - -Prior to removing a driver that is making use of free page reporting it -is necessary to call page_reporting_unregister to have the -page_reporting_dev_info structure that is currently in use by free page -reporting removed. Doing this will prevent further reports from being -issued via the interface. If another driver or the same driver is -registered it is possible for it to resume where the previous driver had -left off in terms of reporting free pages. - -Alexander Duyck, Dec 04, 2019 diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst deleted file mode 100644 index feecc5e24477..000000000000 --- a/Documentation/vm/frontswap.rst +++ /dev/null @@ -1,266 +0,0 @@ -.. _frontswap: - -========= -Frontswap -========= - -Frontswap provides a "transcendent memory" interface for swap pages. -In some environments, dramatic performance savings may be obtained because -swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap is so named because it can be thought of as the opposite of -a "backing" store for a swap device. The storage is assumed to be -a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming -to the requirements of transcendent memory (such as Xen's "tmem", or -in-kernel compressed memory, aka "zcache", or future RAM-like devices); -this pseudo-RAM device is not directly accessible or addressable by the -kernel and is of unknown and possibly time-varying size. The driver -links itself to frontswap by calling frontswap_register_ops to set the -frontswap_ops funcs appropriately and the functions it provides must -conform to certain policies as follows: - -An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "store" will -copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "load" will copy the page, if found, -from transcendent memory into kernel memory, but will NOT remove the page -from transcendent memory. An "invalidate_page" will remove the page -from transcendent memory and an "invalidate_area" will remove ALL pages -associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further stores with that swap type. - -Once a page is successfully stored, a matching load on the page will normally -succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the store returns -success, the data has been successfully saved to transcendent memory and -a disk write and, if the data is later read back, a disk read are avoided. -If a store returns failure, transcendent memory has rejected the data, and the -page can be written to swap as usual. 
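-
-To make the above concrete, a backend hooks in by filling a frontswap_ops
-structure and registering it. The following is only a minimal sketch: the
-"example_*" names are hypothetical, the hook shapes follow the policies
-described above, and the exact registration signature has varied across
-kernel versions::
-
-	#include <linux/frontswap.h>
-
-	static void example_init(unsigned type)
-	{
-		/* Prepare to receive pages for this swap device ("type"). */
-	}
-
-	static int example_store(unsigned type, pgoff_t offset,
-				 struct page *page)
-	{
-		/* Copy the page into the pseudo-RAM device. Returning
-		 * nonzero rejects the page; it then goes to the real
-		 * swap device as usual. */
-		return -1;
-	}
-
-	static int example_load(unsigned type, pgoff_t offset,
-				struct page *page)
-	{
-		/* Fill the page from the pseudo-RAM device, if present;
-		 * return 0 only if the page was found. */
-		return -1;
-	}
-
-	static void example_invalidate_page(unsigned type, pgoff_t offset)
-	{
-		/* Drop the single page at (type, offset). */
-	}
-
-	static void example_invalidate_area(unsigned type)
-	{
-		/* Drop all pages for this swap type (e.g. at swapoff). */
-	}
-
-	static struct frontswap_ops example_ops = {
-		.init		 = example_init,
-		.store		 = example_store,
-		.load		 = example_load,
-		.invalidate_page = example_invalidate_page,
-		.invalidate_area = example_invalidate_area,
-	};
-
-	/* From the backend's initialization code: */
-	frontswap_register_ops(&example_ops);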
- -Note that if a page is stored and the page already exists in transcendent memory -(a "duplicate" store), either the store succeeds and the data is overwritten, -or the store fails AND the page is invalidated. This ensures stale data may -never be obtained from frontswap. - -If properly configured, monitoring of frontswap is done via debugfs in -the `/sys/kernel/debug/frontswap` directory. The effectiveness of -frontswap can be measured (across all swap devices) with: - -``failed_stores`` - how many store attempts have failed - -``loads`` - how many loads were attempted (all should succeed) - -``succ_stores`` - how many store attempts have succeeded - -``invalidates`` - how many invalidates were attempted - -A backend implementation may provide additional metrics. - -FAQ -=== - -* Where's the value? - -When a workload starts swapping, performance falls through the floor. -Frontswap significantly increases performance in many such workloads by -providing a clean, dynamic interface to read and write swap pages to -"transcendent memory" that is otherwise not directly addressable to the kernel. -This interface is ideal when data is transformed to a different form -and size (such as with compression) or secretly moved (as might be -useful for write-balancing for some RAM-like devices). Swap pages (and -evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device". - -Frontswap with a fairly small impact on the kernel, -provides a huge amount of flexibility for more dynamic, flexible RAM -utilization in various system configurations: - -In the single kernel case, aka "zcache", pages are compressed and -stored in local memory, thus increasing the total anonymous pages -that can be safely kept in RAM. Zcache essentially trades off CPU -cycles used in compression/decompression for better memory utilization. -Benchmarks have shown little or no impact when memory pressure is -low while providing a significant performance improvement (25%+) -on some workloads under high memory pressure. - -"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory -support for clustered systems. Frontswap pages are locally compressed -as in zcache, but then "remotified" to another system's RAM. This -allows RAM to be dynamically load-balanced back-and-forth as needed, -i.e. when system A is overcommitted, it can swap to system B, and -vice versa. RAMster can also be configured as a memory server so -many servers in a cluster can swap, dynamically as needed, to a single -server configured with a large amount of RAM... without pre-configuring -how much of the RAM is available for each of the clients! - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to do -it well with no kernel changes have essentially failed (except in some -well-publicized special-case workloads). -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. 
with "selfballooning"), sudden unexpected -memory pressure may result in swapping; frontswap allows those pages -to be swapped to and from hypervisor RAM (if overall host system memory -conditions allow), thus mitigating the potentially awful performance impact -of unplanned swapping. - -A KVM implementation is underway and has been RFC'ed to lkml. And, -using frontswap, investigation is also underway on the use of NVM as -a memory extension technology. - -* Sure there may be performance advantages in some situations, but - what's the space/time overhead of frontswap? - -If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into -nothingness and the only overhead is a few extra bytes per swapon'ed -swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" -registers, there is one extra global variable compared to zero for -every swap page read or written. If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "store" -request (i.e. provides no memory despite claiming it might), -CPU overhead is still negligible -- and since every frontswap fail -precedes a swap page write-to-disk, the system is highly likely -to be I/O bound and using a small fraction of a percent of a CPU -will be irrelevant anyway. - -As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend -registers, one bit is allocated for every swap page for every swap -device that is swapon'd. This is added to the EIGHT bits (which -was sixteen until about 2.6.34) that the kernel already allocates -for every swap page for every swap device that is swapon'd. (Hugh -Dickins has observed that frontswap could probably steal one of -the existing eight bits, but let's worry about that minor optimization -later.) For very large swap disks (which are rare) on a standard -4K pagesize, this is 1MB per 32GB swap. - -When swap pages are stored in transcendent memory instead of written -out to disk, there is a side effect that this may create more memory -pressure that can potentially outweigh the other advantages. A -backend, such as zcache, must implement policies to carefully (but -dynamically) manage memory limits to ensure this doesn't happen. - -* OK, how about a quick overview of what this frontswap patch does - in terms that a kernel hacker can grok? - -Let's assume that a frontswap "backend" has registered during -kernel initialization; this registration indicates that this -frontswap backend has access to some "memory" that is not directly -accessible by the kernel. Exactly how much memory it provides is -entirely dynamic and random. - -Whenever a swap-device is swapon'd frontswap_init() is called, -passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "store" swap pages -associated with that number. - -Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_store is called. Frontswap -consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_store returns -1 and the kernel swaps the page -to the swap device as normal. Note that the response from the frontswap -backend is unpredictable to the kernel; it may choose to never accept a -page, it could accept every ninth page, or it might accept every -page. But if the backend does accept a page, the data from the page -has already been copied and associated with the type and offset, -and the backend guarantees the persistence of the data. 
In this case, -frontswap sets a bit in the "frontswap_map" for the swap device -corresponding to the page offset on the swap device to which it would -otherwise have written the data. - -When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_load() which checks the frontswap_map to -see if the page was earlier accepted by the frontswap backend. If -it was, the page of data is filled from the frontswap backend and -the swap-in is complete. If not, the normal swap-in code is -executed to obtain the page of data from the real swap device. - -So every time the frontswap backend accepts a page, a swap device read -and (potentially) a swap device write are replaced by a "frontswap backend -store" and (possibly) a "frontswap backend loads", which are presumably much -faster. - -* Can't frontswap be configured as a "special" swap device that is - just higher priority than any real swap device (e.g. like zswap, - or maybe swap-over-nbd/NFS)? - -No. First, the existing swap subsystem doesn't allow for any kind of -swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, -but this would require fairly drastic changes. Even if it were -rewritten, the existing swap subsystem uses the block I/O layer which -assumes a swap device is fixed size and any page in it is linearly -addressable. Frontswap barely touches the existing swap subsystem, -and works around the constraints of the block I/O subsystem to provide -a great deal of flexibility and dynamicity. - -For example, the acceptance of any swap page by the frontswap backend is -entirely unpredictable. This is critical to the definition of frontswap -backends because it grants completely dynamic discretion to the -backend. In zcache, one cannot know a priori how compressible a page is. -"Poorly" compressible pages can be rejected, and "poorly" can itself be -defined dynamically depending on current memory constraints. - -Further, frontswap is entirely synchronous whereas a real swap -device is, by definition, asynchronous and uses block I/O. The -block I/O layer is not only unnecessary, but may perform "optimizations" -that are inappropriate for a RAM-oriented device including delaying -the write of some pages for a significant amount of time. Synchrony is -required to ensure the dynamicity of the backend and to avoid thorny race -conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "store" -and "load" operations need be synchronous. A separate asynchronous thread -is free to manipulate the pages stored by frontswap. For example, -the "remotification" thread in RAMster uses standard asynchronous -kernel sockets to move compressed frontswap pages to a remote machine. -Similarly, a KVM guest-side implementation could do in-guest compression -and use "batched" hypercalls. - -In a virtualized environment, the dynamicity allows the hypervisor -(or host OS) to do "intelligent overcommit". For example, it can -choose to accept pages only until host-swapping might be imminent, -then force guests to do their own swapping. - -There is a downside to the transcendent memory specifications for -frontswap: Since any "store" might fail, there must always be a real -slot on a real swap device to swap the page. Thus frontswap must be -implemented as a "shadow" to every swapon'd device with the potential -capability of holding every page that the swap device might have held -and the possibility that it might hold no pages at all. 
This means
-that frontswap cannot contain more pages than the total of swapon'd
-swap devices. For example, if NO swap device is configured on some
-installation, frontswap is useless. Swapless portable devices
-can still use frontswap but a backend for such devices must configure
-some kind of "ghost" swap device and ensure that it is never used.
-
-* Why this weird definition about "duplicate stores"? If a page
-  has been previously successfully stored, can't it always be
-  successfully overwritten?
-
-Nearly always it can, but no, sometimes it cannot. Consider an example
-where data is compressed and the original 4K page has been compressed
-to 1K. Now an attempt is made to overwrite the page with data that
-is non-compressible and so would take the entire 4K. But the backend
-has no more space. In this case, the store must be rejected. Whenever
-frontswap rejects a store that would overwrite, it also must invalidate
-the old data and ensure that it is no longer accessible. Since the
-swap subsystem then writes the new data to the real swap device,
-this is the correct course of action to ensure coherency.
-
-* Why does the frontswap patch create the new include file swapfile.h?
-
-The frontswap code depends on some swap-subsystem-internal data
-structures that have, over the years, moved back and forth between
-static and global. This seemed a reasonable compromise: Define
-them as global but declare them in a new include file that isn't
-included by the large number of source files that include swap.h.
-
-Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst
deleted file mode 100644
index c9887f241c6c..000000000000
--- a/Documentation/vm/highmem.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _highmem:
-
-====================
-High Memory Handling
-====================
-
-By: Peter Zijlstra
-
-.. contents:: :local:
-
-What Is High Memory?
-====================
-
-High memory (highmem) is used when the size of physical memory approaches or
-exceeds the maximum size of virtual memory. At that point it becomes
-impossible for the kernel to keep all of the available physical memory mapped
-at all times. This means the kernel needs to start using temporary mappings of
-the pieces of physical memory that it wants to access.
-
-The part of (physical) memory not covered by a permanent mapping is what we
-refer to as 'highmem'. There are various architecture dependent constraints on
-where exactly that border lies.
-
-In the i386 arch, for example, we choose to map the kernel into every process's
-VM space so that we don't have to pay the full TLB invalidation costs for
-kernel entry/exit. This means the available virtual memory space (4GiB on
-i386) has to be divided between user and kernel space.
-
-The traditional split for architectures using this approach is 3:1, 3GiB for
-userspace and the top 1GiB for kernel space::
-
-	+--------+ 0xffffffff
-	| Kernel |
-	+--------+ 0xc0000000
-	|        |
-	| User   |
-	|        |
-	+--------+ 0x00000000
-
-This means that the kernel can at most map 1GiB of physical memory at any one
-time, but because we need virtual address space for other things - including
-temporary maps to access the rest of the physical memory - the actual direct
-map will typically be less (usually around ~896MiB).
-
-Other architectures that have mm context tagged TLBs can have separate kernel
-and user maps. Some hardware (like some ARMs), however, has limited virtual
-space when mm context tags are used.
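-
-For the permanently mapped part (lowmem), translating between a kernel
-virtual address and a physical address is pure arithmetic around
-PAGE_OFFSET; this is what the kernel's __pa()/__va() helpers do. A minimal
-sketch, valid only for direct-map addresses and never for highmem pages::
-
-	#include <linux/mm.h>	/* for __pa(), __va(), PAGE_OFFSET */
-
-	/* Roughly vaddr - PAGE_OFFSET on the i386 layout shown above. */
-	static phys_addr_t lowmem_virt_to_phys(void *vaddr)
-	{
-		return __pa(vaddr);
-	}
-
-	/* Roughly paddr + PAGE_OFFSET; the inverse of the above. */
-	static void *lowmem_phys_to_virt(phys_addr_t paddr)
-	{
-		return __va(paddr);
-	}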
-
-
-Temporary Virtual Mappings
-==========================
-
-The kernel contains several ways of creating temporary mappings. The following
-list shows them in order of preference of use.
-
-* kmap_local_page(). This function is used to acquire short term mappings.
-  It can be invoked from any context (including interrupts) but the mappings
-  can only be used in the context which acquired them.
-
-  This function should be preferred, where feasible, over all the others.
-
-  These mappings are thread-local and CPU-local, meaning that the mapping
-  can only be accessed from within this thread and the thread is bound to the
-  CPU while the mapping is active. Even if the thread is preempted (since
-  preemption is never disabled by the function) the CPU cannot be
-  unplugged from the system via CPU-hotplug until the mapping is disposed.
-
-  It's valid to take pagefaults in a local kmap region, unless the context
-  in which the local mapping is acquired does not allow it for other reasons.
-
-  kmap_local_page() always returns a valid virtual address and it is assumed
-  that kunmap_local() will never fail.
-
-  Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain
-  extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered
-  because the map implementation is stack based. See kmap_local_page() kdocs
-  (included in the "Functions" section) for details on how to manage nested
-  mappings.
-
-* kmap_atomic(). This permits a very short duration mapping of a single
-  page. Since the mapping is restricted to the CPU that issued it, it
-  performs well, but the issuing task is therefore required to stay on that
-  CPU until it has finished, lest some other task displace its mappings.
-
-  kmap_atomic() may also be used by interrupt contexts, since it does not
-  sleep and the callers too may not sleep until after kunmap_atomic() is
-  called.
-
-  Each call of kmap_atomic() in the kernel creates a non-preemptible section
-  and disables pagefaults. This could be a source of unwanted latency.
-  Therefore users should prefer kmap_local_page() instead of kmap_atomic().
-
-  It is assumed that k[un]map_atomic() won't fail.
-
-* kmap(). This should be used to make a short duration mapping of a single
-  page with no restrictions on preemption or migration. It comes with an
-  overhead as mapping space is restricted and protected by a global lock
-  for synchronization. When a mapping is no longer needed, the address that
-  the page was mapped to must be released with kunmap().
-
-  Mapping changes must be propagated across all the CPUs. kmap() also
-  requires global TLB invalidation when the kmap's pool wraps and it might
-  block when the mapping space is fully utilized until a slot becomes
-  available. Therefore, kmap() is only callable from preemptible context.
-
-  All the above work is necessary if a mapping must last for a relatively
-  long time but the bulk of high-memory mappings in the kernel are
-  short-lived and only used in one place. This means that the cost of
-  kmap() is mostly wasted in such cases. kmap() was not intended for long
-  term mappings but it has morphed in that direction and its use is
-  strongly discouraged in newer code and the set of the preceding functions
-  should be preferred.
-
-  On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have
-  no real work to do because a 64-bit address space is more than sufficient to
-  address all the physical memory whose pages are permanently mapped.
-
-* vmap().
This can be used to make a long duration mapping of multiple - physical pages into a contiguous virtual space. It needs global - synchronization to unmap. - - -Cost of Temporary Mappings -========================== - -The cost of creating temporary mappings can be quite high. The arch has to -manipulate the kernel's page tables, the data TLB and/or the MMU's registers. - -If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping -simply with a bit of arithmetic that will convert the page struct address into -a pointer to the page contents rather than juggling mappings about. In such a -case, the unmap operation may be a null operation. - -If CONFIG_MMU is not set, then there can be no temporary mappings and no -highmem. In such a case, the arithmetic approach will also be used. - - -i386 PAE -======== - -The i386 arch, under some circumstances, will permit you to stick up to 64GiB -of RAM into your 32-bit machine. This has a number of consequences: - -* Linux needs a page-frame structure for each page in the system and the - pageframes need to live in the permanent mapping, which means: - -* you can have 896M/sizeof(struct page) page-frames at most; with struct - page being 32-bytes that would end up being something in the order of 112G - worth of pages; the kernel, however, needs to store more than just - page-frames in that memory... - -* PAE makes your page tables larger - which slows the system down as more - data has to be accessed to traverse in TLB fills and the like. One - advantage is that PAE has more PTE bits and can provide advanced features - like NX and PAT. - -The general recommendation is that you don't use more than 8GiB on a 32-bit -machine - although more might work for you and your workload, you're pretty -much on your own - don't expect kernel developers to really care much if things -come apart. - - -Functions -========= - -.. kernel-doc:: include/linux/highmem.h -.. kernel-doc:: include/linux/highmem-internal.h diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst deleted file mode 100644 index f2a59ed82ed3..000000000000 --- a/Documentation/vm/hmm.rst +++ /dev/null @@ -1,452 +0,0 @@ -.. _hmm: - -===================================== -Heterogeneous Memory Management (HMM) -===================================== - -Provide infrastructure and helpers to integrate non-conventional memory (device -memory like GPU on board memory) into regular kernel path, with the cornerstone -of this being specialized struct page for such memory (see sections 5 to 7 of -this document). - -HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., -allowing a device to transparently access program addresses coherently with -the CPU meaning that any valid pointer on the CPU is also a valid pointer -for the device. This is becoming mandatory to simplify the use of advanced -heterogeneous computing where GPU, DSP, or FPGA are used to perform various -computations on behalf of a process. - -This document is divided as follows: in the first section I expose the problems -related to using device specific memory allocators. In the second section, I -expose the hardware limitations that are inherent to many platforms. The third -section gives an overview of the HMM design. The fourth section explains how -CPU page-table mirroring works and the purpose of HMM in this context. The -fifth section deals with how device memory is represented inside the kernel. 
-Finally, the last section presents a new migration helper that allows -leveraging the device DMA engine. - -.. contents:: :local: - -Problems of using a device specific memory allocator -==================================================== - -Devices with a large amount of on board memory (several gigabytes) like GPUs -have historically managed their memory through dedicated driver specific APIs. -This creates a disconnect between memory allocated and managed by a device -driver and regular application memory (private anonymous, shared memory, or -regular file backed memory). From here on I will refer to this aspect as split -address space. I use shared address space to refer to the opposite situation: -i.e., one in which any application memory region can be used by a device -transparently. - -Split address space happens because devices can only access memory allocated -through a device specific API. This implies that all memory objects in a program -are not equal from the device point of view which complicates large programs -that rely on a wide set of libraries. - -Concretely, this means that code that wants to leverage devices like GPUs needs -to copy objects between generically allocated memory (malloc, mmap private, mmap -share) and memory allocated through the device driver API (this still ends up -with an mmap but of the device file). - -For flat data sets (array, grid, image, ...) this isn't too hard to achieve but -for complex data sets (list, tree, ...) it's hard to get right. Duplicating a -complex data set needs to re-map all the pointer relations between each of its -elements. This is error prone and programs get harder to debug because of the -duplicate data set and addresses. - -Split address space also means that libraries cannot transparently use data -they are getting from the core program or another library and thus each library -might have to duplicate its input data set using the device specific memory -allocator. Large projects suffer from this and waste resources because of the -various memory copies. - -Duplicating each library API to accept as input or output memory allocated by -each device specific allocator is not a viable option. It would lead to a -combinatorial explosion in the library entry points. - -Finally, with the advance of high level language constructs (in C++ but in -other languages too) it is now possible for the compiler to leverage GPUs and -other devices without programmer knowledge. Some compiler identified patterns -are only do-able with a shared address space. It is also more reasonable to use -a shared address space for all other patterns. - - -I/O bus, device memory characteristics -====================================== - -I/O buses cripple shared address spaces due to a few limitations. Most I/O -buses only allow basic memory access from device to main memory; even cache -coherency is often optional. Access to device memory from a CPU is even more -limited. More often than not, it is not cache coherent. - -If we only consider the PCIE bus, then a device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. However, it only allows -a limited set of atomic operations from the device on main memory. This is worse -in the other direction: the CPU can only access a limited range of the device -memory and cannot perform atomic operations on it. Thus device memory cannot -be considered the same as regular memory from the kernel point of view. 
- -Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). -The final limitation is latency. Access to main memory from the device has an -order of magnitude higher latency than when the device accesses its own memory. - -Some platforms are developing new I/O buses or additions/modifications to PCIE -to address some of these limitations (OpenCAPI, CCIX). They mainly allow -two-way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Sadly, not all platforms are following this trend and -some major architectures are left without hardware solutions to these problems. - -So for shared address space to make sense, not only must we allow devices to -access any memory but we must also permit any memory to be migrated to device -memory while the device is using it (blocking CPU access while it happens). - - -Shared address space and migration -================================== - -HMM intends to provide two main features. The first one is to share the address -space by duplicating the CPU page table in the device page table so the same -address points to the same physical memory for any valid main memory address in -the process address space. - -To achieve this, HMM offers a set of helpers to populate the device page table -while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. To update the device page table, you must -allocate a buffer (or use a pool of pre-allocated buffers) and write GPU -specific commands in it to perform the update (unmap, cache invalidations, and -flush, ...). This cannot be done through common code for all devices. Hence -why HMM provides helpers to factor out everything that can be while leaving the -hardware specific details to the device driver. - -The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that -allows allocating a struct page for each page of device memory. Those pages -are special because the CPU cannot map them. However, they allow migrating -main memory to device memory using existing migration mechanisms and everything -looks like a page that is swapped out to disk from the CPU point of view. Using a -struct page gives the easiest and cleanest integration with existing mm -mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE -memory for the device memory and second to perform migration. Policy decisions -of what and when to migrate is left to the device driver. - -Note that any CPU access to a device page triggers a page fault and a migration -back to main memory. For example, when a page backing a given CPU address A is -migrated from a main memory page to a device page, then any CPU access to -address A triggers a page fault and initiates a migration back to main memory. - -With these two features, HMM not only allows a device to mirror process address -space and keeps both CPU and device page tables synchronized, but also -leverages device memory by migrating the part of the data set that is actively being -used by the device. - - -Address space mirroring implementation and API -============================================== - -Address space mirroring's main objective is to allow duplication of a range of -CPU page table into a device page table; HMM helps keep both synchronized. 
A
-device driver that wants to mirror a process address space must start with the
-registration of a mmu_interval_notifier::
-
- int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
-				  struct mm_struct *mm, unsigned long start,
-				  unsigned long length,
-				  const struct mmu_interval_notifier_ops *ops);
-
-During the ops->invalidate() callback the device driver must perform the
-update action to the range (mark range read only, or fully unmap, etc.). The
-device must complete the update before the driver callback returns.
-
-When the device driver wants to populate a range of virtual addresses, it can
-use::
-
-  int hmm_range_fault(struct hmm_range *range);
-
-It will trigger a page fault on missing or read-only entries if write access is
-requested (see below). Page faults use the generic mm page fault code path just
-like a CPU page fault.
-
-Both functions copy CPU page table entries into their pfns array argument. Each
-entry in that array corresponds to an address in the virtual range. HMM
-provides a set of flags to help the driver identify special CPU page table
-entries.
-
-Locking within the sync_cpu_device_pagetables() callback is the most important
-aspect the driver must respect in order to keep things properly synchronized.
-The usage pattern is::
-
- int driver_populate_range(...)
- {
-      struct hmm_range range;
-      ...
-
-      range.notifier = &interval_sub;
-      range.start = ...;
-      range.end = ...;
-      range.hmm_pfns = ...;
-
-      if (!mmget_not_zero(interval_sub->notifier.mm))
-          return -EFAULT;
-
- again:
-      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
-      mmap_read_lock(mm);
-      ret = hmm_range_fault(&range);
-      if (ret) {
-          mmap_read_unlock(mm);
-          if (ret == -EBUSY)
-              goto again;
-          return ret;
-      }
-      mmap_read_unlock(mm);
-
-      take_lock(driver->update);
-      if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
-          release_lock(driver->update);
-          goto again;
-      }
-
-      /* Use pfns array content to update device page table,
-       * under the update lock */
-
-      release_lock(driver->update);
-      return 0;
- }
-
-The driver->update lock is the same lock that the driver takes inside its
-invalidate() callback. That lock must be held before calling
-mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
-update.
-
-Leverage default_flags and pfn_flags_mask
-=========================================
-
-The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that
-specify fault or snapshot policy for the whole range instead of having to set
-them for each entry in the pfns array.
-
-For instance if the device driver wants pages for a range with at least read
-permission, it sets::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = 0;
-
-and calls hmm_range_fault() as described above. This will fault in all pages
-in the range with at least read permission.
-
-Now let's say the driver wants to do the same except for one page in the range
-for which it wants to have write permission. Now the driver sets::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
-    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
-
-With this, HMM will fault in all pages with at least read (i.e., valid)
-permission and, for the address == range->start + (index_of_write <<
-PAGE_SHIFT), it will fault with write permission, i.e., if the CPU pte does
-not have write permission set then HMM
-will call handle_mm_fault().
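-
-Once mmu_interval_read_retry() confirms the snapshot is still valid, the
-driver walks the pfns array to program its own page tables. A sketch of that
-walk (struct drv_device and drv_map_one() are hypothetical driver-side names;
-the HMM_PFN_* flags and hmm_pfn_to_page() come from include/linux/hmm.h)::
-
-	#include <linux/hmm.h>
-
-	struct drv_device;	/* hypothetical per-device state */
-
-	/* Hypothetical driver hook that installs one device PTE. */
-	void drv_map_one(struct drv_device *dev, unsigned long addr,
-			 struct page *page, bool writable);
-
-	static void drv_apply_range(struct drv_device *dev,
-				    struct hmm_range *range)
-	{
-		unsigned long i, npages;
-
-		npages = (range->end - range->start) >> PAGE_SHIFT;
-		for (i = 0; i < npages; i++) {
-			unsigned long entry = range->hmm_pfns[i];
-
-			if (!(entry & HMM_PFN_VALID))
-				continue;	/* hole or unpopulated entry */
-
-			/* Program one device PTE; grant write access only
-			 * when the CPU page table allows it. */
-			drv_map_one(dev, range->start + (i << PAGE_SHIFT),
-				    hmm_pfn_to_page(entry),
-				    !!(entry & HMM_PFN_WRITE));
-		}
-	}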
- -After hmm_range_fault completes the flag bits are set to the current state of -the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is -writable. - - -Represent and manage device memory from core kernel point of view -================================================================= - -Several different designs were tried to support device memory. The first one -used a device specific data structure to keep information about migrated memory -and HMM hooked itself in various places of mm code to handle any access to -addresses that were backed by device memory. It turns out that this ended up -replicating most of the fields of struct page and also needed many kernel code -paths to be updated to understand this new kind of memory. - -Most kernel code paths never try to access the memory behind a page -but only care about struct page contents. Because of this, HMM switched to -directly using struct page for device memory which left most kernel code paths -unaware of the difference. We only need to make sure that no one ever tries to -map those pages from the CPU side. - -Migration to and from device memory -=================================== - -Because the CPU cannot access device memory directly, the device driver must -use hardware DMA or device specific load/store instructions to migrate data. -The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() -functions are designed to make drivers easier to write and to centralize common -code across drivers. - -Before migrating pages to device private memory, special device private -``struct page`` need to be created. These will be used as special "swap" -page table entries so that a CPU process will fault if it tries to access -a page that has been migrated to device private memory. - -These can be allocated and freed with:: - - struct resource *res; - struct dev_pagemap pagemap; - - res = request_free_mem_region(&iomem_resource, /* number of bytes */, - "name of driver resource"); - pagemap.type = MEMORY_DEVICE_PRIVATE; - pagemap.range.start = res->start; - pagemap.range.end = res->end; - pagemap.nr_range = 1; - pagemap.ops = &device_devmem_ops; - memremap_pages(&pagemap, numa_node_id()); - - memunmap_pages(&pagemap); - release_mem_region(pagemap.range.start, range_len(&pagemap.range)); - -There are also devm_request_free_mem_region(), devm_memremap_pages(), -devm_memunmap_pages(), and devm_release_mem_region() when the resources can -be tied to a ``struct device``. - -The overall migration steps are similar to migrating NUMA pages within system -memory (see :ref:`Page migration `) but the steps are split -between device driver specific code and shared common code: - -1. ``mmap_read_lock()`` - - The device driver has to pass a ``struct vm_area_struct`` to - migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to - be held for the duration of the migration. - -2. ``migrate_vma_setup(struct migrate_vma *args)`` - - The device driver initializes the ``struct migrate_vma`` fields and passes - the pointer to migrate_vma_setup(). The ``args->flags`` field is used to - filter which source pages should be migrated. For example, setting - ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and - ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in - device private memory. If the latter flag is set, the ``args->pgmap_owner`` - field is used to identify device private pages owned by the driver. 
This
-   avoids trying to migrate device private pages residing in other devices.
-   Currently only anonymous private VMA ranges can be migrated to or from
-   system memory and device private memory.
-
-   One of the first steps migrate_vma_setup() does is to invalidate other
-   devices' MMUs with the ``mmu_notifier_invalidate_range_start()`` and
-   ``mmu_notifier_invalidate_range_end()`` calls around the page table
-   walks to fill in the ``args->src`` array with PFNs to be migrated.
-   The ``invalidate_range_start()`` callback is passed a
-   ``struct mmu_notifier_range`` with the ``event`` field set to
-   ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
-   the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This
-   allows the device driver to skip the invalidation callback and only
-   invalidate device private MMU mappings that are actually migrating.
-   This is explained more in the next section.
-
-   While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()``
-   entry results in a valid "zero" PFN stored in the ``args->src`` array.
-   This lets the driver allocate device private memory and clear it instead
-   of copying a page of zeros. Valid PTE entries to system memory or
-   device private struct pages will be locked with ``lock_page()``, isolated
-   from the LRU (if system memory since device private pages are not on
-   the LRU), unmapped from the process, and a special migration PTE is
-   inserted in place of the original PTE.
-   migrate_vma_setup() also clears the ``args->dst`` array.
-
-3. The device driver allocates destination pages and copies source pages to
-   destination pages.
-
-   The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE``
-   bit is set and skips entries that are not migrating. The device driver
-   can also choose to skip migrating a page by not filling in the ``dst``
-   array for that page.
-
-   The driver then allocates either a device private struct page or a
-   system memory page, locks the page with ``lock_page()``, and fills in the
-   ``dst`` array entry with::
-
-     dst[i] = migrate_pfn(page_to_pfn(dpage));
-
-   Now that the driver knows that this page is being migrated, it can
-   invalidate device private MMU mappings and copy device private memory
-   to system memory or another device private page. The core Linux kernel
-   handles CPU page table invalidations so the device driver only has to
-   invalidate its own MMU mappings.
-
-   The driver can use ``migrate_pfn_to_page(src[i])`` to get the
-   ``struct page`` of the source and either copy the source page to the
-   destination or clear the destination device private memory if the pointer
-   is ``NULL`` meaning the source page was not populated in system memory.
-
-4. ``migrate_vma_pages()``
-
-   This step is where the migration is actually "committed".
-
-   If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this
-   is where the newly allocated page is inserted into the CPU's page table.
-   This can fail if a CPU thread faults on the same page. However, the page
-   table is locked and only one of the new pages will be inserted.
-   The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared
-   if it loses the race.
-
-   If the source page was locked, isolated, etc. the source ``struct page``
-   information is now copied to the destination ``struct page``, finalizing
-   the migration on the CPU side.
-
-5. Device driver updates device MMU page tables for pages still migrating,
-   rolling back pages not migrating.
-
-   If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device
-   driver can update the device MMU and set the write enable bit if the
-   ``MIGRATE_PFN_WRITE`` bit is set.
-
-6. ``migrate_vma_finalize()``
-
-   This step replaces the special migration page table entry with the new
-   page's page table entry and releases the reference to the source and
-   destination ``struct page``.
-
-7. ``mmap_read_unlock()``
-
-   The lock can now be released.
-
-Exclusive access memory
-=======================
-
-Some devices have features such as atomic PTE bits that can be used to
-implement atomic access to system memory. To support atomic operations to a
-shared virtual memory page such a device needs access to that page which is
-exclusive of any userspace access from the CPU. The
-``make_device_exclusive_range()`` function can be used to make a memory range
-inaccessible from userspace.
-
-This replaces all mappings for pages in the given range with special swap
-entries. Any attempt to access the swap entry results in a fault which is
-resolved by replacing the entry with the original mapping. A driver gets
-notified that the mapping has been changed by MMU notifiers, after which point
-it will no longer have exclusive access to the page. Exclusive access is
-guaranteed to last until the driver drops the page lock and page reference, at
-which point any CPU faults on the page may proceed as described.
-
-Memory cgroup (memcg) and rss accounting
-========================================
-
-For now, device memory is accounted as any regular page in rss counters (either
-anonymous if device page is used for anonymous, file if device page is used for
-file backed page, or shmem if device page is used for shared memory). This is a
-deliberate choice to keep existing applications, that might start using device
-memory without knowing about it, running unimpacted.
-
-A drawback is that the OOM killer might kill an application using a lot of
-device memory and not a lot of regular system memory and thus not freeing much
-system memory. We want to gather more real world experience on how applications
-and system react under memory pressure in the presence of device memory before
-deciding to account device memory differently.
-
-
-The same decision was made for memory cgroup. Device memory pages are accounted
-against the same memory cgroup a regular page would be accounted to. This does
-simplify migration to and from device memory. This also means that migration
-back from device memory to regular memory cannot fail because it would
-go above the memory cgroup limit. We might revisit this choice later on once we
-get more experience in how device memory is used and its impact on memory
-resource control.
-
-
-Note that device memory can never be pinned by a device driver nor through GUP
-and thus such memory is always freed upon process exit. Or when the last
-reference is dropped in case of shared memory or file backed memory.
diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/vm/hugetlbfs_reserv.rst
deleted file mode 100644
index f143954e0d05..000000000000
--- a/Documentation/vm/hugetlbfs_reserv.rst
+++ /dev/null
@@ -1,596 +0,0 @@
-.. _hugetlbfs_reserve:
-
-=====================
-Hugetlbfs Reservation
-=====================
-
-Overview
-========
-
-Huge pages as described at :ref:`hugetlbpage` are typically
-preallocated for application use. These huge pages are instantiated in a
-task's address space at page fault time if the VMA indicates huge pages are
-to be used.
If no huge page exists at page fault time, the task is sent -a SIGBUS and often dies an unhappy death. Shortly after huge page support -was added, it was determined that it would be better to detect a shortage -of huge pages at mmap() time. The idea is that if there were not enough -huge pages to cover the mapping, the mmap() would fail. This was first -done with a simple check in the code at mmap() time to determine if there -were enough free huge pages to cover the mapping. Like most things in the -kernel, the code has evolved over time. However, the basic idea was to -'reserve' huge pages at mmap() time to ensure that huge pages would be -available for page faults in that mapping. The description below attempts to -describe how huge page reserve processing is done in the v4.10 kernel. - - -Audience -======== -This description is primarily targeted at kernel developers who are modifying -hugetlbfs code. - - -The Data Structures -=================== - -resv_huge_pages - This is a global (per-hstate) count of reserved huge pages. Reserved - huge pages are only available to the task which reserved them. - Therefore, the number of huge pages generally available is computed - as (``free_huge_pages - resv_huge_pages``). -Reserve Map - A reserve map is described by the structure:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - There is one reserve map for each huge page mapping in the system. - The regions list within the resv_map describes the regions within - the mapping. A region is described as:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - The 'from' and 'to' fields of the file region structure are huge page - indices into the mapping. Depending on the type of mapping, a - region in the reserv_map may indicate reservations exist for the - range, or reservations do not exist. -Flags for MAP_PRIVATE Reservations - These are stored in the bottom bits of the reservation map pointer. - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - Indicates this task is the owner of the reservations - associated with the mapping. - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - Indicates task originally mapping this range (and creating - reserves) has unmapped a page from this task (the child) - due to a failed COW. -Page Flags - The PagePrivate page flag is used to indicate that a huge page - reservation must be restored when the huge page is freed. More - details will be discussed in the "Freeing huge pages" section. - - -Reservation Map Location (Private or Shared) -============================================ - -A huge page mapping or segment is either private or shared. If private, -it is typically only available to a single address space (task). If shared, -it can be mapped into multiple address spaces (tasks). The location and -semantics of the reservation map is significantly different for the two types -of mappings. Location differences are: - -- For private mappings, the reservation map hangs off the VMA structure. - Specifically, vma->vm_private_data. This reserve map is created at the - time the mapping (mmap(MAP_PRIVATE)) is created. -- For shared mappings, the reservation map hangs off the inode. Specifically, - inode->i_mapping->private_data. Since shared mappings are always backed - by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode - contains a reservation map. 
As a result, the reservation map is allocated
-when the inode is created.
-
-
-Creating Reservations
-=====================
-Reservations are created when a huge page backed shared memory segment is
-created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
-These operations result in a call to the routine hugetlb_reserve_pages()::
-
-	int hugetlb_reserve_pages(struct inode *inode,
-				  long from, long to,
-				  struct vm_area_struct *vma,
-				  vm_flags_t vm_flags)
-
-The first thing hugetlb_reserve_pages() does is check if the NORESERVE
-flag was specified in either the shmget() or mmap() call. If NORESERVE
-was specified, then this routine returns immediately as no reservations
-are desired.
-
-The arguments 'from' and 'to' are huge page indices into the mapping or
-underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to
-the length of the segment/mapping. For mmap(), the offset argument could
-be used to specify the offset into the underlying file. In such a case,
-the 'from' and 'to' arguments have been adjusted by this offset.
-
-One of the big differences between PRIVATE and SHARED mappings is the way
-in which reservations are represented in the reservation map.
-
-- For shared mappings, an entry in the reservation map indicates a reservation
-  exists or did exist for the corresponding page. As reservations are
-  consumed, the reservation map is not modified.
-- For private mappings, the lack of an entry in the reservation map indicates
-  a reservation exists for the corresponding page. As reservations are
-  consumed, entries are added to the reservation map. Therefore, the
-  reservation map can also be used to determine which reservations have
-  been consumed.
-
-For private mappings, hugetlb_reserve_pages() creates the reservation map and
-hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set
-to indicate this VMA owns the reservations.
-
-The reservation map is consulted to determine how many huge page reservations
-are needed for the current mapping/segment. For private mappings, this is
-always the value (to - from). However, for shared mappings it is possible that
-some reservations may already exist within the range (to - from). See the
-section :ref:`Reservation Map Modifications <resv_map_modifications>`
-for details on how this is accomplished.
-
-The mapping may be associated with a subpool. If so, the subpool is consulted
-to ensure there is sufficient space for the mapping. It is possible that the
-subpool has set aside reservations that can be used for the mapping. See the
-section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
-
-After consulting the reservation map and subpool, the number of needed new
-reservations is known. The routine hugetlb_acct_memory() is called to check
-for and take the requested number of reservations. hugetlb_acct_memory()
-calls into routines that potentially allocate and adjust surplus page counts.
-However, within those routines the code is simply checking to ensure there
-are enough free huge pages to accommodate the reservation. If there are,
-the global reservation count resv_huge_pages is adjusted something like the
-following::
-
-	if (resv_needed <= (free_huge_pages - resv_huge_pages))
-		resv_huge_pages += resv_needed;
-
-Note that the global lock hugetlb_lock is held when checking and adjusting
-these counters.
-
-If there were enough free huge pages and the global count resv_huge_pages
-was adjusted, then the reservation map associated with the mapping is
-modified to reflect the reservations.
In the case of a shared mapping, a
-file_region will exist that includes the range 'from' - 'to'. For private
-mappings, no modifications are made to the reservation map as the lack of an
-entry indicates a reservation exists.
-
-If hugetlb_reserve_pages() was successful, the global reservation count and
-reservation map associated with the mapping will be modified as required to
-ensure reservations exist for the range 'from' - 'to'.
-
-.. _consume_resv:
-
-Consuming Reservations/Allocating a Huge Page
-=============================================
-
-Reservations are consumed when huge pages associated with the reservations
-are allocated and instantiated in the corresponding mapping. The allocation
-is performed within the routine alloc_huge_page()::
-
-	struct page *alloc_huge_page(struct vm_area_struct *vma,
-				     unsigned long addr, int avoid_reserve)
-
-alloc_huge_page is passed a VMA pointer and a virtual address, so it can
-consult the reservation map to determine if a reservation exists. In addition,
-alloc_huge_page takes the argument avoid_reserve which indicates reserves
-should not be used even if it appears they have been set aside for the
-specified address. The avoid_reserve argument is most often used in the case
-of Copy on Write and Page Migration where additional copies of an existing
-page are being allocated.
-
-The helper routine vma_needs_reservation() is called to determine if a
-reservation exists for the address within the mapping (vma). See the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
-information on what this routine does.
-The value returned from vma_needs_reservation() is generally 0 or 1:
-0 if a reservation exists for the address, 1 if no reservation exists.
-If a reservation does not exist, and there is a subpool associated with the
-mapping, the subpool is consulted to determine if it contains reservations.
-If the subpool contains reservations, one can be used for this allocation.
-However, in every case the avoid_reserve argument overrides the use of
-a reservation for the allocation. After determining whether a reservation
-exists and can be used for the allocation, the routine dequeue_huge_page_vma()
-is called. This routine takes two arguments related to reservations:
-
-- avoid_reserve, this is the same value/argument passed to alloc_huge_page().
-- chg, even though this argument is of type long, only the values 0 or 1 are
-  passed to dequeue_huge_page_vma. If the value is 0, it indicates a
-  reservation exists (see the section "Memory Policy and Reservations" for
-  possible issues). If the value is 1, it indicates a reservation does not
-  exist and the page must be taken from the global free pool if possible.
-
-The free lists associated with the memory policy of the VMA are searched for
-a free page. If a page is found, the value free_huge_pages is decremented
-when the page is removed from the free list. If there was a reservation
-associated with the page, the following adjustments are made::
-
-	SetPagePrivate(page);	/* Indicates allocating this page consumed
-				 * a reservation, and if an error is
-				 * encountered such that the page must be
-				 * freed, the reservation will be restored. */
-	resv_huge_pages--;	/* Decrement the global reservation count */
-
-Note, if no huge page can be found that satisfies the VMA's memory policy,
-an attempt will be made to allocate one using the buddy allocator. This
-brings up the issue of surplus huge pages and overcommit, which is beyond
-the scope of reservations.
Even if a surplus page is allocated, the same -reservation based adjustments as above will be made: SetPagePrivate(page) and -resv_huge_pages--. - -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. - -The routine vma_commit_reservation() is then called to adjust the reserve -map based on the consumption of the reservation. In general, this involves -ensuring the page is represented within a file_region structure of the region -map. For shared mappings where the reservation was present, an entry -in the reserve map already existed so no change is made. However, if there -was no reservation in a shared mapping or this was a private mapping a new -entry must be created. - -It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would -be possible if hugetlb_reserve_pages was called for the same page in a shared -mapping. In such cases, the reservation count and subpool free page count -will be off by one. This rare condition can be identified by comparing the -return value from vma_needs_reservation and vma_commit_reservation. If such -a race is detected, the subpool and global reserve counts are adjusted to -compensate. See the section -:ref:`Reservation Map Helper Routines ` for more -information on these routines. - - -Instantiate Huge Pages -====================== - -After huge page allocation, the page is typically added to the page tables -of the allocating task. Before this, pages in a shared mapping are added -to the page cache and pages in private mappings are added to an anonymous -reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore, -when a huge page that has been instantiated is freed no adjustment is made -to the global reservation count (resv_huge_pages). - - -Freeing Huge Pages -================== - -Huge page freeing is performed by the routine free_huge_page(). This routine -is the destructor for hugetlbfs compound pages. As a result, it is only -passed a pointer to the page struct. When a huge page is freed, reservation -accounting may need to be performed. This would be the case if the page was -associated with a subpool that contained reserves, or the page is being freed -on an error path where a global reserve count must be restored. - -The page->private field points to any subpool associated with the page. -If the PagePrivate flag is set, it indicates the global reserve count should -be adjusted (see the section -:ref:`Consuming Reservations/Allocating a Huge Page ` -for information on how these are set). - -The routine first calls hugepage_subpool_put_pages() for the page. If this -routine returns a value of 0 (which does not equal the value passed 1) it -indicates reserves are associated with the subpool, and this newly free page -must be used to keep the number of subpool reserves above the minimum size. -Therefore, the global resv_huge_pages counter is incremented in this case. - -If the PagePrivate flag was set in the page, the global resv_huge_pages counter -will always be incremented. - -.. _sub_pool_resv: - -Subpool Reservations -==================== - -There is a struct hstate associated with each huge page size. The hstate -tracks all huge pages of the specified size. 
A subpool represents a subset -of pages within a hstate that is associated with a mounted hugetlbfs -filesystem. - -When a hugetlbfs filesystem is mounted a min_size option can be specified -which indicates the minimum number of huge pages required by the filesystem. -If this option is specified, the number of huge pages corresponding to -min_size are reserved for use by the filesystem. This number is tracked in -the min_hpages field of a struct hugepage_subpool. At mount time, -hugetlb_acct_memory(min_hpages) is called to reserve the specified number of -huge pages. If they can not be reserved, the mount fails. - -The routines hugepage_subpool_get/put_pages() are called when pages are -obtained from or released back to a subpool. They perform all subpool -accounting, and track any reservations associated with the subpool. -hugepage_subpool_get/put_pages are passed the number of huge pages by which -to adjust the subpool 'used page' count (down for get, up for put). Normally, -they return the same value that was passed or an error if not enough pages -exist in the subpool. - -However, if reserves are associated with the subpool a return value less -than the passed value may be returned. This return value indicates the -number of additional global pool adjustments which must be made. For example, -suppose a subpool contains 3 reserved huge pages and someone asks for 5. -The 3 reserved pages associated with the subpool can be used to satisfy part -of the request. But, 2 pages must be obtained from the global pools. To -relay this information to the caller, the value 2 is returned. The caller -is then responsible for attempting to obtain the additional two pages from -the global pools. - - -COW and Reservations -==================== - -Since shared mappings all point to and use the same underlying pages, the -biggest reservation concern for COW is private mappings. In this case, -two tasks can be pointing at the same previously allocated page. One task -attempts to write to the page, so a new page must be allocated so that each -task points to its own page. - -When the page was originally allocated, the reservation for that page was -consumed. When an attempt to allocate a new page is made as a result of -COW, it is possible that no free huge pages are free and the allocation -will fail. - -When the private mapping was originally created, the owner of the mapping -was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation -map of the owner. Since the owner created the mapping, the owner owns all -the reservations associated with the mapping. Therefore, when a write fault -occurs and there is no page available, different action is taken for the owner -and non-owner of the reservation. - -In the case where the faulting task is not the owner, the fault will fail and -the task will typically receive a SIGBUS. - -If the owner is the faulting task, we want it to succeed since it owned the -original reservation. To accomplish this, the page is unmapped from the -non-owning task. In this way, the only reference is from the owning task. -In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer -of the non-owning task. The non-owning task may receive a SIGBUS if it later -faults on a non-present page. But, the original owner of the -mapping/reservation will behave as expected. - - -.. _resv_map_modifications: - -Reservation Map Modifications -============================= - -The following low level routines are used to make modifications to a -reservation map. 
Typically, these routines are not called directly. Rather, -a reservation map helper routine is called which calls one of these low level -routines. These low level routines are fairly well documented in the source -code (mm/hugetlb.c). These routines are:: - - long region_chg(struct resv_map *resv, long f, long t); - long region_add(struct resv_map *resv, long f, long t); - void region_abort(struct resv_map *resv, long f, long t); - long region_count(struct resv_map *resv, long f, long t); - -Operations on the reservation map typically involve two operations: - -1) region_chg() is called to examine the reserve map and determine how - many pages in the specified range [f, t) are NOT currently represented. - - The calling code performs global checks and allocations to determine if - there are enough huge pages for the operation to succeed. - -2) - a) If the operation can succeed, region_add() is called to actually modify - the reservation map for the same range [f, t) previously passed to - region_chg(). - b) If the operation can not succeed, region_abort is called for the same - range [f, t) to abort the operation. - -Note that this is a two step process where region_add() and region_abort() -are guaranteed to succeed after a prior call to region_chg() for the same -range. region_chg() is responsible for pre-allocating any data structures -necessary to ensure the subsequent operations (specifically region_add())) -will succeed. - -As mentioned above, region_chg() determines the number of pages in the range -which are NOT currently represented in the map. This number is returned to -the caller. region_add() returns the number of pages in the range added to -the map. In most cases, the return value of region_add() is the same as the -return value of region_chg(). However, in the case of shared mappings it is -possible for changes to the reservation map to be made between the calls to -region_chg() and region_add(). In this case, the return value of region_add() -will not match the return value of region_chg(). It is likely that in such -cases global counts and subpool accounting will be incorrect and in need of -adjustment. It is the responsibility of the caller to check for this condition -and make the appropriate adjustments. - -The routine region_del() is called to remove regions from a reservation map. -It is typically called in the following situations: - -- When a file in the hugetlbfs filesystem is being removed, the inode will - be released and the reservation map freed. Before freeing the reservation - map, all the individual file_region structures must be freed. In this case - region_del is passed the range [0, LONG_MAX). -- When a hugetlbfs file is being truncated. In this case, all allocated pages - after the new file size must be freed. In addition, any file_region entries - in the reservation map past the new end of file must be deleted. In this - case, region_del is passed the range [new_end_of_file, LONG_MAX). -- When a hole is being punched in a hugetlbfs file. In this case, huge pages - are removed from the middle of the file one at a time. As the pages are - removed, region_del() is called to remove the corresponding entry from the - reservation map. In this case, region_del is passed the range - [page_idx, page_idx + 1). - -In every case, region_del() will return the number of pages removed from the -reservation map. In VERY rare cases, region_del() can fail. 
This can only -happen in the hole punch case where it has to split an existing file_region -entry and can not allocate a new structure. In this error case, region_del() -will return -ENOMEM. The problem here is that the reservation map will -indicate that there is a reservation for the page. However, the subpool and -global reservation counts will not reflect the reservation. To handle this -situation, the routine hugetlb_fix_reserve_counts() is called to adjust the -counters so that they correspond with the reservation map entry that could -not be deleted. - -region_count() is called when unmapping a private huge page mapping. In -private mappings, the lack of a entry in the reservation map indicates that -a reservation exists. Therefore, by counting the number of entries in the -reservation map we know how many reservations were consumed and how many are -outstanding (outstanding = (end - start) - region_count(resv, start, end)). -Since the mapping is going away, the subpool and global reservation counts -are decremented by the number of outstanding reservations. - -.. _resv_map_helpers: - -Reservation Map Helper Routines -=============================== - -Several helper routines exist to query and modify the reservation maps. -These routines are only interested with reservations for a specific huge -page, so they just pass in an address instead of a range. In addition, -they pass in the associated VMA. From the VMA, the type of mapping (private -or shared) and the location of the reservation map (inode or VMA) can be -determined. These routines simply call the underlying routines described -in the section "Reservation Map Modifications". However, they do take into -account the 'opposite' meaning of reservation map entries for private and -shared mappings and hide this detail from the caller:: - - long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This routine calls region_chg() for the specified page. If no reservation -exists, 1 is returned. If a reservation exists, 0 is returned:: - - long vma_commit_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This calls region_add() for the specified page. As in the case of region_chg -and region_add, this routine is to be called after a previous call to -vma_needs_reservation. It will add a reservation entry for the page. It -returns 1 if the reservation was added and 0 if not. The return value should -be compared with the return value of the previous call to -vma_needs_reservation. An unexpected difference indicates the reservation -map was modified between calls:: - - void vma_end_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This calls region_abort() for the specified page. As in the case of region_chg -and region_abort, this routine is to be called after a previous call to -vma_needs_reservation. It will abort/end the in progress reservation add -operation:: - - long vma_add_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This is a special wrapper routine to help facilitate reservation cleanup -on error paths. It is only called from the routine restore_reserve_on_error(). -This routine is used in conjunction with vma_needs_reservation in an attempt -to add a reservation to the reservation map. It takes into account the -different reservation map semantics for private and shared mappings. 
Hence, -region_add is called for shared mappings (as an entry present in the map -indicates a reservation), and region_del is called for private mappings (as -the absence of an entry in the map indicates a reservation). See the section -"Reservation cleanup in error paths" for more information on what needs to -be done on error paths. - - -Reservation Cleanup in Error Paths -================================== - -As mentioned in the section -:ref:`Reservation Map Helper Routines `, reservation -map modifications are performed in two steps. First vma_needs_reservation -is called before a page is allocated. If the allocation is successful, -then vma_commit_reservation is called. If not, vma_end_reservation is called. -Global and subpool reservation counts are adjusted based on success or failure -of the operation and all is well. - -Additionally, after a huge page is instantiated the PagePrivate flag is -cleared so that accounting when the page is ultimately freed is correct. - -However, there are several instances where errors are encountered after a huge -page is allocated but before it is instantiated. In this case, the page -allocation has consumed the reservation and made the appropriate subpool, -reservation map and global count adjustments. If the page is freed at this -time (before instantiation and clearing of PagePrivate), then free_huge_page -will increment the global reservation count. However, the reservation map -indicates the reservation was consumed. This resulting inconsistent state -will cause the 'leak' of a reserved huge page. The global reserve count will -be higher than it should and prevent allocation of a pre-allocated page. - -The routine restore_reserve_on_error() attempts to handle this situation. It -is fairly well documented. The intention of this routine is to restore -the reservation map to the way it was before the page allocation. In this -way, the state of the reservation map will correspond to the global reservation -count after the page is freed. - -The routine restore_reserve_on_error itself may encounter errors while -attempting to restore the reservation map entry. In this case, it will -simply clear the PagePrivate flag of the page. In this way, the global -reserve count will not be incremented when the page is freed. However, the -reservation map will continue to look as though the reservation was consumed. -A page can still be allocated for the address, but it will not use a reserved -page as originally intended. - -There is some code (most notably userfaultfd) which can not call -restore_reserve_on_error. In this case, it simply modifies the PagePrivate -so that a reservation will not be leaked when the huge page is freed. - - -Reservations and Memory Policy -============================== -Per-node huge page lists existed in struct hstate when git was first used -to manage Linux code. The concept of reservations was added some time later. -When reservations were added, no attempt was made to take memory policy -into account. While cpusets are not exactly the same as memory policy, this -comment in hugetlb_acct_memory sums up the interaction between reservations -and cpusets/memory policy:: - - /* - * When cpuset is configured, it breaks the strict hugetlb page - * reservation as the accounting is done on a global variable. Such - * reservation is completely rubbish in the presence of cpuset because - * the reservation is not checked against page availability for the - * current cpuset. 
Application can still potentially OOM'ed by kernel - * with lack of free htlb page in cpuset that the task is in. - * Attempt to enforce strict accounting with cpuset is almost - * impossible (or too ugly) because cpuset is too fluid that - * task or memory node can be dynamically moved between cpusets. - * - * The change of semantics for shared hugetlb mapping with cpuset is - * undesirable. However, in order to preserve some of the semantics, - * we fall back to check against current free page availability as - * a best attempt and hopefully to minimize the impact of changing - * semantics that cpuset has. - */ - -Huge page reservations were added to prevent unexpected page allocation -failures (OOM) at page fault time. However, if an application makes use -of cpusets or memory policy there is no guarantee that huge pages will be -available on the required nodes. This is true even if there are a sufficient -number of global reservations. - -Hugetlbfs regression testing -============================ - -The most complete set of hugetlb tests are in the libhugetlbfs repository. -If you modify any hugetlb related code, use the libhugetlbfs test suite -to check for regressions. In addition, if you add any new hugetlb -functionality, please add appropriate tests to libhugetlbfs. - --- -Mike Kravetz, 7 April 2017 diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst deleted file mode 100644 index b9d5253c1305..000000000000 --- a/Documentation/vm/hwpoison.rst +++ /dev/null @@ -1,184 +0,0 @@ -.. hwpoison: - -======== -hwpoison -======== - -What is hwpoison? -================= - -Upcoming Intel CPUs have support for recovering from some memory errors -(``MCA recovery``). This requires the OS to declare a page "poisoned", -kill the processes associated with it and avoid using it in the future. - -This patchkit implements the necessary infrastructure in the VM. - -To quote the overview comment:: - - High level machine check handler. Handles pages reported by the - hardware as being corrupted usually due to a 2bit ECC memory or cache - failure. - - This focusses on pages detected as corrupted in the background. - When the current CPU tries to consume corruption the currently - running process can just be killed directly instead. This implies - that if the error cannot be handled for some reason it's safe to - just ignore it because no corruption has been consumed yet. Instead - when that happens another machine check will happen. - - Handles page cache pages in various states. The tricky part - here is that we can access any page asynchronous to other VM - users, because memory failures could happen anytime and anywhere, - possibly violating some of their assumptions. This is why this code - has to be extremely careful. Generally it tries to use normal locking - rules, as in get the standard locks, even if that means the - error handling takes potentially a long time. - - Some of the operations here are somewhat inefficient and have non - linear algorithmic complexity, because the data structures have not - been optimized for this case. This is in particular the case - for the mapping from a vma to a process. Since this case is expected - to be rare we hope we can get away with this. - -The code consists of a the high level handler in mm/memory-failure.c, -a new page poison bit and various checks in the VM to handle poisoned -pages. - -The main target right now is KVM guests, but it works for all kinds -of applications. KVM support requires a recent qemu-kvm release. 
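-
-A minimal sketch of a recovery-aware application is shown below. This is
-purely illustrative (the handler name is made up, error handling is elided,
-and fprintf() is not async-signal-safe); it opts in to the early kill mode
-and the PR_MCE_KILL interface described in the following sections::
-
-	#include <signal.h>
-	#include <stdio.h>
-	#include <stdlib.h>
-	#include <sys/prctl.h>
-
-	/* Invoked with BUS_MCEERR_AO when a background error is detected. */
-	static void mce_handler(int sig, siginfo_t *si, void *ctx)
-	{
-		if (si->si_code == BUS_MCEERR_AO) {
-			/* si->si_addr is the poisoned address; a real
-			 * application would drop or rebuild the affected
-			 * object here instead of just logging it. */
-			fprintf(stderr, "memory error at %p\n", si->si_addr);
-		} else {
-			abort();	/* consumed error (BUS_MCEERR_AR) */
-		}
-	}
-
-	int main(void)
-	{
-		struct sigaction sa = {
-			.sa_sigaction	= mce_handler,
-			.sa_flags	= SA_SIGINFO,
-		};
-
-		sigaction(SIGBUS, &sa, NULL);
-		/* Ask for early kill for this thread. */
-		prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
-		/* ... application work ... */
-		return 0;
-	}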
-
-For the KVM use case, a new signal type was needed so that
-KVM can inject the machine check into the guest with the proper
-address. In theory this also allows other applications to handle
-memory failures. The expectation is that nearly all applications
-won't do that, but some very specialized ones might.
-
-Failure recovery modes
-======================
-
-There are two (actually three) modes memory failure recovery can be in:
-
-vm.memory_failure_recovery sysctl set to zero:
-	All memory failures cause a panic. Do not attempt recovery.
-
-early kill
-	(can be controlled globally and per process)
-	Send SIGBUS to the application as soon as the error is detected.
-	This allows applications that can process memory errors in a gentle
-	way (e.g. drop the affected object).
-	This is the mode used by KVM qemu.
-
-late kill
-	Send SIGBUS when the application runs into the corrupted page.
-	This is best for memory error unaware applications and is the
-	default. Note that some pages are always handled as late kill.
-
-User control
-============
-
-vm.memory_failure_recovery
-	See sysctl.txt
-
-vm.memory_failure_early_kill
-	Enable early kill mode globally
-
-PR_MCE_KILL
-	Set early/late kill mode/revert to system default
-
-	arg1: PR_MCE_KILL_CLEAR:
-		Revert to system default
-	arg1: PR_MCE_KILL_SET:
-		arg2 defines thread specific mode
-
-		PR_MCE_KILL_EARLY:
-			Early kill
-		PR_MCE_KILL_LATE:
-			Late kill
-		PR_MCE_KILL_DEFAULT
-			Use system global default
-
-	Note that if you want to have a dedicated thread which handles
-	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
-	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
-	the SIGBUS is sent to the main thread.
-
-PR_MCE_KILL_GET
-	return current mode
-
-Testing
-=======
-
-* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
-  process for testing
-
-* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
-
-  corrupt-pfn
-	Inject hwpoison fault at PFN echoed into this file. This does
-	some early filtering to avoid corrupting unintended pages in
-	test suites.
-
-  unpoison-pfn
-	Software-unpoison page at PFN echoed into this file. This way
-	a page can be reused. This only works for Linux
-	injected failures, not for real memory failures. Once any hardware
-	memory failure happens, this feature is disabled.
-
-  Note that these injection interfaces are not stable and might change
-  between kernel versions.
-
-  corrupt-filter-dev-major, corrupt-filter-dev-minor
-	Only handle memory failures to pages associated with the file
-	system defined by block device major/minor. -1U is the
-	wildcard value. This should only be used for testing with
-	artificial injection.
-
-  corrupt-filter-memcg
-	Limit injection to pages owned by a memcg, specified by the inode
-	number of the memcg.
-
-	Example::
-
-		mkdir /sys/fs/cgroup/mem/hwpoison
-
-		usemem -m 100 -s 1000 &
-		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-
-		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-
-		page-types -p `pidof init`   --hwpoison  # shall do nothing
-		page-types -p `pidof usemem` --hwpoison  # poison its pages
-
-  corrupt-filter-flags-mask, corrupt-filter-flags-value
-	When specified, only poison pages if ((page_flags & mask) ==
-	value). This allows stress testing of many kinds of
-	pages. The page_flags are the same as in /proc/kpageflags.
The - flag bits are defined in include/linux/kernel-page-flags.h and - documented in Documentation/admin-guide/mm/pagemap.rst - -* Architecture specific MCE injector - - x86 has mce-inject, mce-test - - Some portable hwpoison test programs in mce-test, see below. - -References -========== - -http://halobates.de/mce-lc09-2.pdf - Overview presentation from LinuxCon 09 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git - Test suite (hwpoison specific portable tests in tsrc) - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git - x86 specific injector - - -Limitations -=========== -- Not all page types are supported and never will. Most kernel internal - objects cannot be recovered, only LRU pages for now. - ---- -Andi Kleen, Oct 2009 diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst deleted file mode 100644 index 575ccd40e30c..000000000000 --- a/Documentation/vm/index.rst +++ /dev/null @@ -1,68 +0,0 @@ -===================================== -Linux Memory Management Documentation -===================================== - -Memory Management Guide -======================= - -This is a guide to understanding the memory management subsystem -of Linux. If you are looking for advice on simply allocating memory, -see the :ref:`memory_allocation`. For controlling and tuning guides, -see the :doc:`admin guide <../admin-guide/mm/index>`. - -.. toctree:: - :maxdepth: 1 - - physical_memory - page_tables - process_addrs - bootmem - page_allocation - vmalloc - slab - highmem - page_reclaim - swap - page_cache - shmfs - oom - -Legacy Documentation -==================== - -This is a collection of older documents about the Linux memory management -(MM) subsystem internals with different level of details ranging from -notes and mailing list responses for elaborating descriptions of data -structures and algorithms. It should all be integrated nicely into the -above structured documentation, or deleted if it has served its purpose. - -.. toctree:: - :maxdepth: 1 - - active_mm - arch_pgtable_helpers - balance - damon/index - free_page_reporting - frontswap - hmm - hwpoison - hugetlbfs_reserv - ksm - memory-model - mmu_notifier - numa - overcommit-accounting - page_migration - page_frags - page_owner - page_table_check - remap_file_pages - slub - split_page_table_lock - transhuge - unevictable-lru - vmalloced-kernel-stacks - vmemmap_dedup - z3fold - zsmalloc diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst deleted file mode 100644 index 9e37add068e6..000000000000 --- a/Documentation/vm/ksm.rst +++ /dev/null @@ -1,87 +0,0 @@ -.. _ksm: - -======================= -Kernel Samepage Merging -======================= - -KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, -added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation, -and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/ - -The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst ` - -Design -====== - -Overview --------- - -.. kernel-doc:: mm/ksm.c - :DOC: Overview - -Reverse mapping ---------------- -KSM maintains reverse mapping information for KSM pages in the stable -tree. - -If a KSM page is shared between less than ``max_page_sharing`` VMAs, -the node of the stable tree that represents such KSM page points to a -list of struct rmap_item and the ``page->mapping`` of the -KSM page points to the stable tree node. - -When the sharing passes this threshold, KSM adds a second dimension to -the stable tree. 
The tree node becomes a "chain" that links one or
-more "dups". Each "dup" keeps reverse mapping information for a KSM
-page with ``page->mapping`` pointing to that "dup".
-
-Every "chain" and all "dups" linked into a "chain" enforce the
-invariant that they represent the same write protected memory content,
-even though each "dup" is pointed to by a different KSM page copy of
-that content.
-
-This way the stable tree lookup computational complexity is unaffected
-compared to an unlimited list of reverse mappings. It is still
-enforced that there cannot be KSM page content duplicates in the
-stable tree itself.
-
-The deduplication limit enforced by ``max_page_sharing`` is required
-to avoid the virtual memory rmap lists growing too large. The rmap
-walk has O(N) complexity where N is the number of rmap_items
-(i.e. virtual mappings) that are sharing the page, which is in turn
-capped by ``max_page_sharing``. So this effectively spreads the linear
-O(N) computational complexity from the rmap walk context over different
-KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
-but N is the number of stable_node "dups", not the number of
-rmap_items, so it does not have a significant impact on ksmd
-performance. In practice the best stable_node "dup" candidate will be
-kept and found at the head of the "dups" list.
-
-High values of ``max_page_sharing`` result in faster memory merging
-(because there will be fewer stable_node dups queued into the
-stable_node chain->hlist to check for pruning) and a higher
-deduplication factor, at the expense of a slower worst case for rmap
-walks for any KSM page, which can happen during swapping, compaction,
-NUMA balancing and page migration.
-
-The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
-``max_page_sharing`` tunable, and a high ratio may indicate fragmentation
-in the stable_node dups. Such fragmentation could be solved by introducing
-fragmentation algorithms in ksmd which would refile rmap_items from
-one stable_node dup to another stable_node dup, in order to free up
-stable_node "dups" with few rmap_items in them. However, that may increase
-the ksmd CPU usage and possibly slow down the read-only computations on
-the KSM pages of the applications.
-
-The whole list of stable_node "dups" linked in the stable_node
-"chains" is scanned periodically in order to prune stale stable_nodes.
-The frequency of such scans is defined by the
-``stable_node_chains_prune_millisecs`` sysfs tunable.
-
-Reference
----------
-.. kernel-doc:: mm/ksm.c
-   :functions: mm_slot ksm_scan stable_node rmap_item
-
---
-Izik Eidus,
-Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
deleted file mode 100644
index 30e8fbed6914..000000000000
--- a/Documentation/vm/memory-model.rst
+++ /dev/null
@@ -1,177 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _physical_memory_model:
-
-=====================
-Physical Memory Model
-=====================
-
-Physical memory in a system may be addressed in different ways. The
-simplest case is when the physical memory starts at address 0 and
-spans a contiguous range up to the maximal address. It could be,
-however, that this range contains small holes that are not accessible
-for the CPU. Then there could be several contiguous ranges at
-completely distinct addresses. And, don't forget about NUMA, where
-different memory banks are attached to different CPUs.
-
-Linux abstracts this diversity using one of the two memory models:
-FLATMEM and SPARSEMEM.
Each architecture defines what -memory models it supports, what the default memory model is and -whether it is possible to manually override that default. - -All the memory models track the status of physical page frames using -struct page arranged in one or more arrays. - -Regardless of the selected memory model, there exists one-to-one -mapping between the physical page frame number (PFN) and the -corresponding `struct page`. - -Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn` -helpers that allow the conversion from PFN to `struct page` and vice -versa. - -FLATMEM -======= - -The simplest memory model is FLATMEM. This model is suitable for -non-NUMA systems with contiguous, or mostly contiguous, physical -memory. - -In the FLATMEM memory model, there is a global `mem_map` array that -maps the entire physical memory. For most architectures, the holes -have entries in the `mem_map` array. The `struct page` objects -corresponding to the holes are never fully initialized. - -To allocate the `mem_map` array, architecture specific setup code should -call :c:func:`free_area_init` function. Yet, the mappings array is not -usable until the call to :c:func:`memblock_free_all` that hands all the -memory to the page allocator. - -An architecture may free parts of the `mem_map` array that do not cover the -actual physical pages. In such case, the architecture specific -:c:func:`pfn_valid` implementation should take the holes in the -`mem_map` into account. - -With FLATMEM, the conversion between a PFN and the `struct page` is -straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the -`mem_map` array. - -The `ARCH_PFN_OFFSET` defines the first page frame number for -systems with physical memory starting at address different from 0. - -SPARSEMEM -========= - -SPARSEMEM is the most versatile memory model available in Linux and it -is the only memory model that supports several advanced features such -as hot-plug and hot-remove of the physical memory, alternative memory -maps for non-volatile memory devices and deferred initialization of -the memory map for larger systems. - -The SPARSEMEM model presents the physical memory as a collection of -sections. A section is represented with struct mem_section -that contains `section_mem_map` that is, logically, a pointer to an -array of struct pages. However, it is stored with some other magic -that aids the sections management. The section size and maximal number -of section is specified using `SECTION_SIZE_BITS` and -`MAX_PHYSMEM_BITS` constants defined by each architecture that -supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a -physical address that an architecture supports, the -`SECTION_SIZE_BITS` is an arbitrary value. - -The maximal number of sections is denoted `NR_MEM_SECTIONS` and -defined as - -.. math:: - - NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} - -The `mem_section` objects are arranged in a two-dimensional array -called `mem_sections`. The size and placement of this array depend -on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of -sections: - -* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections` - array is static and has `NR_MEM_SECTIONS` rows. Each row holds a - single `mem_section` object. -* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections` - array is dynamically allocated. Each row contains PAGE_SIZE worth of - `mem_section` objects and the number of rows is calculated to fit - all the memory sections. 
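-
-To make the arithmetic above concrete, the following sketch shows how a PFN
-selects a section. The constants are only an example (roughly the x86_64
-values); each architecture defines its own::
-
-	#define PAGE_SHIFT		12
-	#define SECTION_SIZE_BITS	27	/* 128M sections */
-	#define MAX_PHYSMEM_BITS	46
-
-	#define NR_MEM_SECTIONS		(1UL << (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS))
-	#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
-
-	/* The number of the section that maps a given page frame. */
-	static inline unsigned long pfn_to_section_nr(unsigned long pfn)
-	{
-		return pfn >> PFN_SECTION_SHIFT;
-	}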
- -The architecture setup code should call sparse_init() to -initialize the memory sections and the memory maps. - -With SPARSEMEM there are two possible ways to convert a PFN to the -corresponding `struct page` - a "classic sparse" and "sparse -vmemmap". The selection is made at build time and it is determined by -the value of `CONFIG_SPARSEMEM_VMEMMAP`. - -The classic sparse encodes the section number of a page in page->flags -and uses high bits of a PFN to access the section that maps that page -frame. Inside a section, the PFN is the index to the array of pages. - -The sparse vmemmap uses a virtually mapped memory map to optimize -pfn_to_page and page_to_pfn operations. There is a global `struct -page *vmemmap` pointer that points to a virtually contiguous array of -`struct page` objects. A PFN is an index to that array and the -offset of the `struct page` from `vmemmap` is the PFN of that -page. - -To use vmemmap, an architecture has to reserve a range of virtual -addresses that will map the physical pages containing the memory -map and make sure that `vmemmap` points to that range. In addition, -the architecture should implement :c:func:`vmemmap_populate` method -that will allocate the physical memory and create page tables for the -virtual memory map. If an architecture does not have any special -requirements for the vmemmap mappings, it can use default -:c:func:`vmemmap_populate_basepages` provided by the generic memory -management. - -The virtually mapped memory map allows storing `struct page` objects -for persistent memory devices in pre-allocated storage on those -devices. This storage is represented with struct vmem_altmap -that is eventually passed to vmemmap_populate() through a long chain -of function calls. The vmemmap_populate() implementation may use the -`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to -allocate memory map on the persistent memory device. - -ZONE_DEVICE -=========== -The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer -`struct page` `mem_map` services for device driver identified physical -address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact -that the page objects for these address ranges are never marked online, -and that a reference must be taken against the device, not just the page -to keep the memory pinned for active use. `ZONE_DEVICE`, via -:c:func:`devm_memremap_pages`, performs just enough memory hotplug to -turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and -:c:func:`get_user_pages` service for the given range of pfns. Since the -page reference count never drops below 1 the page is never tracked as -free memory and the page's `struct list_head lru` space is repurposed -for back referencing to the host device / driver that mapped the memory. - -While `SPARSEMEM` presents memory as a collection of sections, -optionally collected into memory blocks, `ZONE_DEVICE` users have a need -for smaller granularity of populating the `mem_map`. Given that -`ZONE_DEVICE` memory is never marked online it is subsequently never -subject to its memory ranges being exposed through the sysfs memory -hotplug api on memory block boundaries. The implementation relies on -this lack of user-api constraint to allow sub-section sized memory -ranges to be specified to :c:func:`arch_add_memory`, the top-half of -memory hotplug. Sub-section support allows for 2MB as the cross-arch -common alignment granularity for :c:func:`devm_memremap_pages`. 
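-
-From a device driver's point of view the setup can be sketched as follows.
-This is only an illustration: the function is hypothetical, error handling
-is minimal and the exact ``struct dev_pagemap`` fields have varied between
-kernel versions::
-
-	#include <linux/device.h>
-	#include <linux/memremap.h>
-
-	/* Map one range of device memory so it gets struct pages. */
-	static void *example_map_device_memory(struct device *dev,
-					       u64 start, u64 size)
-	{
-		struct dev_pagemap *pgmap;
-
-		pgmap = devm_kzalloc(dev, sizeof(*pgmap), GFP_KERNEL);
-		if (!pgmap)
-			return ERR_PTR(-ENOMEM);
-
-		pgmap->type = MEMORY_DEVICE_GENERIC;
-		pgmap->range = (struct range) {
-			.start	= start,
-			.end	= start + size - 1,
-		};
-		pgmap->nr_range = 1;
-
-		/* After this, pfn_to_page()/page_to_pfn() work for the range. */
-		return devm_memremap_pages(dev, pgmap);
-	}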
- -The users of `ZONE_DEVICE` are: - -* pmem: Map platform persistent memory to be used as a direct-I/O target - via DAX mappings. - -* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` - event callbacks to allow a device-driver to coordinate memory management - events related to device-memory, typically GPU memory. See - Documentation/vm/hmm.rst. - -* p2pdma: Create `struct page` objects to allow peer devices in a - PCI/-E topology to coordinate direct-DMA operations between themselves, - i.e. bypass host memory. diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst deleted file mode 100644 index df5d7777fc6b..000000000000 --- a/Documentation/vm/mmu_notifier.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. _mmu_notifier: - -When do you need to notify inside page table lock ? -=================================================== - -When clearing a pte/pmd we are given a choice to notify the event through -(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under -the page table lock. But that notification is not necessary in all cases. - -For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use -thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a -process virtual address space). There is only 2 cases when you need to notify -those secondary TLB while holding page table lock when clearing a pte/pmd: - - A) page backing address is free before mmu_notifier_invalidate_range_end() - B) a page table entry is updated to point to a new page (COW, write fault - on zero page, __replace_page(), ...) - -Case A is obvious you do not want to take the risk for the device to write to -a page that might now be used by some completely different task. - -Case B is more subtle. For correctness it requires the following sequence to -happen: - - - take page table lock - - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) - - set page table entry to point to new page - -If clearing the page table entry is not followed by a notify before setting -the new pte/pmd value then you can break memory model like C11 or C++11 for -the device. - -Consider the following scenario (device use a feature similar to ATS/PASID): - -Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume -they are write protected for COW (other case of B apply too). 
- -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {try to write to addrA} - CPU-thread-1 {try to write to addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA and populate device TLB} - DEV-thread-2 {read addrB and populate device TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] ------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} - CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {write to addrA which is a write to new page} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {write to addrB which is a write to new page} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA from old page} - DEV-thread-2 {read addrB from new page} - -So here because at time N+2 the clear page table entry was not pair with a -notification to invalidate the secondary TLB, the device see the new value for -addrB before seeing the new value for addrA. This break total memory ordering -for the device. - -When changing a pte to write protect or to point to a new write protected page -with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range -call to mmu_notifier_invalidate_range_end() outside the page table lock. This -is true even if the thread doing the page table update is preempted right after -releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst deleted file mode 100644 index 99fdeca917ca..000000000000 --- a/Documentation/vm/numa.rst +++ /dev/null @@ -1,150 +0,0 @@ -.. _numa: - -Started Nov 1999 by Kanoj Sarcar - -============= -What is NUMA? -============= - -This question can be answered from a couple of perspectives: the -hardware view and the Linux software view. - -From the hardware perspective, a NUMA system is a computer platform that -comprises multiple components or assemblies each of which may contain 0 -or more CPUs, local memory, and/or IO buses. For brevity and to -disambiguate the hardware view of these physical components/assemblies -from the software abstraction thereof, we'll call the components/assemblies -'cells' in this document. - -Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset -of the system--although some components necessary for a stand-alone SMP system -may not be populated on any given cell. 
The cells of the NUMA system are
-connected together with some sort of system interconnect--e.g., a crossbar or
-point-to-point link are common types of NUMA system interconnects. Both of
-these types of interconnects can be aggregated to create NUMA platforms with
-cells at multiple distances from other cells.
-
-For Linux, the NUMA platforms of interest are primarily what is known as Cache
-Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible
-to and accessible from any CPU attached to any cell and cache coherency
-is handled in hardware by the processor caches and/or the system interconnect.
-
-Memory access time and effective memory bandwidth vary depending on how far
-away the cell containing the CPU or IO bus making the memory access is from the
-cell containing the target memory. For example, access to memory by CPUs
-attached to the same cell will experience faster access times and higher
-bandwidths than accesses to memory on other, remote cells. NUMA platforms
-can have cells at multiple remote distances from any given cell.
-
-Platform vendors don't build NUMA systems just to make software developers'
-lives interesting. Rather, this architecture is a means to provide scalable
-memory bandwidth. However, to achieve scalable memory bandwidth, system and
-application software must arrange for a large majority of the memory references
-[cache misses] to be to "local" memory--memory on the same cell, if any--or
-to the closest cell with memory.
-
-This leads to the Linux software view of a NUMA system:
-
-Linux divides the system's hardware resources into multiple software
-abstractions called "nodes". Linux maps the nodes onto the physical cells
-of the hardware platform, abstracting away some of the details for some
-architectures. As with physical cells, software nodes may contain 0 or more
-CPUs, memory and/or IO buses. And, again, memory accesses to memory on
-"closer" nodes--nodes that map to closer cells--will generally experience
-faster access times and higher effective bandwidth than accesses to more
-remote cells.
-
-For some architectures, such as x86, Linux will "hide" any node representing a
-physical cell that has no memory attached, and reassign any CPUs attached to
-that cell to a node representing a cell that does have memory. Thus, on
-these architectures, one cannot assume that all CPUs that Linux associates with
-a given node will see the same local memory access times and bandwidth.
-
-In addition, for some architectures, again x86 is an example, Linux supports
-the emulation of additional nodes. For NUMA emulation, Linux will carve up
-the existing nodes--or the system memory for non-NUMA platforms--into multiple
-nodes. Each emulated node will manage a fraction of the underlying cells'
-physical memory. NUMA emulation is useful for testing NUMA kernel and
-application features on non-NUMA platforms, and as a sort of memory resource
-management mechanism when used together with cpusets.
-[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-For each node with memory, Linux constructs an independent memory management
-subsystem, complete with its own free page lists, in-use page lists, usage
-statistics and locks to mediate access. In addition, Linux constructs for
-each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
-an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a
-selected zone/node cannot satisfy the allocation request.
This situation, -when a zone has no available memory to satisfy a request, is called -"overflow" or "fallback". - -Because some nodes contain multiple zones containing different types of -memory, Linux must decide whether to order the zonelists such that allocations -fall back to the same zone type on a different node, or to a different zone -type on the same node. This is an important consideration because some zones, -such as DMA or DMA32, represent relatively scarce resources. Linux chooses -a default Node ordered zonelist. This means it tries to fallback to other zones -from the same node before using remote nodes which are ordered by NUMA distance. - -By default, Linux will attempt to satisfy memory allocation requests from the -node to which the CPU that executes the request is assigned. Specifically, -Linux will attempt to allocate from the first node in the appropriate zonelist -for the node where the request originates. This is called "local allocation." -If the "local" node cannot satisfy the request, the kernel will examine other -nodes' zones in the selected zonelist looking for the first zone in the list -that can satisfy the request. - -Local allocation will tend to keep subsequent access to the allocated memory -"local" to the underlying physical resources and off the system interconnect-- -as long as the task on whose behalf the kernel allocated some memory does not -later migrate away from that memory. The Linux scheduler is aware of the -NUMA topology of the platform--embodied in the "scheduling domains" data -structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler -attempts to minimize task migration to distant scheduling domains. However, -the scheduler does not take a task's NUMA footprint into account directly. -Thus, under sufficient imbalance, tasks can migrate between nodes, remote -from their initial node and kernel data structures. - -System administrators and application designers can restrict a task's migration -to improve NUMA locality using various CPU affinity command line interfaces, -such as taskset(1) and numactl(1), and program interfaces such as -sched_setaffinity(2). Further, one can modify the kernel's default local -allocation behavior using Linux NUMA memory policy. [see -:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. - -System administrators can restrict the CPUs and nodes' memories that a non- -privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] - -On architectures that do not hide memoryless nodes, Linux will include only -zones [nodes] with memory in the zonelists. This means that for a memoryless -node the "local memory node"--the node of the first zone in CPU's node's -zonelist--will not be the node itself. Rather, it will be the node that the -kernel selected as the nearest node with memory when it built the zonelists. -So, default, local allocations will succeed with the kernel supplying the -closest available memory. This is a consequence of the same mechanism that -allows such allocations to fallback to other nearby nodes when a node that -does contain memory overflows. - -Some kernel allocations do not want or cannot tolerate this allocation fallback -behavior. Rather they want to be sure they get memory from the specified node -or get notified that the node has no free memory. This is usually the case when -a subsystem allocates per CPU memory resources, for example. 
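-
-As a sketch of the allocation model described in the next paragraph
-(``struct foo`` and the fallback policy are illustrative only)::
-
-	/* Allocate from the node of the current CPU, or fall back. */
-	struct foo *alloc_foo_local(void)
-	{
-		int nid = numa_node_id();
-		struct foo *f;
-
-		/* __GFP_THISNODE: fail rather than fall back to other nodes. */
-		f = kmalloc_node(sizeof(*f), GFP_KERNEL | __GFP_THISNODE, nid);
-		if (!f)
-			f = kmalloc(sizeof(*f), GFP_KERNEL);
-		return f;
-	}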
- -A typical model for making such an allocation is to obtain the node id of the -node to which the "current CPU" is attached using one of the kernel's -numa_node_id() or CPU_to_node() functions and then request memory from only -the node id returned. When such an allocation fails, the requesting subsystem -may revert to its own fallback path. The slab kernel memory allocator is an -example of this. Or, the subsystem may choose to disable or not to enable -itself on allocation failure. The kernel profiling subsystem is an example of -this. - -If the architecture supports--does not hide--memoryless nodes, then CPUs -attached to memoryless nodes would always incur the fallback path overhead -or some subsystems would fail to initialize if they attempted to allocated -memory exclusively from a node without memory. To support such -architectures transparently, kernel subsystems can use the numa_mem_id() -or cpu_to_mem() function to locate the "local memory node" for the calling or -specified CPU. Again, this is the same node from which default, local page -allocations will be attempted. diff --git a/Documentation/vm/oom.rst b/Documentation/vm/oom.rst deleted file mode 100644 index 18e9e40c1ec1..000000000000 --- a/Documentation/vm/oom.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -Out Of Memory Handling -====================== diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst deleted file mode 100644 index 1addb0c374a4..000000000000 --- a/Documentation/vm/overcommit-accounting.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. _overcommit_accounting: - -===================== -Overcommit Accounting -===================== - -The Linux kernel supports the following overcommit handling modes - -0 - Heuristic overcommit handling. Obvious overcommits of address - space are refused. Used for a typical system. It ensures a - seriously wild allocation fails while allowing overcommit to - reduce swap usage. root is allowed to allocate slightly more - memory in this mode. This is the default. - -1 - Always overcommit. Appropriate for some scientific - applications. Classic example is code using sparse arrays and - just relying on the virtual memory consisting almost entirely - of zero pages. - -2 - Don't overcommit. The total address space commit for the - system is not permitted to exceed swap + a configurable amount - (default is 50%) of physical RAM. Depending on the amount you - use, in most situations this means a process will not be - killed while accessing pages but will receive errors on memory - allocation as appropriate. - - Useful for applications that want to guarantee their memory - allocations will be available in the future without having to - initialize every page. - -The overcommit policy is set via the sysctl ``vm.overcommit_memory``. - -The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) -or ``vm.overcommit_kbytes`` (absolute value). These only have an effect -when ``vm.overcommit_memory`` is set to 2. - -The current overcommit limit and amount committed are viewable in -``/proc/meminfo`` as CommitLimit and Committed_AS respectively. - -Gotchas -======= - -The C language stack growth does an implicit mremap. If you want absolute -guarantees and run close to the edge you MUST mmap your stack for the -largest size you think you will need. 
For typical stack usage this does -not matter much but it's a corner case if you really really care - -In mode 2 the MAP_NORESERVE flag is ignored. - - -How It Works -============ - -The overcommit is based on the following rules - -For a file backed map - | SHARED or READ-only - 0 cost (the file is the map not swap) - | PRIVATE WRITABLE - size of mapping per instance - -For an anonymous or ``/dev/zero`` map - | SHARED - size of mapping - | PRIVATE READ-only - 0 cost (but of little use) - | PRIVATE WRITABLE - size of mapping per instance - -Additional accounting - | Pages made writable copies by mmap - | shmfs memory drawn from the same pool - -Status -====== - -* We account mmap memory mappings -* We account mprotect changes in commit -* We account mremap changes in size -* We account brk -* We account munmap -* We report the commit status in /proc -* Account and check on fork -* Review stack handling/building on exec -* SHMfs accounting -* Implement actual limit enforcement - -To Do -===== -* Account ptrace pages (this is hard) diff --git a/Documentation/vm/page_allocation.rst b/Documentation/vm/page_allocation.rst deleted file mode 100644 index d9b4495561f1..000000000000 --- a/Documentation/vm/page_allocation.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Page Allocation -=============== diff --git a/Documentation/vm/page_cache.rst b/Documentation/vm/page_cache.rst deleted file mode 100644 index 75eba7c431b2..000000000000 --- a/Documentation/vm/page_cache.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========== -Page Cache -========== diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst deleted file mode 100644 index 7d6f9385d129..000000000000 --- a/Documentation/vm/page_frags.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. _page_frags: - -============== -Page fragments -============== - -A page fragment is an arbitrary-length arbitrary-offset area of memory -which resides within a 0 or higher order compound page. Multiple -fragments within that page are individually refcounted, in the page's -reference counter. - -The page_frag functions, page_frag_alloc and page_frag_free, provide a -simple allocation framework for page fragments. This is used by the -network stack and network device drivers to provide a backing region of -memory for use as either an sk_buff->head, or to be used in the "frags" -portion of skb_shared_info. - -In order to make use of the page fragment APIs a backing page fragment -cache is needed. This provides a central point for the fragment allocation -and tracks allows multiple calls to make use of a cached page. The -advantage to doing this is that multiple calls to get_page can be avoided -which can be expensive at allocation time. However due to the nature of -this caching it is required that any calls to the cache be protected by -either a per-cpu limitation, or a per-cpu limitation and forcing interrupts -to be disabled when executing the fragment allocation. - -The network stack uses two separate caches per CPU to handle fragment -allocation. The netdev_alloc_cache is used by callers making use of the -netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is -used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The -main difference between these two calls is the context in which they may be -called. 
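-
-A minimal sketch of the core page fragment API, leaving aside the per-CPU and
-interrupt protection requirements discussed above (illustrative only)::
-
-	struct page_frag_cache cache = {};
-	void *data;
-
-	/* carve a 256-byte fragment out of the cached backing page */
-	data = page_frag_alloc(&cache, 256, GFP_ATOMIC);
-	if (data) {
-		/* ... use it, e.g. as backing for an sk_buff ... */
-		page_frag_free(data);	/* drop this fragment's reference */
-	}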
The "netdev" prefixed functions are usable in any context as these -functions will disable interrupts, while the "napi" prefixed functions are -only usable within the softirq context. - -Many network device drivers use a similar methodology for allocating page -fragments, but the page fragments are cached at the ring or descriptor -level. In order to enable these cases it is necessary to provide a generic -way of tearing down a page cache. For this reason __page_frag_cache_drain -was implemented. It allows for freeing multiple references from a single -page via a single call. The advantage to doing this is that it allows for -cleaning up the multiple references that were added to a page in order to -avoid calling get_page per allocation. - -Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst deleted file mode 100644 index 8c5cb8147e55..000000000000 --- a/Documentation/vm/page_migration.rst +++ /dev/null @@ -1,288 +0,0 @@ -.. _page_migration: - -============== -Page migration -============== - -Page migration allows moving the physical location of pages between -nodes in a NUMA system while the process is running. This means that the -virtual addresses that the process sees do not change. However, the -system rearranges the physical location of those pages. - -Also see :ref:`Heterogeneous Memory Management (HMM) ` -for migrating pages to or from device private memory. - -The main intent of page migration is to reduce the latency of memory accesses -by moving pages near to the processor where the process accessing that memory -is running. - -Page migration allows a process to manually relocate the node on which its -pages are located through the MF_MOVE and MF_MOVE_ALL options while setting -a new memory policy via mbind(). The pages of a process can also be relocated -from another process using the sys_migrate_pages() function call. The -migrate_pages() function call takes two sets of nodes and moves pages of a -process that are located on the from nodes to the destination nodes. -Page migration functions are provided by the numactl package by Andi Kleen -(a version later than 0.9.3 is required. Get it from -https://github.com/numactl/numactl.git). numactl provides libnuma -which provides an interface similar to other NUMA functionality for page -migration. cat ``/proc//numa_maps`` allows an easy review of where the -pages of a process are located. See also the numa_maps documentation in the -proc(5) man page. - -Manual migration is useful if for example the scheduler has relocated -a process to a processor on a distant node. A batch scheduler or an -administrator may detect the situation and move the pages of the process -nearer to the new processor. The kernel itself only provides -manual page migration support. Automatic page migration may be implemented -through user space processes that move pages. A special function call -"move_pages" allows the moving of individual pages within a process. -For example, A NUMA profiler may obtain a log showing frequent off-node -accesses and may use the result to move pages to more advantageous -locations. - -Larger installations usually partition the system using cpusets into -sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See -:ref:`CPUSETS `). -Cpusets allow the automation of process locality. 
If a task is moved to -a new cpuset then also all its pages are moved with it so that the -performance of the process does not sink dramatically. Also the pages -of processes in a cpuset are moved if the allowed memory nodes of a -cpuset are changed. - -Page migration allows the preservation of the relative location of pages -within a group of nodes for all migration techniques which will preserve a -particular memory allocation pattern generated even after migrating a -process. This is necessary in order to preserve the memory latencies. -Processes will run with similar performance after migration. - -Page migration occurs in several steps. First a high level -description for those trying to use migrate_pages() from the kernel -(for userspace usage see the Andi Kleen's numactl package mentioned above) -and then a low level description of how the low level details work. - -In kernel use of migrate_pages() -================================ - -1. Remove pages from the LRU. - - Lists of pages to be migrated are generated by scanning over - pages and moving them into lists. This is done by - calling isolate_lru_page(). - Calling isolate_lru_page() increases the references to the page - so that it cannot vanish while the page migration occurs. - It also prevents the swapper or other scans from encountering - the page. - -2. We need to have a function of type new_page_t that can be - passed to migrate_pages(). This function should figure out - how to allocate the correct new page given the old page. - -3. The migrate_pages() function is called which attempts - to do the migration. It will call the function to allocate - the new page for each page that is considered for - moving. - -How migrate_pages() works -========================= - -migrate_pages() does several passes over its list of pages. A page is moved -if all references to a page are removable at the time. The page has -already been removed from the LRU via isolate_lru_page() and the refcount -is increased so that the page cannot be freed while page migration occurs. - -Steps: - -1. Lock the page to be migrated. - -2. Ensure that writeback is complete. - -3. Lock the new page that we want to move to. It is locked so that accesses to - this (not yet up-to-date) page immediately block while the move is in progress. - -4. All the page table references to the page are converted to migration - entries. This decreases the mapcount of a page. If the resulting - mapcount is not zero then we do not migrate the page. All user space - processes that attempt to access the page will now wait on the page lock - or wait for the migration page table entry to be removed. - -5. The i_pages lock is taken. This will cause all processes trying - to access the page via the mapping to block on the spinlock. - -6. The refcount of the page is examined and we back out if references remain. - Otherwise, we know that we are the only one referencing this page. - -7. The radix tree is checked and if it does not contain the pointer to this - page then we back out because someone else modified the radix tree. - -8. The new page is prepped with some settings from the old page so that - accesses to the new page will discover a page with the correct settings. - -9. The radix tree is changed to point to the new page. - -10. The reference count of the old page is dropped because the address space - reference is gone. A reference to the new page is established because - the new page is referenced by the address space. - -11. The i_pages lock is dropped. 
With that lookups in the mapping - become possible again. Processes will move from spinning on the lock - to sleeping on the locked new page. - -12. The page contents are copied to the new page. - -13. The remaining page flags are copied to the new page. - -14. The old page flags are cleared to indicate that the page does - not provide any information anymore. - -15. Queued up writeback on the new page is triggered. - -16. If migration entries were inserted into the page table, then replace them - with real ptes. Doing so will enable access for user space processes not - already waiting for the page lock. - -17. The page locks are dropped from the old and new page. - Processes waiting on the page lock will redo their page faults - and will reach the new page. - -18. The new page is moved to the LRU and can be scanned by the swapper, - etc. again. - -Non-LRU page migration -====================== - -Although migration originally aimed for reducing the latency of memory accesses -for NUMA, compaction also uses migration to create high-order pages. - -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. However, there are potential non-LRU pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add their own specific hooks in the migration path. - -To overcome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -in the migration path. - -If a driver wants to make its pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects from isolate_page() function of driver is to return *true* - if driver isolates the page successfully. On returning true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in those fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage() of driver with the isolated page. - The function of migratepage() is to move the contents of the old page to the - new page - and set up fields of struct page newpage. Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returned - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporary migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without - retrying. - - Driver shouldn't touch the page.lru field while in the migratepage() function. - -3. 
``void (*putback_page)(struct page *);``
-
-   If migration fails on the isolated page, VM should return the isolated page
-   to the driver, so VM calls the driver's putback_page() with the isolated page.
-   In this function, the driver should put the isolated page back into its own
-   data structure.
-
-Non-LRU movable page flags
-
-   There are two page flags for supporting non-LRU movable pages.
-
-   * PG_movable
-
-     Driver should use the function below to make page movable under page_lock::
-
-	void __SetPageMovable(struct page *page, struct address_space *mapping)
-
-     It needs the address_space argument for registering migration
-     family functions which will be called by VM. Strictly speaking,
-     PG_movable is not a real flag of struct page. Rather, VM
-     reuses the page->mapping's lower bits to represent it::
-
-	#define PAGE_MAPPING_MOVABLE 0x2
-	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
-
-     so driver shouldn't access page->mapping directly. Instead, driver should
-     use page_mapping() which masks off the low two bits of page->mapping under
-     page lock so it can get the right struct address_space.
-
-     For testing of non-LRU movable pages, VM supports the __PageMovable()
-     function. However, it doesn't guarantee to identify non-LRU movable pages
-     because the page->mapping field is unified with other variables in struct
-     page. If the driver releases the page after isolation by VM, page->mapping
-     doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set
-     (look at __ClearPageMovable). But __PageMovable() is cheap to call whether
-     page is LRU or non-LRU movable once the page has been isolated because LRU
-     pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also
-     good for just peeking to test non-LRU movable pages before more expensive
-     checking with lock_page() in pfn scanning to select a victim.
-
-     To guarantee that a page is non-LRU movable, VM provides the PageMovable()
-     function. Unlike __PageMovable(), PageMovable() validates page->mapping and
-     mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents
-     sudden destruction of page->mapping.
-
-     Drivers using __SetPageMovable() should clear the flag via
-     __ClearPageMovable() under page_lock() before releasing the page.
-
-   * PG_isolated
-
-     To prevent concurrent isolation among several CPUs, VM marks the isolated
-     page as PG_isolated under lock_page(). So if a CPU encounters a PG_isolated
-     non-LRU movable page, it can skip it. The driver doesn't need to manipulate
-     the flag because VM will set/clear it automatically. Keep in mind that if
-     the driver sees a PG_isolated page, it means the page has been isolated by
-     the VM so it shouldn't touch the page.lru field.
-     The PG_isolated flag is aliased with the PG_reclaim flag so drivers
-     shouldn't use PG_isolated for their own purposes.
-
-Monitoring Migration
-=====================
-
-The following events (counters) can be used to monitor page migration.
-
-1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
-   page was migrated. If the page was a non-THP and non-hugetlb page, then
-   this counter is increased by one. If the page was a THP or hugetlb, then
-   this counter is increased by the number of THP or hugetlb subpages.
-   For example, migration of a single 2MB THP that has 4KB-size base pages
-   (subpages) will cause this counter to increase by 512.
-
-2. PGMIGRATE_FAIL: Normal page migration failure.
Same counting rules as for
-   PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
-   if it was a THP or hugetlb.
-
-3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
-
-4. THP_MIGRATION_FAIL: A THP could not be migrated nor could it be split.
-
-5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had
-   to be split. After splitting, a migration retry was used for its sub-pages.
-
-THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or
-PGMIGRATE_FAIL events. For example, a THP migration failure will cause both
-THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase.
-
-Christoph Lameter, May 8, 2006.
-Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
deleted file mode 100644
index f5c954afe97c..000000000000
--- a/Documentation/vm/page_owner.rst
+++ /dev/null
@@ -1,196 +0,0 @@
-.. _page_owner:
-
-==================================================
-page owner: Tracking about who allocated each page
-==================================================
-
-Introduction
-============
-
-page owner is for tracking who allocated each page.
-It can be used to debug memory leaks or to find a memory hog.
-When an allocation happens, information about the allocation, such as the
-call stack and the order of the pages, is stored into certain storage for
-each page. When we need to know the status of all pages, we can get and
-analyze this information.
-
-Although we already have tracepoints for tracing page allocation/free,
-using them for analyzing who allocated each page is rather complex. We need
-to enlarge the trace buffer to prevent records from being overwritten before
-the userspace program is launched. And the launched program would continually
-dump out the trace buffer for later analysis, which would change the system's
-behaviour more than just keeping the information in memory, so it is bad for
-debugging.
-
-page owner can also be used for various purposes. For example, accurate
-fragmentation statistics can be obtained through the gfp flag information of
-each page. This is already implemented and activated if page owner is
-enabled. Other usages are more than welcome.
-
-page owner is disabled by default. So, if you'd like to use it, you need
-to add "page_owner=on" to your boot cmdline. If the kernel is built
-with page owner but page owner is disabled at runtime because the boot
-option is not enabled, the runtime overhead is marginal. If disabled at
-runtime, it doesn't require memory to store owner information, so there is
-no runtime memory overhead. And, page owner inserts just two unlikely
-branches into the page allocator hotpath; if not enabled, allocation is
-done as in a kernel without page owner. These two unlikely branches should
-not affect allocation performance, especially if the static keys jump
-label patching functionality is available. Following is the kernel's code
-size change due to this facility.
-
-- Without page owner::
-
-   text    data     bss     dec     hex filename
-   48392   2333     644   51369    c8a9 mm/page_alloc.o
-
-- With page owner::
-
-   text    data     bss     dec     hex filename
-   48800   2445     644   51889    cab1 mm/page_alloc.o
-   6662     108      29    6799    1a8f mm/page_owner.o
-   1025       8       8    1041     411 mm/page_ext.o
-
-Although roughly 8 KB of code is added in total, page_alloc.o increases by
-520 bytes and less than half of that is in the hotpath. Building the kernel
-with page owner and turning it on when needed is a great option for debugging
-kernel memory problems.
-
-There is one caveat caused by an implementation detail.
page owner
-stores its information in memory from the struct page extension. In a sparse
-memory system this memory is initialized some time after the page allocator
-starts, so, until initialization, many pages can be allocated and
-they would have no owner information. To fix this up, these early allocated
-pages are investigated and marked as allocated in the initialization phase.
-Although it doesn't mean that they have the right owner information,
-at least, we can tell whether the page is allocated or not,
-more accurately. On a 2GB memory x86-64 VM box, 13343 early allocated pages
-are caught and marked, although they are mostly allocated for the struct
-page extension feature. Anyway, after that, no page is left in an
-untracked state.
-
-Usage
-=====
-
-1) Build the user-space helper::
-
-	cd tools/vm
-	make page_owner_sort
-
-2) Enable page owner: add "page_owner=on" to the boot cmdline.
-
-3) Do the job that you want to debug.
-
-4) Analyze information from page owner::
-
-	cat /sys/kernel/debug/page_owner > page_owner_full.txt
-	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
-
-   The general output of ``page_owner_full.txt`` is as follows::
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
-   in buf, uses regexp to extract the page order value, counts the times
-   and pages of each buf, and finally sorts the bufs according to the
-   parameter(s).
-
-   See the result about who allocated each page
-   in ``sorted_page_owner.txt``. General output::
-
-	XXX times, XXX pages:
-	Page allocated via order XXX, ...
-	// Detailed stack
-
-   By default, ``page_owner_sort`` sorts according to the times of each buf.
-   If you want to sort by the page counts of each buf, use the ``-m``
-   parameter. The detailed parameters are:
-
-   fundamental function::
-
-	Sort:
-		-a		Sort by memory allocation time.
-		-m		Sort by total memory.
-		-p		Sort by pid.
-		-P		Sort by tgid.
-		-n		Sort by task command name.
-		-r		Sort by memory release time.
-		-s		Sort by stack trace.
-		-t		Sort by times (default).
-		--sort <order>	Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]].
-				Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is
-				optional since default direction is increasing numerical or lexicographic
-				order. Mixed use of abbreviated and complete-form of keys is allowed.
-
-		Examples:
-				./page_owner_sort <input> <output> --sort=n,+pid,-tgid
-				./page_owner_sort <input> <output> --sort=at
-
-   additional function::
-
-	Cull:
-		--cull <rules>
-				Specify culling rules. Culling syntax is key[,key[,...]]. Choose a
-				multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
-
-		<rules> is a single argument in the form of a comma-separated list,
-		which offers a way to specify individual culling rules. The recognized
-		keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
-		<rules> can be specified by the sequence of keys k1,k2, ..., as described in
-		the STANDARD SORT KEYS section below. Mixed use of abbreviated and
-		complete-form of keys is allowed.
-
-		Examples:
-				./page_owner_sort <input> <output> --cull=stacktrace
-				./page_owner_sort <input> <output> --cull=st,pid,name
-				./page_owner_sort <input> <output> --cull=n,f
-
-	Filter:
-		-f		Filter out the information of blocks whose memory has been released.
-
-	Select:
-		--pid <pidlist>		Select by pid. This selects the blocks whose process ID
-					numbers appear in <pidlist>.
-		--tgid <tgidlist>	Select by tgid. This selects the blocks whose thread
-					group ID numbers appear in <tgidlist>.
-		--name <cmdlist>	Select by task command name.
This selects the blocks whose
-					task command name appears in <cmdlist>.
-
-		<pidlist>, <tgidlist> and <cmdlist> are single arguments in the form of a
-		comma-separated list, which offers a way to specify individual selecting rules.
-
-		Examples:
-				./page_owner_sort <input> <output> --pid=1
-				./page_owner_sort <input> <output> --tgid=1,2,3
-				./page_owner_sort <input> <output> --name name1,name2
-
-STANDARD FORMAT SPECIFIERS
-==========================
-::
-
-	For --sort option:
-
-	KEY		LONG		DESCRIPTION
-	p		pid		process ID
-	tg		tgid		thread group ID
-	n		name		task command name
-	st		stacktrace	stack trace of the page allocation
-	T		txt		full text of block
-	ft		free_ts		timestamp of the page when it was released
-	at		alloc_ts	timestamp of the page when it was allocated
-	ator		allocator	memory allocator for pages
-
-	For --cull option:
-
-	KEY		LONG		DESCRIPTION
-	p		pid		process ID
-	tg		tgid		thread group ID
-	n		name		task command name
-	f		free		whether the page has been released or not
-	st		stacktrace	stack trace of the page allocation
-	ator		allocator	memory allocator for pages
diff --git a/Documentation/vm/page_reclaim.rst b/Documentation/vm/page_reclaim.rst
deleted file mode 100644
index 50a30b7f8ac3..000000000000
--- a/Documentation/vm/page_reclaim.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-============
-Page Reclaim
-============
diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst
deleted file mode 100644
index 1a09472f10a3..000000000000
--- a/Documentation/vm/page_table_check.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _page_table_check:
-
-================
-Page Table Check
-================
-
-Introduction
-============
-
-Page table check allows hardening the kernel by ensuring that some types of
-memory corruption are prevented.
-
-Page table check performs extra verifications at the time when new pages become
-accessible from userspace by getting their page table entries (PTEs, PMDs,
-etc.) added into the table.
-
-In case of detected corruption, the kernel is crashed. There is a small
-performance and memory overhead associated with the page table check. Therefore,
-it is disabled by default, but can be optionally enabled on systems where the
-extra hardening outweighs the performance costs. Also, because page table check
-is synchronous, it can help with debugging double map memory corruption issues,
-by crashing the kernel at the time the wrong mapping occurs instead of later,
-which is often the case with memory corruption bugs.
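-
-As an illustration (a hypothetical scenario, not a literal kernel log): if an
-anonymous page that is already mapped into an address space is mapped a second
-time with write permissions, the second page table update itself triggers the
-crash, per the "Anonymous | Anonymous | Read / Write | Prohibit" rule in the
-table below, rather than whatever later access would have tripped over the
-corruption.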
- -Double mapping detection logic -============================== - -+-------------------+-------------------+-------------------+------------------+ -| Current Mapping | New mapping | Permissions | Rule | -+===================+===================+===================+==================+ -| Anonymous | Anonymous | Read | Allow | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Anonymous | Read / Write | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Named | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Anonymous | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Named | Any | Allow | -+-------------------+-------------------+-------------------+------------------+ - -Enabling Page Table Check -========================= - -Build kernel with: - -- PAGE_TABLE_CHECK=y - Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK - is available. - -- Boot with 'page_table_check=on' kernel parameter. - -Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page -table support without extra kernel parameter. diff --git a/Documentation/vm/page_tables.rst b/Documentation/vm/page_tables.rst deleted file mode 100644 index 96939571d7bc..000000000000 --- a/Documentation/vm/page_tables.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=========== -Page Tables -=========== diff --git a/Documentation/vm/physical_memory.rst b/Documentation/vm/physical_memory.rst deleted file mode 100644 index 2ab7b8c1c863..000000000000 --- a/Documentation/vm/physical_memory.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Physical Memory -=============== diff --git a/Documentation/vm/process_addrs.rst b/Documentation/vm/process_addrs.rst deleted file mode 100644 index e8618fbc62c9..000000000000 --- a/Documentation/vm/process_addrs.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================= -Process Addresses -================= diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/vm/remap_file_pages.rst deleted file mode 100644 index 7bef6718e3a9..000000000000 --- a/Documentation/vm/remap_file_pages.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _remap_file_pages: - -============================== -remap_file_pages() system call -============================== - -The remap_file_pages() system call is used to create a nonlinear mapping, -that is, a mapping in which the pages of the file are mapped into a -nonsequential order in memory. The advantage of using remap_file_pages() -over using repeated calls to mmap(2) is that the former approach does not -require the kernel to create additional VMA (Virtual Memory Area) data -structures. - -Supporting of nonlinear mapping requires significant amount of non-trivial -code in kernel virtual memory subsystem including hot paths. Also to get -nonlinear mapping work kernel need a way to distinguish normal page table -entries from entries with file offset (pte_file). Kernel reserves flag in -PTE for this purpose. PTE flags are scarce resource especially on some CPU -architectures. It would be nice to free up the flag for other usage. - -Fortunately, there are not many users of remap_file_pages() in the wild. 
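-
-For reference, a sketch of what a (now emulated) nonlinear mapping looked
-like from userspace -- here placing page 1 of a file at offset 0 of an
-existing mapping; ``fd`` and ``page_size`` are assumed to be set up elsewhere
-and error handling is omitted::
-
-	#define _GNU_SOURCE
-	#include <sys/mman.h>
-
-	char *addr = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
-			  MAP_SHARED, fd, 0);
-
-	/* remap: prot and flags must be 0; pgoff is counted in pages */
-	remap_file_pages(addr, page_size, 0, 1, 0);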
-It's only known that one enterprise RDBMS implementation uses the syscall
-on 32-bit systems to map files bigger than can linearly fit into 32-bit
-virtual address space. This use-case is not critical anymore since 64-bit
-systems are widely available.
-
-The syscall is deprecated now and has been replaced with an emulation. The
-emulation creates new VMAs instead of nonlinear mappings. It will work
-slower for the rare users of remap_file_pages(), but the ABI is preserved.
-
-One side effect of the emulation (apart from performance) is that a user can
-hit the vm.max_map_count limit more easily due to additional VMAs. See the
-comment for DEFAULT_MAX_MAP_COUNT for more details on the limit.
diff --git a/Documentation/vm/shmfs.rst b/Documentation/vm/shmfs.rst
deleted file mode 100644
index 8b01ebb4c30e..000000000000
--- a/Documentation/vm/shmfs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-========================
-Shared Memory Filesystem
-========================
diff --git a/Documentation/vm/slab.rst b/Documentation/vm/slab.rst
deleted file mode 100644
index 87d5a5bb172f..000000000000
--- a/Documentation/vm/slab.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Slab Allocation
-===============
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
deleted file mode 100644
index 43063ade737a..000000000000
--- a/Documentation/vm/slub.rst
+++ /dev/null
@@ -1,452 +0,0 @@
-.. _slub:
-
-==========================
-Short users guide for SLUB
-==========================
-
-The basic philosophy of SLUB is very different from SLAB. SLAB
-requires rebuilding the kernel to activate debug options for all
-slab caches. SLUB always includes full debugging but it is off by default.
-SLUB can enable debugging only for selected slabs in order to avoid
-an impact on overall system performance which may make a bug more
-difficult to find.
-
-In order to switch debugging on one can add an option ``slub_debug``
-to the kernel command line. That will enable full debugging for
-all slabs.
-
-Typically one would then use the ``slabinfo`` command to get statistical
-data and perform operations on the slabs. By default ``slabinfo`` only lists
-slabs that have data in them. See "slabinfo -h" for more options when
-running the command. ``slabinfo`` can be compiled with
-::
-
-	gcc -o slabinfo tools/vm/slabinfo.c
-
-Some of the modes of operation of ``slabinfo`` require that slub debugging
-be enabled on the command line. F.e. no tracking information will be
-available without debugging on and validation can only partially
-be performed if debugging was not switched on.
-
-Some more sophisticated uses of slub_debug:
--------------------------------------------
-
-Parameters may be given to ``slub_debug``. If none is specified then full
-debugging is enabled. Format:
-
-slub_debug=<Debug-Options>
-	Enable options for all slabs
-
-slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
-	Enable options only for select slabs (no spaces
-	after a comma)
-
-Multiple blocks of options for all slabs or selected slabs can be given, with
-blocks of options delimited by ';'. The last of "all slabs" blocks is applied
-to all slabs except those that match one of the "select slabs" blocks. Options
-of the first "select slabs" block that matches the slab's name are applied.
-
-Possible debug options are::
-
-	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
-			Sorry SLAB legacy issues)
-	Z		Red zoning
-	P		Poisoning (object and padding)
-	U		User tracking (free and alloc)
-	T		Trace (please only use on single slabs)
-	A		Enable failslab filter mark for the cache
-	O		Switch debugging off for caches that would have
-			caused higher minimum slab orders
-	-		Switch all debugging off (useful if the kernel is
-			configured with CONFIG_SLUB_DEBUG_ON)
-
-F.e. in order to boot just with sanity checks and red zoning one would specify::
-
-	slub_debug=FZ
-
-Trying to find an issue in the dentry cache? Try::
-
-	slub_debug=,dentry
-
-to only enable debugging on the dentry cache. You may use an asterisk at the
-end of the slab name, in order to cover all slabs with the same prefix. For
-example, here's how you can poison the dentry cache as well as all kmalloc
-slabs::
-
-	slub_debug=P,kmalloc-*,dentry
-
-Red zoning and tracking may realign the slab. We can just apply sanity checks
-to the dentry cache with::
-
-	slub_debug=F,dentry
-
-Debugging options may require the minimum possible slab order to increase as
-a result of storing the metadata (for example, caches with PAGE_SIZE object
-sizes). This has a higher likelihood of resulting in slab allocation errors
-in low memory situations or if there's high fragmentation of memory. To
-switch off debugging for such caches by default, use::
-
-	slub_debug=O
-
-You can apply different options to different lists of slab names, using blocks
-of options. This will enable red zoning for dentry and user tracking for
-kmalloc. All other slabs will not get any debugging enabled::
-
-	slub_debug=Z,dentry;U,kmalloc-*
-
-You can also enable options (e.g. sanity checks and poisoning) for all caches
-except some that are deemed too performance critical and don't need to be
-debugged by specifying global debug options followed by a list of slab names
-with "-" as options::
-
-	slub_debug=FZ;-,zs_handle,zspage
-
-The state of each debug option for a slab can be found in the respective files
-under::
-
-	/sys/kernel/slab/<slab name>/
-
-If the file contains 1, the option is enabled, 0 means disabled. The debug
-options from the ``slub_debug`` parameter translate to the following files::
-
-	F	sanity_checks
-	Z	red_zone
-	P	poison
-	U	store_user
-	T	trace
-	A	failslab
-
-Careful with tracing: It may spew out lots of information and never stop if
-used on the wrong slab.
-
-Slab merging
-============
-
-If no debug options are specified then SLUB may merge similar slabs together
-in order to reduce overhead and increase cache hotness of objects.
-``slabinfo -a`` displays which slabs were merged together.
-
-Slab validation
-===============
-
-SLUB can validate all objects if the kernel was booted with slub_debug. In
-order to do so you must have the ``slabinfo`` tool. Then you can do
-::
-
-	slabinfo -v
-
-which will test all objects. Output will be generated to the syslog.
-
-This also works in a more limited way if the kernel was booted without slab
-debug. In that case ``slabinfo -v`` simply tests all reachable objects. Usually
-these are in the cpu slabs and the partial slabs. Full slabs are not
-tracked by SLUB in a non debug situation.
-
-Getting more performance
-========================
-
-To some degree SLUB's performance is limited by the need to take the
-list_lock once in a while to deal with partial slabs. That overhead is
-governed by the order of the allocation for each slab. The allocations
-can be influenced by kernel parameters:
-
-.. slub_min_objects=x		(default 4)
-.. slub_min_order=x		(default 0)
-.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
-
-``slub_min_objects``
-	allows specifying how many objects must at least fit into one
-	slab in order for the allocation order to be acceptable. In
-	general slub will be able to perform this number of
-	allocations on a slab without consulting centralized resources
-	(list_lock) where contention may occur.
-
-``slub_min_order``
-	specifies a minimum order of slabs. A similar effect to
-	``slub_min_objects``.
-
-``slub_max_order``
-	specifies the order at which ``slub_min_objects`` should no
-	longer be checked. This is useful to avoid SLUB trying to
-	generate super large order pages to fit ``slub_min_objects``
-	of a slab cache with large object sizes into one high order
-	page. Setting the command line parameter
-	``debug_guardpage_minorder=N`` (N > 0) forces setting
-	``slub_max_order`` to 0, which causes slabs to be allocated
-	with the minimum possible order.
-
-SLUB Debug output
-=================
-
-Here is a sample of slub debug output::
-
- ====================================================================
- BUG kmalloc-8: Right Redzone overwritten
- --------------------------------------------------------------------
-
- INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
- INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
- INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
- INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
-
- Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
- Object (0xc90f6d20): 31 30 31 39 2e 30 30 35                          1019.005
- Redzone (0xc90f6d28): 00 cc cc cc                                      .
- Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a                          ZZZZZZZZ
-
-   [] dump_trace+0x63/0x1eb
-   [] show_trace_log_lvl+0x1a/0x2f
-   [] show_trace+0x12/0x14
-   [] dump_stack+0x16/0x18
-   [] object_err+0x143/0x14b
-   [] check_object+0x66/0x234
-   [] __slab_free+0x239/0x384
-   [] kfree+0xa6/0xc6
-   [] get_modalias+0xb9/0xf5
-   [] dmi_dev_uevent+0x27/0x3c
-   [] dev_uevent+0x1ad/0x1da
-   [] kobject_uevent_env+0x20a/0x45b
-   [] kobject_uevent+0xa/0xf
-   [] store_uevent+0x4f/0x58
-   [] dev_attr_store+0x29/0x2f
-   [] sysfs_write_file+0x16e/0x19c
-   [] vfs_write+0xd1/0x15a
-   [] sys_write+0x3d/0x72
-   [] sysenter_past_esp+0x5f/0x99
-   [] 0xb7f7b410
- =======================
-
- FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
-
-If SLUB encounters a corrupted object (full detection requires the kernel
-to be booted with slub_debug) then the following output will be dumped
-into the syslog:
-
-1. Description of the problem encountered
-
-   This will be a message in the system log starting with::
-
-     ===============================================
-     BUG <slab cache affected>: <What went wrong>
-     -----------------------------------------------
-
-     INFO: <corruption start>-<corruption_end> <more info>
-     INFO: Slab <address> <slab information>
-     INFO: Object <address> <object information>
-     INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by cpu> pid=<pid of the process>
-     INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu> pid=<pid of the process>
-
-   (Object allocation / free information is only available if SLAB_STORE_USER is
-   set for the slab. slub_debug sets that option)
-
-2. The object contents if an object was involved.
-
-   Various types of lines can follow the BUG SLUB line:
-
-   Bytes b4 <address> : <bytes>
-     Shows a few bytes before the object where the problem was detected.
-     Can be useful if the corruption does not stop with the start of the
-     object.
-
-   Object <address> : <bytes>
-     The bytes of the object. If the object is inactive then the bytes
-     typically contain poison values. Any non-poison value shows a
-     corruption by a write after free.
-
-   Redzone <address> : <bytes>
-     The Redzone following the object. The Redzone is used to detect
-     writes after the object. All bytes should always have the same
-     value. If there is any deviation then it is due to a write after
-     the object boundary.
-
-     (Redzone information is only available if SLAB_RED_ZONE is set.
-     slub_debug sets that option)
-
-   Padding <address>
: - Unused data to fill up the space in order to get the next object - properly aligned. In the debug case we make sure that there are - at least 4 bytes of padding. This allows the detection of writes - before the object. - -3. A stackdump - - The stackdump describes the location where the error was detected. The cause - of the corruption is may be more likely found by looking at the function that - allocated or freed the object. - -4. Report on how the problem was dealt with in order to ensure the continued - operation of the system. - - These are messages in the system log beginning with:: - - FIX : - - In the above sample SLUB found that the Redzone of an active object has - been overwritten. Here a string of 8 characters was written into a slab that - has the length of 8 characters. However, a 8 character string needs a - terminating 0. That zero has overwritten the first byte of the Redzone field. - After reporting the details of the issue encountered the FIX SLUB message - tells us that SLUB has restored the Redzone to its proper value and then - system operations continue. - -Emergency operations -==================== - -Minimal debugging (sanity checks alone) can be enabled by booting with:: - - slub_debug=F - -This will be generally be enough to enable the resiliency features of slub -which will keep the system running even if a bad kernel component will -keep corrupting objects. This may be important for production systems. -Performance will be impacted by the sanity checks and there will be a -continual stream of error messages to the syslog but no additional memory -will be used (unlike full debugging). - -No guarantees. The kernel component still needs to be fixed. Performance -may be optimized further by locating the slab that experiences corruption -and enabling debugging only for that cache - -I.e.:: - - slub_debug=F,dentry - -If the corruption occurs by writing after the end of the object then it -may be advisable to enable a Redzone to avoid corrupting the beginning -of other objects:: - - slub_debug=FZ,dentry - -Extended slabinfo mode and plotting -=================================== - -The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes: - - Slabcache Totals - - Slabs sorted by size (up to -N slabs, default 1) - - Slabs sorted by loss (up to -N slabs, default 1) - -Additionally, in this mode ``slabinfo`` does not dynamically scale -sizes (G/M/K) and reports everything in bytes (this functionality is -also available to other slabinfo modes via '-B' option) which makes -reporting more precise and accurate. Moreover, in some sense the `-X' -mode also simplifies the analysis of slabs' behaviour, because its -output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it -pushes the analysis from looking through the numbers (tons of numbers) -to something easier -- visual analysis. - -To generate plots: - -a) collect slabinfo extended records, for example:: - - while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done - -b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script:: - - slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. 
FOO_STATSN] - - The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records - and generates 3 png files (and 3 pre-processing cache files) per STATS - file: - - Slabcache Totals: FOO_STATS-totals.png - - Slabs sorted by size: FOO_STATS-slabs-by-size.png - - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png - -Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you -need to compare slabs' behaviour "prior to" and "after" some code -modification. To help you out there, ``slabinfo-gnuplot.sh`` script -can 'merge' the `Slabcache Totals` sections from different -measurements. To visually compare N plots: - -a) Collect as many STATS1, STATS2, .. STATSN files as you need:: - - while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done - -b) Pre-process those STATS files:: - - slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN - -c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the - generated pre-processed \*-totals:: - - slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals - - This will produce a single plot (png file). - - Plots, expectedly, can be large so some fluctuations or small spikes - can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two - options to 'zoom-in'/'zoom-out': - - a) ``-s %d,%d`` -- overwrites the default image width and height - b) ``-r %d,%d`` -- specifies a range of samples to use (for example, - in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r - 40,60`` range will plot only samples collected between 40th and - 60th seconds). - - -DebugFS files for SLUB -====================== - -For more information about current state of SLUB caches with the user tracking -debug option enabled, debugfs files are available, typically under -/sys/kernel/debug/slab// (created only for caches with enabled user -tracking). There are 2 types of these files with the following debug -information: - -1. alloc_traces:: - - Prints information about unique allocation traces of the currently - allocated objects. The output is sorted by frequency of each trace. - - Information in the output: - Number of objects, allocating function, minimal/average/maximal jiffies since alloc, - pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. - - Example::: - - 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: - __slab_alloc+0x6d/0x90 - kmem_cache_alloc_trace+0x2eb/0x300 - populate_error_injection_list+0x97/0x110 - init_error_injection+0x1b/0x71 - do_one_initcall+0x5f/0x2d0 - kernel_init_freeable+0x26f/0x2d7 - kernel_init+0xe/0x118 - ret_from_fork+0x22/0x30 - - -2. free_traces:: - - Prints information about unique freeing traces of the currently allocated - objects. The freeing traces thus come from the previous life-cycle of the - objects and are reported as not available for objects allocated for the first - time. The output is sorted by frequency of each trace. - - Information in the output: - Number of objects, freeing function, minimal/average/maximal jiffies since free, - pid range of the freeing processes, cpu mask of freeing cpus, and stack trace. 
- - Example::: - - 1980 age=4294912290 pid=0 cpus=0 - 51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1 - kfree+0x2db/0x420 - acpi_ut_update_ref_count+0x6a6/0x782 - acpi_ut_update_object_reference+0x1ad/0x234 - acpi_ut_remove_reference+0x7d/0x84 - acpi_rs_get_prt_method_data+0x97/0xd6 - acpi_get_irq_routing_table+0x82/0xc4 - acpi_pci_irq_find_prt_entry+0x8e/0x2e0 - acpi_pci_irq_lookup+0x3a/0x1e0 - acpi_pci_irq_enable+0x77/0x240 - pcibios_enable_device+0x39/0x40 - do_pci_enable_device.part.0+0x5d/0xe0 - pci_enable_device_flags+0xfc/0x120 - pci_enable_device+0x13/0x20 - virtio_pci_probe+0x9e/0x170 - local_pci_probe+0x48/0x80 - pci_device_probe+0x105/0x1c0 - -Christoph Lameter, May 30, 2007 -Sergey Senozhatsky, October 23, 2015 diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst deleted file mode 100644 index c08919662704..000000000000 --- a/Documentation/vm/split_page_table_lock.rst +++ /dev/null @@ -1,100 +0,0 @@ -.. _split_page_table_lock: - -===================== -Split page table lock -===================== - -Originally, mm->page_table_lock spinlock protected all page tables of the -mm_struct. But this approach leads to poor page fault scalability of -multi-threaded applications due high contention on the lock. To improve -scalability, split page table lock was introduced. - -With split page table lock we have separate per-table lock to serialize -access to the table. At the moment we use split lock for PTE and PMD -tables. Access to higher level tables protected by mm->page_table_lock. - -There are helpers to lock/unlock a table and other accessor functions: - - - pte_offset_map_lock() - maps pte and takes PTE table lock, returns pointer to the taken - lock; - - pte_unmap_unlock() - unlocks and unmaps PTE table; - - pte_alloc_map_lock() - allocates PTE table if needed and take the lock, returns pointer - to taken lock or NULL if allocation failed; - - pte_lockptr() - returns pointer to PTE table lock; - - pmd_lock() - takes PMD table lock, returns pointer to taken lock; - - pmd_lockptr() - returns pointer to PMD table lock; - -Split page table lock for PTE tables is enabled compile-time if -CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS. -If split lock is disabled, all tables are guarded by mm->page_table_lock. - -Split page table lock for PMD tables is enabled, if it's enabled for PTE -tables and the architecture supports it (see below). - -Hugetlb and split page table lock -================================= - -Hugetlb can support several page sizes. We use split lock only for PMD -level, but not for PUD. - -Hugetlb-specific helpers: - - - huge_pte_lock() - takes pmd split lock for PMD_SIZE page, mm->page_table_lock - otherwise; - - huge_pte_lockptr() - returns pointer to table lock; - -Support of split page table lock by an architecture -=================================================== - -There's no need in special enabling of PTE split page table lock: everything -required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which -must be called on PTE table allocation / freeing. - -Make sure the architecture doesn't use slab allocator for page table -allocation: slab uses page->slab_cache for its pages. -This field shares storage with page->ptl. - -PMD split lock only makes sense if you have more than two page table -levels. - -PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table -allocation and pgtable_pmd_page_dtor() on freeing. 
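-
-A sketch of the expected pattern, modeled loosely on typical pmd_alloc_one()
-implementations (details vary by architecture; this is illustrative, not a
-copy of any particular port)::
-
-	struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
-
-	if (!page)
-		return NULL;
-	if (!pgtable_pmd_page_ctor(page)) {	/* the ctor can fail */
-		__free_pages(page, 0);
-		return NULL;
-	}
-	return (pmd_t *)page_address(page);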
-
-Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
-pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
-paths: e.g., X86_PAE preallocates a few PMDs on pgd_alloc().
-
-With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
-
-NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- the
-failure must be handled properly.
-
-page->ptl
-=========
-
-page->ptl is used to access the split page table lock, where 'page' is the
-struct page of the page containing the table. It shares storage with
-page->private (and a few other fields in the union).
-
-To avoid increasing the size of struct page and to get the best performance,
-we use a trick:
-
- - if spinlock_t fits into long, we use page->ptl as the spinlock itself, so
-   we can avoid indirect access and save a cache line.
- - if the size of spinlock_t is bigger than the size of long, we use page->ptl
-   as a pointer to spinlock_t and allocate it dynamically. This allows using
-   the split lock with DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC enabled, but costs
-   one more cache line for the indirect access;
-
-The spinlock_t is allocated in pgtable_pte_page_ctor() for PTE tables and in
-pgtable_pmd_page_ctor() for PMD tables.
-
-Please, never access page->ptl directly -- use the appropriate helper.
diff --git a/Documentation/vm/swap.rst b/Documentation/vm/swap.rst
deleted file mode 100644
index 78819bd4d745..000000000000
--- a/Documentation/vm/swap.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-====
-Swap
-====
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
deleted file mode 100644
index 216db1d67d04..000000000000
--- a/Documentation/vm/transhuge.rst
+++ /dev/null
@@ -1,187 +0,0 @@
-.. _transhuge:
-
-============================
-Transparent Hugepage Support
-============================
-
-This document describes design principles for Transparent Hugepage (THP)
-support and its interaction with other parts of the memory management
-system.
-
-Design principles
-=================
-
-- "graceful fallback": mm components which don't have transparent hugepage
-  knowledge fall back to breaking huge pmd mapping into table of ptes and,
-  if necessary, split a transparent hugepage. Therefore these components
-  can continue working on the regular pages or regular pte mappings.
-
-- if a hugepage allocation fails because of memory fragmentation,
-  regular pages should be gracefully allocated instead and mixed in
-  the same vma without any failure or significant delay and without
-  userland noticing
-
-- if some task quits and more hugepages become available (either
-  immediately in the buddy or through the VM), guest physical memory
-  backed by regular pages should be relocated on hugepages
-  automatically (with khugepaged)
-
-- it doesn't require memory reservation and in turn it uses hugepages
-  whenever possible (the only possible reservation here is kernelcore=
-  to avoid unmovable pages to fragment all the memory but such a tweak
-  is not specific to transparent hugepage support and it's a generic
-  feature that applies to all dynamic high order allocations in the
-  kernel)
-
-get_user_pages and follow_page
-==============================
-
-get_user_pages and follow_page, if run on a hugepage, will return the
-head or tail pages as usual (exactly as they would do on
-hugetlbfs). Most GUP users will only care about the actual physical
-address of the page and its temporary pinning to release after the I/O
-is complete, so they won't ever notice the fact the page is huge.
-But if any driver is going to mangle over the page structure of the tail
-page (like for checking page->mapping or other bits that are relevant
-for the head page and not the tail page), it should be updated to check
-the head page instead. Taking a reference on any head/tail page would
-prevent the page from being split by anyone.
-
-.. note::
-   these aren't new constraints to the GUP API, and they match the
-   same constraints that apply to hugetlbfs too, so any driver capable
-   of handling GUP on hugetlbfs will also work fine on transparent
-   hugepage backed mappings.
-
-Graceful fallback
-=================
-
-Code walking pagetables but unaware about huge pmds can simply call
-split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
-pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_pmd where
-missing after pmd_offset returns the pmd. Thanks to the graceful
-fallback design, with a one liner change, you can avoid writing
-hundreds if not thousands of lines of complex code to make your code
-hugepage aware.
-
-If you're not walking pagetables but you run into a physical hugepage
-that you can't handle natively in your code, you can split it by
-calling split_huge_page(page). This is what the Linux VM does before
-it tries to swap out the hugepage, for example. split_huge_page() can fail
-if the page is pinned and you must handle this correctly.
-
-Example to make mremap.c transparent hugepage aware with a one liner
-change::
-
-	diff --git a/mm/mremap.c b/mm/mremap.c
-	--- a/mm/mremap.c
-	+++ b/mm/mremap.c
-	@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
-			return NULL;
-
-		pmd = pmd_offset(pud, addr);
-	+	split_huge_pmd(vma, pmd, addr);
-		if (pmd_none_or_clear_bad(pmd))
-			return NULL;
-
-Locking in hugepage aware code
-==============================
-
-We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_pmd() has a cost.
-
-To make pagetable walks huge pmd aware, all you need to do is to call
-pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
-mmap_lock in read (or write) mode to be sure a huge pmd cannot be
-created from under you by khugepaged (khugepaged collapse_huge_page
-takes the mmap_lock in write mode in addition to the anon_vma lock). If
-pmd_trans_huge returns false, you just fall back to the old code
-paths. If instead pmd_trans_huge returns true, you have to take the
-page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
-page table lock will prevent the huge pmd from being converted into a
-regular pmd from under you (split_huge_pmd can run in parallel to the
-pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page table lock and fall back to the old code as
-before. Otherwise, you can proceed to process the huge pmd and the
-hugepage natively. Once finished, you can drop the page table lock.
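-
-A sketch of that locking pattern (illustrative only -- real walkers add
-error handling and do actual work where the comments are)::
-
-	pmd_t *pmd = pmd_offset(pud, addr);
-	spinlock_t *ptl;
-
-	/* mmap_lock is held, so khugepaged cannot create a huge pmd here. */
-	if (pmd_trans_huge(*pmd)) {
-		ptl = pmd_lock(mm, pmd);
-		if (pmd_trans_huge(*pmd)) {
-			/* still huge: operate on the huge pmd under ptl */
-		} else {
-			/* split under us: after unlocking, use pte paths */
-		}
-		spin_unlock(ptl);
-	} else {
-		/* regular pmd: use the regular pte-based code paths */
-	}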
-
-Refcounts and transparent huge pages
-====================================
-
-Refcounting on THP is mostly consistent with refcounting on other compound
-pages:
-
-  - get_page()/put_page() and GUP operate on the head page's ->_refcount.
-
-  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
-    succeeds on tail pages.
-
-  - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
-    on the relevant sub-page of the compound page.
-
-  - map/unmap of the whole compound page is accounted for in compound_mapcount
-    (stored in the first tail page). For file huge pages, we also increment
-    ->_mapcount of all sub-pages in order to have race-free detection of
-    the last unmap of subpages.
-
-PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
-
-For anonymous pages, PageDoubleMap() also indicates that ->_mapcount in all
-subpages is offset up by one. This additional reference is required to
-get race-free detection of unmap of subpages when we have them mapped with
-both PMDs and PTEs.
-
-This optimization is required to lower the overhead of per-subpage mapcount
-tracking. The alternative is to alter ->_mapcount in all subpages on each
-map/unmap of the whole compound page.
-
-For anonymous pages, we set PG_double_map when a PMD of the page is split
-for the first time, but still have a PMD mapping. The additional references
-go away with the last compound_mapcount.
-
-File pages get PG_double_map set on the first map of the page with PTE, and
-it goes away when the page gets evicted from the page cache.
-
-split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the page
-structures. It can be done easily for refcounts taken by page table
-entries, but we don't have enough information on how to distribute any
-additional pins (i.e. from get_user_pages). split_huge_page() fails any
-requests to split pinned huge pages: it expects the page count to be equal
-to the sum of the mapcount of all sub-pages plus one (the split_huge_page
-caller must have a reference to the head page).
-
-split_huge_page uses migration entries to stabilize page->_refcount and
-page->_mapcount of anonymous pages. File pages just get unmapped.
-
-We are safe against physical memory scanners too: the only legitimate way
-a scanner can get a reference to a page is get_page_unless_zero().
-
-All tail pages have zero ->_refcount until atomic_add(). This prevents the
-scanner from getting a reference to the tail page up to that point. After the
-atomic_add() we don't care about the ->_refcount value. We already know how
-many references should be uncharged from the head page.
-
-For the head page get_page_unless_zero() will succeed and we don't mind. It's
-clear where references should go after split: they will stay on the head page.
-
-Note that split_huge_pmd() doesn't have any limitations on refcounting:
-a pmd can be split at any point and never fails.
-
-Partial unmap and deferred_split_huge_page()
-============================================
-
-Unmapping part of a THP (with munmap() or another way) is not going to free
-memory immediately. Instead, we detect that a subpage of the THP is not in
-use in page_remove_rmap() and queue the THP for splitting if memory pressure
-comes. Splitting will free up unused subpages.
-
-Splitting the page right away is not an option due to the locking context in
-the place where we can detect partial unmap. It also might be
-counterproductive since in many cases partial unmap happens during exit(2) if
-a THP crosses a VMA boundary.
-
-The function deferred_split_huge_page() is used to queue a page for splitting.
-The splitting itself will happen when we get memory pressure via the shrinker
-interface.
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
deleted file mode 100644
index b280367d6a44..000000000000
--- a/Documentation/vm/unevictable-lru.rst
+++ /dev/null
@@ -1,554 +0,0 @@
-.. _unevictable_lru:
-
-==============================
-Unevictable LRU Infrastructure
-==============================
-
-.. contents:: :local:
-
-
-Introduction
-============
-
-This document describes the Linux memory manager's "Unevictable LRU"
-infrastructure and the use of this to manage several types of "unevictable"
-pages.
-
-The document attempts to provide the overall rationale behind this mechanism
-and the rationale for some of the design decisions that drove the
-implementation. The latter design rationale is discussed in the context of an
-implementation description. Admittedly, one can obtain the implementation
-details - the "what does it do?" - by reading the code. One hopes that the
-descriptions below add value by providing the answer to "why does it do that?".
-
-
-
-The Unevictable LRU
-===================
-
-The Unevictable LRU facility adds an additional LRU list to track unevictable
-pages and to hide these pages from vmscan. This mechanism is based on a patch
-by Larry Woodman of Red Hat to address several scalability problems with page
-reclaim in Linux. The problems have been observed at customer sites on large
-memory x86_64 systems.
-
-To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
-main memory will have over 32 million 4k pages in a single node. When a large
-fraction of these pages are not evictable for any reason [see below], vmscan
-will spend a lot of time scanning the LRU lists looking for the small fraction
-of pages that are evictable. This can result in a situation where all CPUs are
-spending 100% of their time in vmscan for hours or days on end, with the
-system completely unresponsive.
-
-The unevictable list addresses the following classes of unevictable pages:
-
- * Those owned by ramfs.
-
- * Those mapped into SHM_LOCK'd shared memory regions.
-
- * Those mapped into VM_LOCKED [mlock()ed] VMAs.
-
-The infrastructure may also be able to handle other conditions that make pages
-unevictable, either by definition or by circumstance, in the future.
-
-
-The Unevictable LRU Page List
------------------------------
-
-The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a
-companion to the LRU-ordered anonymous and file, active and inactive page lists;
-and now it is not even a page list. But following familiar convention, here in
-this document and in the source, we often imagine it as a fifth LRU page list.
-
-The Unevictable LRU infrastructure consists of an additional, per-node, LRU list
-called the "unevictable" list and an associated page flag, PG_unevictable, to
-indicate that the page is being managed on the unevictable list.
-
-The PG_unevictable flag is analogous to, and mutually exclusive with, the
-PG_active flag in that it indicates on which LRU list a page resides when
-PG_lru is set.
-
-The Unevictable LRU infrastructure maintains unevictable pages as if they were
-on an additional LRU list for a few reasons:
-
- (1) We get to "treat unevictable pages just like we treat other pages in the
-     system - which means we get to use the same code to manipulate them, the
-     same code to isolate them (for migrate, etc.), the same code to keep track
-     of the statistics, etc..." [Rik van Riel]
-
- (2) We want to be able to migrate unevictable pages between nodes for memory
-     defragmentation, workload management and memory hotplug. The Linux kernel
-     can only migrate pages that it can successfully isolate from the LRU
-     lists (or "Movable" pages: outside of consideration here).
-     If we were to maintain pages elsewhere than on an LRU-like list, where
-     they can be detected by isolate_lru_page(), we would prevent their
-     migration.
-
-The unevictable list does not differentiate between file-backed and anonymous,
-swap-backed pages. This differentiation is only important while the pages are,
-in fact, evictable.
-
-The unevictable list benefits from the "arrayification" of the per-node LRU
-lists and statistics originally proposed and posted by Christoph Lameter.
-
-
-Memory Control Group Interaction
---------------------------------
-
-The unevictable LRU facility interacts with the memory control group [aka
-memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by
-extending the lru_list enum.
-
-The memory controller data structure automatically gets a per-node unevictable
-list as a result of the "arrayification" of the per-node LRU lists (one per
-lru_list enum element). The memory controller tracks the movement of pages to
-and from the unevictable list.
-
-When a memory control group comes under memory pressure, the controller will
-not attempt to reclaim pages on the unevictable list. This has a couple of
-effects:
-
- (1) Because the pages are "hidden" from reclaim on the unevictable list, the
-     reclaim process can be more efficient, dealing only with pages that have a
-     chance of being reclaimed.
-
- (2) On the other hand, if too many of the pages charged to the control group
-     are unevictable, the evictable portion of the working set of the tasks in
-     the control group may not fit into the available memory. This can cause
-     the control group to thrash or to OOM-kill tasks.
-
-
-.. _mark_addr_space_unevict:
-
-Marking Address Spaces Unevictable
-----------------------------------
-
-For facilities such as ramfs, none of the pages attached to the address space
-may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE
-address space flag is provided, and this can be manipulated by a filesystem
-using a number of wrapper functions:
-
- * ``void mapping_set_unevictable(struct address_space *mapping);``
-
-	Mark the address space as being completely unevictable.
-
- * ``void mapping_clear_unevictable(struct address_space *mapping);``
-
-	Mark the address space as being evictable.
-
- * ``int mapping_unevictable(struct address_space *mapping);``
-
-	Query the address space, and return true if it is completely
-	unevictable.
-
-These are currently used in three places in the kernel:
-
- (1) By ramfs to mark the address spaces of its inodes when they are created,
-     and this mark remains for the life of the inode.
-
- (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
-     Note that SHM_LOCK is not required to page in the locked pages if they're
-     swapped out; the application must touch the pages manually if it wants to
-     ensure they're in memory.
-
- (3) By the i915 driver to mark pinned address space until it's unpinned. The
-     amount of unevictable memory marked by i915 driver is roughly the bounded
-     object size in debugfs/dri/0/i915_gem_objects.
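-
-As a hedged sketch of case (1), a filesystem can mark an inode's pages
-unevictable at inode creation time, in the style of ramfs (the helper name
-below is hypothetical, and most inode setup is elided)::
-
-	static struct inode *example_get_inode(struct super_block *sb,
-					       umode_t mode)
-	{
-		struct inode *inode = new_inode(sb);
-
-		if (inode) {
-			inode->i_mode = mode;
-			/* Pages of this mapping are never reclaimed. */
-			mapping_set_unevictable(inode->i_mapping);
-		}
-		return inode;
-	}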
-
-
-Detecting Unevictable Pages
----------------------------
-
-The function page_evictable() in mm/internal.h determines whether a page is
-evictable or not using the query function outlined above [see section
-:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
-to check the AS_UNEVICTABLE flag.
-
-For address spaces that are so marked after being populated (as SHM regions
-might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
-the page tables for the region as does, for example, mlock(), nor need it make
-any special effort to push any pages in the SHM_LOCK'd area to the unevictable
-list. Instead, vmscan will do this if and when it encounters the pages during
-a reclamation scan.
-
-On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must
-scan the pages in the region and "rescue" them from the unevictable list if no
-other condition is keeping them unevictable. If an unevictable region is
-destroyed, the pages are also "rescued" from the unevictable list in the
-process of freeing them.
-
-page_evictable() also checks for mlocked pages by testing an additional page
-flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
-faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
-
-
-Vmscan's Handling of Unevictable Pages
---------------------------------------
-
-If unevictable pages are culled in the fault path, or moved to the unevictable
-list at mlock() or mmap() time, vmscan will not encounter the pages until they
-have become evictable again (via munlock() for example) and have been "rescued"
-from the unevictable list. However, there may be situations where we decide,
-for the sake of expediency, to leave an unevictable page on one of the regular
-active/inactive LRU lists for vmscan to deal with. vmscan checks for such
-pages in all of the shrink_{active|inactive|page}_list() functions and will
-"cull" such pages that it encounters: that is, it diverts those pages to the
-unevictable list for the memory cgroup and node being scanned.
-
-There may be situations where a page is mapped into a VM_LOCKED VMA, but the
-page is not marked as PG_mlocked. Such pages will make it all the way to
-shrink_active_list() or shrink_page_list() where they will be detected when
-vmscan walks the reverse map in page_referenced() or try_to_unmap(). The page
-is culled to the unevictable list when it is released by the shrinker.
-
-To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
-using putback_lru_page() - the inverse operation to isolate_lru_page() - after
-dropping the page lock. Because the condition which makes the page unevictable
-may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the
-unevictable state of a page before placing it on the unevictable list.
-
-
-MLOCKED Pages
-=============
-
-The unevictable page list is also useful for mlock(), in addition to ramfs and
-SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in
-NOMMU situations, all mappings are effectively mlocked.
-
-
-History
--------
-
-The "Unevictable mlocked Pages" infrastructure is based on work originally
-posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
-Nick posted his patch as an alternative to a patch posted by Christoph Lameter
-to achieve the same objective: hiding mlocked pages from vmscan.
-
-In Nick's patch, he used one of the struct page LRU list link fields as a count
-of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years
-earlier). But this use of the link field for a count prevented the management
-of the pages on an LRU list, and thus mlocked pages were not migratable as
-isolate_lru_page() could not detect them, and the LRU list link field was not
-available to the migration subsystem.
-
-Nick resolved this by putting mlocked pages back on the LRU list before
-attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When
-Nick's patch was integrated with the Unevictable LRU work, the count was
-replaced by walking the reverse map when munlocking, to determine whether any
-other VM_LOCKED VMAs still mapped the page.
-
-However, walking the reverse map for each page when munlocking was ugly and
-inefficient, and could lead to catastrophic contention on a file's rmap lock,
-when many processes which had it mlocked were trying to exit. In 5.18, the
-idea of keeping mlock_count in the Unevictable LRU list link field was revived
-and put to work, without preventing the migration of mlocked pages. This is
-why the "Unevictable LRU list" cannot be a linked list of pages now; but there
-was no use for that linked list anyway - though its size is maintained for
-meminfo.
-
-
-Basic Management
-----------------
-
-mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
-pages. When such a page has been "noticed" by the memory management subsystem,
-the page is marked with the PG_mlocked flag. This can be manipulated using the
-PageMlocked() functions.
-
-A PG_mlocked page will be placed on the unevictable list when it is added to
-the LRU. Such pages can be "noticed" by memory management in several places:
-
- (1) in the mlock()/mlock2()/mlockall() system call handlers;
-
- (2) in the mmap() system call handler when mmapping a region with the
-     MAP_LOCKED flag;
-
- (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
-     flag;
-
- (4) in the fault path and when a VM_LOCKED stack segment is expanded; or
-
- (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
-     reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap().
-
-mlocked pages become unlocked and rescued from the unevictable list when:
-
- (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
-
- (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
-     unmapping at task exit;
-
- (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
-     or
-
- (4) before a page is COW'd in a VM_LOCKED VMA.
-
-
-mlock()/mlock2()/mlockall() System Call Handling
-------------------------------------------------
-
-mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup()
-for each VMA in the range specified by the call. In the case of mlockall(),
-this is the entire active address space of the task. Note that mlock_fixup()
-is used for both mlocking and munlocking a range of memory. A call to mlock()
-an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is
-treated as a no-op and mlock_fixup() simply returns.
-
-If the VMA passes some filtering as described in "Filtering Special VMAs"
-below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
-off a subset of the VMA if the range does not cover the entire VMA. Any pages
-already present in the VMA are then marked as mlocked by mlock_page() via
-mlock_pte_range() via walk_page_range() via mlock_vma_pages_range().
-
-Before returning from the system call, do_mlock() or mlockall() will call
-__mm_populate() to fault in the remaining pages via get_user_pages() and to
-mark those pages as mlocked as they are faulted.
-
-Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
-get_user_pages() will be unable to fault in the pages. That's okay.
-If pages do end up getting faulted into this VM_LOCKED VMA, they will be
-handled in the fault path - which is also how mlock2()'s MLOCK_ONFAULT areas
-are handled.
-
-For each PTE (or PMD) being faulted into a VMA, the page add rmap function
-calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED
-(unless it is a PTE mapping of a part of a transparent huge page). Or when
-it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable()
-calls mlock_new_page() instead: similar to mlock_page(), but can make better
-judgments, since this page is held exclusively and known not to be on LRU yet.
-
-mlock_page() sets PageMlocked immediately, then places the page on the CPU's
-mlock pagevec, to batch up the rest of the work to be done under lru_lock by
-__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count
-and moves the page to unevictable state ("the unevictable LRU", but with
-mlock_count in place of LRU threading). Or if the page was already PageLRU
-and PageUnevictable and PageMlocked, it simply increments the mlock_count.
-
-But in practice that may not work ideally: the page may not yet be on an LRU,
-or it may have been temporarily isolated from LRU. In such cases the
-mlock_count field cannot be touched, but will be set to 0 later when
-__pagevec_lru_add_fn() returns the page to "LRU". Races prohibit mlock_count
-from being set to 1 then: rather than risk stranding a page indefinitely as
-unevictable, always err with mlock_count on the low side, so that when
-munlocked the page will be rescued to an evictable LRU, then perhaps be
-mlocked again later if vmscan finds it in a VM_LOCKED VMA.
-
-
-Filtering Special VMAs
-----------------------
-
-mlock_fixup() filters several classes of "special" VMAs:
-
-1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind
-   these mappings are inherently pinned, so we don't need to mark them as
-   mlocked. In any case, most of the pages have no struct page in which to
-   mark them as mlocked. Because of this, get_user_pages() will fail for these
-   VMAs, so there is no sense in attempting to visit them.
-
-2) VMAs mapping hugetlbfs pages are already effectively pinned into memory. We
-   neither need nor want to mlock() these pages. But __mm_populate() includes
-   hugetlbfs ranges, allocating the huge pages and populating the PTEs.
-
-3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
-   such as the VDSO page, relay channel pages, etc. These pages are inherently
-   unevictable and are not managed on the LRU lists. __mm_populate() includes
-   these ranges, populating the PTEs if not already populated.
-
-4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate()
-   includes these ranges, populating the PTEs if not already populated.
-
-Note that for all of these special VMAs, mlock_fixup() does not set the
-VM_LOCKED flag. Therefore, we won't have to deal with them later during
-munlock(), munmap() or task exit. Neither does mlock_fixup() account these
-VMAs against the task's "locked_vm".
-
-
-munlock()/munlockall() System Call Handling
--------------------------------------------
-
-The munlock() and munlockall() system calls are handled by the same
-mlock_fixup() function as the mlock(), mlock2() and mlockall() system calls
-are. If called to munlock an already munlocked VMA, mlock_fixup() simply
-returns. Because of the VMA filtering discussed above, VM_LOCKED will not be
-set in any "special" VMAs. So, those VMAs will be ignored for munlock.
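-
-As a rough summary of that filtering (a hypothetical helper, not the
-kernel's actual predicate, which lives inline in mlock_fixup())::
-
-	static bool mlock_skips_vma(struct vm_area_struct *vma)
-	{
-		/* "Special" VMAs never get VM_LOCKED set by mlock_fixup(). */
-		return (vma->vm_flags &
-			(VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_MIXEDMAP)) ||
-		       is_vm_hugetlb_page(vma);
-	}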
-
-If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
-specified range. All pages in the VMA are then munlocked by munlock_page() via
-mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same
-function used when mlocking a VMA range, with new flags for the VMA indicating
-that munlock() is being performed.
-
-munlock_page() uses the mlock pagevec to batch up work to be done under
-lru_lock by __munlock_page(). __munlock_page() decrements the page's
-mlock_count, and when that reaches 0 it clears PageMlocked and clears
-PageUnevictable, moving the page from unevictable state to inactive LRU.
-
-But in practice that may not work ideally: the page may not yet have reached
-"the unevictable LRU", or it may have been temporarily isolated from it. In
-those cases its mlock_count field is unusable and must be assumed to be 0: so
-that the page will be rescued to an evictable LRU, then perhaps be mlocked
-again later if vmscan finds it in a VM_LOCKED VMA.
-
-
-Migrating MLOCKED Pages
------------------------
-
-A page that is being migrated has been isolated from the LRU lists and is held
-locked across unmapping of the page, updating the page's address space entry
-and copying the contents and state, until the page table entry has been
-replaced with an entry that refers to the new page. Linux supports migration
-of mlocked pages and other unevictable pages. PG_mlocked is cleared from the
-old page when it is unmapped from the last VM_LOCKED VMA, and set when the
-new page is mapped in place of the migration entry in a VM_LOCKED VMA. If the
-page was unevictable because mlocked, PG_unevictable follows PG_mlocked; but
-if the page was unevictable for other reasons, PG_unevictable is copied
-explicitly.
-
-Note that page migration can race with mlocking or munlocking of the same
-page. There is mostly no problem since page migration requires unmapping all
-PTEs of the old page (including munlock where VM_LOCKED), then mapping in the
-new page (including mlock where VM_LOCKED). The page table locks provide
-sufficient synchronization.
-
-However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA,
-before mlocking any pages already present, if one of those pages were migrated
-before mlock_pte_range() reached it, it would get counted twice in mlock_count.
-To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO,
-so that mlock_vma_page() will skip it.
-
-To complete page migration, we place the old and new pages back onto the LRU
-afterwards. The "unneeded" page - old page on success, new page on failure -
-is freed when the reference count held by the migration process is released.
-
-
-Compacting MLOCKED Pages
-------------------------
-
-The memory map can be scanned for compactable regions and the default behavior
-is to let unevictable pages be moved. /proc/sys/vm/compact_unevictable_allowed
-controls this behavior (see Documentation/admin-guide/sysctl/vm.rst). The work
-of compaction is mostly handled by the page migration code and the same work
-flow as described in Migrating MLOCKED Pages will apply.
-
-
-MLOCKING Transparent Huge Pages
--------------------------------
-
-A transparent huge page is represented by a single entry on an LRU list.
-Therefore, we can only make unevictable an entire compound page, not
-individual subpages.
-
-If a user tries to mlock() part of a huge page, and no user mlock()s the
-whole of the huge page, we want the rest of the page to be reclaimable.
-
-We cannot just split the page on partial mlock() as split_huge_page() can
-fail and a new intermittent failure mode for the syscall is undesirable.
-
-We handle this by keeping PTE-mlocked huge pages on evictable LRU lists:
-the PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
-
-This way the huge page is accessible for vmscan. Under memory pressure the
-page will be split, subpages which belong to VM_LOCKED VMAs will be moved
-to the unevictable LRU and the rest can be reclaimed.
-
-/proc/meminfo's Unevictable and Mlocked amounts do not include those parts
-of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs.
-
-
-mmap(MAP_LOCKED) System Call Handling
--------------------------------------
-
-In addition to the mlock(), mlock2() and mlockall() system calls, an
-application can request that a region of memory be mlocked by supplying the
-MAP_LOCKED flag to the mmap() call. There is one important and subtle
-difference here, though. mmap() + mlock() will fail if the range cannot be
-faulted in (e.g. because mm_populate fails), returning ENOMEM, while
-mmap(MAP_LOCKED) will not fail. The mmapped area will still have properties
-of the locked area - pages will not get swapped out - but major page faults
-to fault memory in might still happen.
-
-Furthermore, any mmap() call or brk() call that expands the heap by a task
-that has previously called mlockall() with the MCL_FUTURE flag will result
-in the newly mapped memory being mlocked. Before the unevictable/mlock
-changes, the kernel simply called make_pages_present() to allocate pages
-and populate the page table.
-
-To mlock a range of memory under the unevictable/mlock infrastructure,
-the mmap() handler and task address space expansion functions call
-populate_vma_page_range() specifying the vma and the address range to mlock.
-
-
-munmap()/exit()/exec() System Call Handling
--------------------------------------------
-
-When unmapping an mlocked region of memory, whether by an explicit call to
-munmap() or via an internal unmap from exit() or exec() processing, we must
-munlock the pages if we're removing the last VM_LOCKED VMA that maps the
-pages. Before the unevictable/mlock changes, mlocking did not mark the pages
-in any way, so unmapping them required no processing.
-
-For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
-munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
-(unless it was a PTE mapping of a part of a transparent huge page).
-
-munlock_page() uses the mlock pagevec to batch up work to be done under
-lru_lock by __munlock_page(). __munlock_page() decrements the page's
-mlock_count, and when that reaches 0 it clears PageMlocked and clears
-PageUnevictable, moving the page from unevictable state to inactive LRU.
-
-But in practice that may not work ideally: the page may not yet have reached
-"the unevictable LRU", or it may have been temporarily isolated from it. In
-those cases its mlock_count field is unusable and must be assumed to be 0: so
-that the page will be rescued to an evictable LRU, then perhaps be mlocked
-again later if vmscan finds it in a VM_LOCKED VMA.
-
-
-Truncating MLOCKED Pages
-------------------------
-
-File truncation or hole punching forcibly unmaps the deleted pages from
-userspace; truncation even unmaps and deletes any private anonymous pages
-which had been Copied-On-Write from the file pages now being truncated.
- -Mlocked pages can be munlocked and deleted in this way: like with munmap(), -for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED -(unless it was a PTE mapping of a part of a transparent huge page). - -However, if there is a racing munlock(), since mlock_vma_pages_range() starts -munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages -present, if one of those pages were unmapped by truncation or hole punch before -mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, -and would not be counted out of mlock_count. In this rare case, a page may -still appear as PageMlocked after it has been fully unmapped: and it is left to -release_pages() (or __page_cache_release()) to clear it and update statistics -before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, -which is usually 0). - - -Page Reclaim in shrink_*_list() -------------------------------- - -vmscan's shrink_active_list() culls any obviously unevictable pages - -i.e. !page_evictable(page) pages - diverting those to the unevictable list. -However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive LRU lists. Note that these pages do not have PageUnevictable -set - otherwise they would be on the unevictable list and shrink_active_list() -would never see them. - -Some examples of these unevictable pages on the LRU lists are: - - (1) ramfs pages that have been placed on the LRU lists when first allocated. - - (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to - allocate or fault in the pages in the shared memory region. This happens - when an application accesses the page the first time after SHM_LOCK'ing - the segment. - - (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, - but events left mlock_count too low, so they were munlocked too early. - -vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously -unevictable pages found on the inactive lists to the appropriate memory cgroup -and node unevictable list. - -rmap's page_referenced_one(), called via vmscan's shrink_active_list() or -shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), -check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() -to correct them. Such pages are culled to the unevictable list when released -by the shrinker. diff --git a/Documentation/vm/vmalloc.rst b/Documentation/vm/vmalloc.rst deleted file mode 100644 index 363fe20d6b9f..000000000000 --- a/Documentation/vm/vmalloc.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================================== -Virtually Contiguous Memory Allocation -====================================== diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/vm/vmalloced-kernel-stacks.rst deleted file mode 100644 index fc8c67833af6..000000000000 --- a/Documentation/vm/vmalloced-kernel-stacks.rst +++ /dev/null @@ -1,153 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -===================================== -Virtually Mapped Kernel Stack Support -===================================== - -:Author: Shuah Khan - -.. 
-contents:: :local:
-
-Overview
---------
-
-This is a compilation of information from the code and the original patch
-series that introduced the `Virtually Mapped Kernel Stacks feature
-`
-
-Introduction
-------------
-
-Kernel stack overflows are often hard to debug and make the kernel
-susceptible to exploits. Problems could show up at a later time making
-it difficult to isolate and root-cause.
-
-Virtually mapped kernel stacks with guard pages cause kernel stack
-overflows to be caught immediately rather than causing difficult to
-diagnose corruptions.
-
-The HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
-support for virtually mapped stacks with guard pages. This feature
-causes reliable faults when the stack overflows. The usability of
-the stack trace after overflow and the response to the overflow itself
-are architecture dependent.
-
-.. note::
-   As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
-   support for VMAP_STACK.
-
-HAVE_ARCH_VMAP_STACK
---------------------
-
-Architectures that can support Virtually Mapped Kernel Stacks should
-enable this bool configuration option. The requirements are:
-
-- vmalloc space must be large enough to hold many kernel stacks. This
-  may rule out many 32-bit architectures.
-- Stacks in vmalloc space need to work reliably. For example, if
-  vmap page tables are created on demand, either this mechanism
-  needs to work while the stack points to a virtual address with
-  unpopulated page tables or arch code (switch_to() and switch_mm(),
-  most likely) needs to ensure that the stack's page table entries
-  are populated before running on a possibly unpopulated stack.
-- If the stack overflows into a guard page, something reasonable
-  should happen. The definition of "reasonable" is flexible, but
-  instantly rebooting without logging anything would be unfriendly.
-
-VMAP_STACK
-----------
-
-The VMAP_STACK bool configuration option, when enabled, allocates virtually
-mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
-
-- Enable this if you want to use virtually-mapped kernel stacks
-  with guard pages. This causes kernel stack overflows to be caught
-  immediately rather than causing difficult-to-diagnose corruption.
-
-.. note::
-
-   Using this feature with KASAN requires architecture support
-   for backing virtual mappings with real shadow memory, and
-   KASAN_VMALLOC must be enabled.
-
-.. note::
-
-   With VMAP_STACK enabled, it is not possible to run DMA on stack
-   allocated data.
-
-Kernel configuration options and dependencies keep changing. Refer to
-the latest code base:
-
-`Kconfig `
-
-Allocation
------------
-
-When a new kernel thread is created, its thread stack is allocated from
-virtually contiguous memory pages from the page level allocator. These
-pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
-protections.
-
-alloc_thread_stack_node() calls __vmalloc_node_range() to allocate the stack
-with PAGE_KERNEL protections; see the sketch after this list.
-
-- Allocated stacks are cached and later reused by new threads, so memcg
-  accounting is performed manually on assigning/releasing stacks to tasks.
-  Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
-- vm_struct is cached to be able to find when thread free is initiated
-  in interrupt context. free_thread_stack() can be called in interrupt
-  context.
-- On arm64, all VMAP's stacks need to have the same alignment to ensure
-  that VMAP'd stack overflow detection works correctly. Arch specific
-  vmap stack allocator takes care of this detail.
-- This does not address interrupt stacks - according to the original patch
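-
-A sketch of that allocation call, modeled on kernel/fork.c (simplified to
-the cache-miss path only, with error handling elided)::
-
-	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
-				     VMALLOC_START, VMALLOC_END,
-				     THREADINFO_GFP & ~__GFP_ACCOUNT,
-				     PAGE_KERNEL, 0, node,
-				     __builtin_return_address(0));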
-
-Thread stack allocation is initiated from clone(), fork(), vfork() and
-kernel_thread() via kernel_clone(). These are a few hints for searching
-the code base to understand when and how a thread stack is allocated.
-
-The bulk of the code is in:
-`kernel/fork.c `.
-
-The stack_vm_area pointer in task_struct keeps track of the virtually
-allocated stack, and a non-null stack_vm_area pointer serves as an indication
-that virtually mapped kernel stacks are enabled.
-
-::
-
-	struct vm_struct *stack_vm_area;
-
-Stack overflow handling
------------------------
-
-Leading and trailing guard pages help detect stack overflows. When the stack
-overflows into the guard pages, handlers have to be careful not to overflow
-the stack again. When handlers are called, it is likely that very little
-stack space is left.
-
-On x86, this is done by handling the page fault indicating the kernel
-stack overflow on the double-fault stack.
-
-Testing VMAP allocation with guard pages
-----------------------------------------
-
-How do we ensure that VMAP_STACK is actually allocating with a leading
-and trailing guard page? The following lkdtm tests can help detect any
-regressions.
-
-::
-
-	void lkdtm_STACK_GUARD_PAGE_LEADING()
-	void lkdtm_STACK_GUARD_PAGE_TRAILING()
-
-Conclusions
------------
-
-- A percpu cache of vmalloced stacks appears to be a bit faster than a
-  high-order stack allocation, at least when the cache hits.
-- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
-  simply embeds the thread_info (containing only flags) and 'int cpu' into
-  task_struct.
-- The thread stack can be freed as soon as the task is dead (without
-  waiting for RCU) and then, if vmapped stacks are in use, cache the
-  entire stack for reuse on the same cpu.
diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst
deleted file mode 100644
index c9c495f62d12..000000000000
--- a/Documentation/vm/vmemmap_dedup.rst
+++ /dev/null
@@ -1,223 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=========================================
-A vmemmap diet for HugeTLB and Device DAX
-=========================================
-
-HugeTLB
-=======
-
-The struct page structures (page structs) are used to describe a physical
-page frame. By default, there is a one-to-one mapping from a page frame to
-its corresponding page struct.
-
-HugeTLB pages consist of multiple base page size pages and are supported by
-many architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
-details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
-currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB
-page consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base
-pages. For each base page, there is a corresponding page struct.
-
-Within the HugeTLB subsystem, only the first 4 page structs are used to
-contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
-this upper limit. The only 'useful' information in the remaining page structs
-is the compound_head field, and this field is the same for all tail pages.
-
-By removing redundant page structs for HugeTLB pages, memory can be returned
-to the buddy allocator for other uses.
-
-Different architectures support different HugeTLB pages. For example, the
-following table shows the HugeTLB page sizes supported by the x86 and arm64
-architectures.
-Because arm64 supports 4K, 16K, and 64K base pages, as well as contiguous
-entries, it supports many kinds of HugeTLB page sizes.
-
-+--------------+-----------+-----------------------------------------------+
-| Architecture | Page Size |                HugeTLB Page Size              |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-|    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-|              |    4KB    |   64KB    |    2MB    |    32MB   |    1GB    |
-|              +-----------+-----------+-----------+-----------+-----------+
-|    arm64     |   16KB    |    2MB    |    32MB   |     1GB   |           |
-|              +-----------+-----------+-----------+-----------+-----------+
-|              |   64KB    |    2MB    |   512MB   |    16GB   |           |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-
-When the system boots up, every HugeTLB page has more than one struct page
-struct, whose total size (in pages) is::
-
-   struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
-
-Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
-of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
-relationship::
-
-   HugeTLB_Size = n * PAGE_SIZE
-
-Then::
-
-   struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
-               = n * sizeof(struct page) / PAGE_SIZE
-
-We can use huge mapping at the pud/pmd level for the HugeTLB page.
-
-For the HugeTLB page of the pmd level mapping, then::
-
-   struct_size = n * sizeof(struct page) / PAGE_SIZE
-               = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
-               = sizeof(struct page) / sizeof(pte_t)
-               = 64 / 8
-               = 8 (pages)
-
-Where n is the number of pte entries that one page can contain, i.e. the
-value of n is (PAGE_SIZE / sizeof(pte_t)).
-
-This optimization only supports 64-bit systems, so the value of sizeof(pte_t)
-is 8. And this optimization is also applicable only when the size of struct
-page is a power of two. In most cases, the size of struct page is 64 bytes
-(e.g. x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page,
-its struct page structs occupy 8 page frames, whose size depends on the size
-of the base page.
-
-For the HugeTLB page of the pud level mapping, then::
-
-   struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
-               = PAGE_SIZE / 8 * 8 (pages)
-               = PAGE_SIZE (pages)
-
-Where the struct_size(pmd) is the size of the struct page structs of a
-HugeTLB page of the pmd level mapping.
-
-E.g.: the struct page structs of a 2MB HugeTLB page on x86_64 occupy 8 page
-frames, while those of a 1GB HugeTLB page occupy 4096.
-
-Next, we take the pmd level mapping of the HugeTLB page as an example to
-show the internal implementation of this optimization. There are 8 pages of
-struct page structs associated with a HugeTLB page which is pmd mapped.
-
-Here is how things look before optimization::
-
-    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | -------------> |     1     |
- |           |                     +-----------+                +-----------+
- |           |                     |     2     | -------------> |     2     |
- |           |                     +-----------+                +-----------+
- |           |                     |     3     | -------------> |     3     |
- |           |                     +-----------+                +-----------+
- |           |                     |     4     | -------------> |     4     |
- |    PMD    |                     +-----------+                +-----------+
- |   level   |                     |     5     | -------------> |     5     |
- |  mapping  |                     +-----------+                +-----------+
- |           |                     |     6     | -------------> |     6     |
- |           |                     +-----------+                +-----------+
- |           |                     |     7     | -------------> |     7     |
- |           |                     +-----------+                +-----------+
- |           |
- |           |
- |           |
- +-----------+
-
-The value of page->compound_head is the same for all tail pages. The first
-page of page structs (page 0) associated with the HugeTLB page contains the 4
-page structs necessary to describe the HugeTLB. The only use of the remaining
-pages of page structs (page 1 to page 7) is to point to page->compound_head.
-Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
-will be used for each HugeTLB page. This will allow us to free the remaining
-7 pages to the buddy allocator.
-
-Here is how things look after remapping::
-
-    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
- |           |                     +-----------+                  | | | | | |
- |           |                     |     2     | -----------------+ | | | | |
- |           |                     +-----------+                    | | | | |
- |           |                     |     3     | -------------------+ | | | |
- |           |                     +-----------+                      | | | |
- |           |                     |     4     | ---------------------+ | | |
- |    PMD    |                     +-----------+                        | | |
- |   level   |                     |     5     | -----------------------+ | |
- |  mapping  |                     +-----------+                          | |
- |           |                     |     6     | -------------------------+ |
- |           |                     +-----------+                            |
- |           |                     |     7     | ---------------------------+
- |           |                     +-----------+
- |           |
- |           |
- |           |
- +-----------+
-
-When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
-vmemmap pages and restore the previous mapping relationship.
-
-For the HugeTLB page of the pud level mapping, it is similar to the former:
-we can also use this approach to free (PAGE_SIZE - 1) vmemmap pages.
-
-Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
-(e.g. aarch64) provide a contiguous bit in the translation table entries
-that hints to the MMU to indicate that it is one of a contiguous set of
-entries that can be cached in a single TLB entry.
-
-The contiguous bit is used to increase the mapping size at the pmd and pte
-(last) level. So this type of HugeTLB page can be optimized only when the
-size of its struct page structs is greater than 1 page.
-
-Notice: The head vmemmap page is not freed to the buddy allocator and all
-tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
-more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
-associated with each HugeTLB page. The compound_head() can handle this
-correctly (for more details, refer to the comment above compound_head()).
-
-Device DAX
-==========
-
-The device-dax interface uses the same tail deduplication technique explained
-in the previous chapter, except when used with the vmemmap in
-the device (altmap).
-
-The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
-PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
-
-The differences with HugeTLB are relatively minor.
-
-It only uses 3 page structs for storing all information as opposed
-to 4 on HugeTLB pages.
-
-There's no remapping of vmemmap given that device-dax memory is not part of
-System RAM ranges initialized at boot. Thus the tail page deduplication
-happens at a later stage when we populate the sections. HugeTLB reuses the
-head vmemmap page, whereas device-dax reuses the tail vmemmap page. This
-results in only half of the savings compared to HugeTLB.
-
-Deduplicated tail pages are not mapped read-only.
-
-Here is how things look on device-dax after the sections are populated::
-
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | -------------> |     1     |
- |           |                     +-----------+                +-----------+
- |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
- |           |                     +-----------+                   | | | | |
- |           |                     |     3     | ------------------+ | | | |
- |           |                     +-----------+                     | | | |
- |           |                     |     4     | --------------------+ | | |
- |    PMD    |                     +-----------+                       | | |
- |   level   |                     |     5     | ----------------------+ | |
- |  mapping  |                     +-----------+                         | |
- |           |                     |     6     | ------------------------+ |
- |           |                     +-----------+                           |
- |           |                     |     7     | --------------------------+
- |           |                     +-----------+
- |           |
- |           |
- |           |
- +-----------+
diff --git a/Documentation/vm/z3fold.rst b/Documentation/vm/z3fold.rst
deleted file mode 100644
index 224e3c61d686..000000000000
--- a/Documentation/vm/z3fold.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. _z3fold:
-
-======
-z3fold
-======
-
-z3fold is a special purpose allocator for storing compressed pages.
-It is designed to store up to three compressed pages per physical page.
-It is a zbud derivative which allows for a higher compression
-ratio while keeping the simplicity and determinism of its predecessor.
-
-The main differences between z3fold and zbud are:
-
-* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
-* z3fold can hold up to 3 compressed pages in its page
-* z3fold doesn't export any API itself and is thus intended to be used
-  via the zpool API.
-
-To keep the determinism and simplicity, z3fold, just like zbud, always
-stores an integral number of compressed pages per page, but it can store
-up to 3 pages unlike zbud which can store at most 2. Therefore the
-compression ratio goes to around 2.7x while zbud's is around 1.7x.
-
-Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
-return a dereferenceable pointer. Instead, it returns an unsigned long
-handle which encodes the actual location of the allocated object.
-
-Keeping the effective compression ratio close to zsmalloc's, z3fold doesn't
-depend on the MMU being enabled and provides more predictable reclaim
-behavior, which makes it a better fit for small and response-critical
-systems.
diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/vm/zsmalloc.rst
deleted file mode 100644
index 6e79893d6132..000000000000
--- a/Documentation/vm/zsmalloc.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-.. _zsmalloc:
-
-========
-zsmalloc
-========
-
-This allocator is designed for use with zram. Thus, the allocator is
-supposed to work well under low memory conditions. In particular, it
-never attempts higher order page allocation, which is very likely to
-fail under memory pressure. On the other hand, if we just use single
-(0-order) pages, it would suffer from very high fragmentation --
-any object of size PAGE_SIZE/2 or larger would occupy an entire page.
-This was one of the major issues with its predecessor (xvmalloc).
-
-To overcome these issues, zsmalloc allocates a bunch of 0-order pages
-and links them together using various 'struct page' fields. These linked
-pages act as a single higher-order page, i.e. an object can span 0-order
-page boundaries. The code refers to these linked pages as a single entity
-called a zspage.
-
-For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE,
-since this satisfies the requirements of all its current users (in the
-worst case, the page is incompressible and is thus stored "as-is", i.e. in
-uncompressed form). For allocation requests larger than this size, failure
-is returned (see zs_malloc).
-
-Additionally, zs_malloc() does not return a dereferenceable pointer.
-Instead, it returns an opaque handle (unsigned long) which encodes the
-actual location of the allocated object. The reason for this indirection is
-that zsmalloc does not keep zspages permanently mapped, since that would
-cause issues on 32-bit systems where the VA region for kernel space mappings
-is very small. So, before using the allocated memory, the object has to
-be mapped using zs_map_object() to get a usable pointer and subsequently
-unmapped using zs_unmap_object().
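-
-A hedged sketch of that calling pattern from a kernel-side user (error
-handling abbreviated; the pool name and sizes are arbitrary examples)::
-
-	struct zs_pool *pool = zs_create_pool("example");
-	unsigned long handle;
-	void *dst;
-
-	handle = zs_malloc(pool, len, GFP_KERNEL);
-	if (handle) {
-		/* The object is only addressable while it is mapped. */
-		dst = zs_map_object(pool, handle, ZS_MM_WO);
-		memcpy(dst, src, len);
-		zs_unmap_object(pool, handle);
-
-		zs_free(pool, handle);
-	}
-	zs_destroy_pool(pool);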
-
-stat
-====
-
-With CONFIG_ZSMALLOC_STAT, we can see zsmalloc internal information via
-``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output::
-
- # cat /sys/kernel/debug/zsmalloc/zram0/classes
-
- class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
-    ...
-    ...
-     9   176           0            1           186        129          8                4
-    10   192           1            0          2880       2872        135                3
-    11   208           0            1           819        795         42                2
-    12   224           0            1           219        159         12                4
-    ...
-    ...
-
-
-class
-	index
-size
-	object size zspage stores
-almost_empty
-	the number of ZS_ALMOST_EMPTY zspages (see below)
-almost_full
-	the number of ZS_ALMOST_FULL zspages (see below)
-obj_allocated
-	the number of objects allocated
-obj_used
-	the number of objects allocated to the user
-pages_used
-	the number of pages allocated for the class
-pages_per_zspage
-	the number of 0-order pages to make a zspage
-
-We assign a zspage to the ZS_ALMOST_EMPTY fullness group when n <= N / f,
-where
-
-* n = number of allocated objects
-* N = total number of objects zspage can store
-* f = fullness_threshold_frac (i.e. 4 at the moment)
-
-Similarly, we assign zspage to:
-
-* ZS_ALMOST_FULL  when n > N / f
-* ZS_EMPTY        when n == 0
-* ZS_FULL         when n == N
diff --git a/MAINTAINERS b/MAINTAINERS
index fe5daf141501..55fb1daa9057 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5526,7 +5526,7 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-kernel-mm-damon
 F:	Documentation/admin-guide/mm/damon/
-F:	Documentation/vm/damon/
+F:	Documentation/mm/damon/
 F:	include/linux/damon.h
 F:	include/trace/events/damon.h
 F:	mm/damon/
@@ -9037,7 +9037,7 @@ HMM - Heterogeneous Memory Management
M:	Jérôme Glisse
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/hmm.rst
+F:	Documentation/mm/hmm.rst
 F:	include/linux/hmm*
 F:	lib/test_hmm*
 F:	mm/hmm*
@@ -9135,8 +9135,8 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-kernel-mm-hugepages
 F:	Documentation/admin-guide/mm/hugetlbpage.rst
-F:	Documentation/vm/hugetlbfs_reserv.rst
-F:	Documentation/vm/vmemmap_dedup.rst
+F:	Documentation/mm/hugetlbfs_reserv.rst
+F:	Documentation/mm/vmemmap_dedup.rst
 F:	fs/hugetlbfs/
 F:	include/linux/hugetlb.h
 F:	mm/hugetlb.c
@@ -15072,7 +15072,7 @@ M:	Pasha Tatashin
M:	Andrew Morton
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/page_table_check.rst
+F:	Documentation/mm/page_table_check.rst
 F:	include/linux/page_table_check.h
 F:	mm/page_table_check.c
@@ -22158,7 +22158,7 @@ M:	Nitin Gupta
R:	Sergey Senozhatsky
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/zsmalloc.rst
+F:	Documentation/mm/zsmalloc.rst
 F:	include/linux/zsmalloc.h
 F:	mm/zsmalloc.c
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 1920d52653b4..db2838cf8c02 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -410,7 +410,7 @@ config ARCH_SPARSEMEM_ENABLE
	  Say Y to support efficient handling of sparse physical memory,
	  for architectures which are either NUMA (Non-Uniform Memory Access)
	  or have huge holes in the physical address space for other reasons.
-	  See <file:Documentation/vm/memory-model.rst> for more.
+	  See <file:Documentation/mm/memory-model.rst> for more.

 config ARCH_ENABLE_THP_MIGRATION
	def_bool y
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb9d5fd39d7f..392ff48f77df 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1273,7 +1273,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 * should return true.
 * We should not call this on a hugetlb entry. We should check for HugeTLB
 * entry using vma->vm_flags
- * The page table walk rule is explained in Documentation/vm/transhuge.rst
+ * The page table walk rule is explained in Documentation/mm/transhuge.rst
 */
 static inline int pmd_trans_huge(pmd_t pmd)
 {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index d5a6f101f843..126a36571667 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -4,7 +4,7 @@
 *
 * Authors: Jérôme Glisse
 *
- * See Documentation/vm/hmm.rst for reasons and overview of what HMM is.
+ * See Documentation/mm/hmm.rst for reasons and overview of what HMM is.
 */
 #ifndef LINUX_HMM_H
 #define LINUX_HMM_H
@@ -100,7 +100,7 @@ struct hmm_range {
 };

 /*
- * Please see Documentation/vm/hmm.rst for how to use the range API.
+ * Please see Documentation/mm/hmm.rst for how to use the range API.
 */
 int hmm_range_fault(struct hmm_range *range);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..9f5ee49482de 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -39,7 +39,7 @@ struct vmem_altmap {
 * must be treated as an opaque object, rather than a "normal" struct page.
 *
 * A more complete discussion of unaddressable memory may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.rst.
+ * include/linux/hmm.h and Documentation/mm/hmm.rst.
 *
 * MEMORY_DEVICE_FS_DAX:
 * Host memory that has similar access semantics as System RAM i.e. DMA
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 45fc2c81e370..d6c06e140277 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -198,7 +198,7 @@ struct mmu_notifier_ops {
	 * invalidate_range_start()/end() notifiers, as
	 * invalidate_range() already catches the points in time when an
	 * external TLB range needs to be flushed. For more in depth
-	 * discussion on this see Documentation/vm/mmu_notifier.rst
+	 * discussion on this see Documentation/mm/mmu_notifier.rst
	 *
	 * Note that this function might be called with just a sub-range
	 * of what was passed to invalidate_range_start()/end(), if
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8cd975a8bfeb..2a243616f222 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -29,7 +29,7 @@ extern struct mm_struct *mm_alloc(void);
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
* - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) @@ -92,7 +92,7 @@ static inline void mmdrop_sched(struct mm_struct *mm) * * Use mmput() to release the reference acquired by mmget(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0c0fed1b348f..95a5b7aa1ae9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,7 +74,7 @@ static inline int current_is_kswapd(void) /* * Unaddressable device memory support. See include/linux/hmm.h and - * Documentation/vm/hmm.rst. Short description is we need struct pages for + * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * diff --git a/mm/Kconfig b/mm/Kconfig index 169e64192e48..c1fa4993a56f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -663,7 +663,7 @@ config KSM the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.rst for more information: KSM is inactive + See Documentation/mm/ksm.rst for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1ab091f49fc0..dc7df1254f0a 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -35,7 +35,7 @@ #include /* - * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics + * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. */ diff --git a/mm/frontswap.c b/mm/frontswap.c index 6f69b044a8cc..1a97610308cb 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -4,7 +4,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of frontswap. See - * Documentation/vm/frontswap.rst for more information. + * Documentation/mm/frontswap.rst for more information. * * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. * Author: Dan Magenheimer diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 834f288b3769..f9b90a8d7dfa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1937,7 +1937,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * replacing a zero pmd write protected page with a zero pte write * protected page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ pmdp_huge_clear_flush(vma, haddr, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a57e1be41401..b36a4ef87a2e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4875,7 +4875,7 @@ again: * table protection not changing it to point * to a new page. 
* - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); @@ -6403,7 +6403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(&range); @@ -7102,7 +7102,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) i_mmap_unlock_write(vma->vm_file->f_mapping); /* * No need to call mmu_notifier_invalidate_range(), see - * Documentation/vm/mmu_notifier.rst. + * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1089ea8a9c98..ba29c15c53d6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -6,7 +6,7 @@ * * Author: Muchun Song * - * See Documentation/vm/vmemmap_dedup.rst + * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt diff --git a/mm/ksm.c b/mm/ksm.c index 54f78c9eecae..8d2dc501c92c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1083,7 +1083,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* @@ -1186,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are replacing a read only page with another * read only page with the same content. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); diff --git a/mm/mmap.c b/mm/mmap.c index 61e6135c54ef..c14d7286a379 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2944,7 +2944,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) diff --git a/mm/rmap.c b/mm/rmap.c index 5bcb334cd6f2..65e0a767b837 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -999,7 +999,7 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * downgrading page table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ if (ret) cleaned++; @@ -1765,7 +1765,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * to point at a new folio while a device is * still using this folio. 
* - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(&folio->page)); } @@ -1775,7 +1775,7 @@ discard: * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) @@ -2093,7 +2093,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 652f11a05749..3ff88a2eefb8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -752,7 +752,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, /* * Reuse the previous page for the rest of tail pages - * See layout diagram in Documentation/vm/vmemmap_dedup.rst + * See layout diagram in Documentation/mm/vmemmap_dedup.rst */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, diff --git a/mm/util.c b/mm/util.c index 0837570c9225..5df8f2db7ca9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst + * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c149427eb1c9..74c3dcecf64d 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -8,7 +8,7 @@ * Or sort by total memory: * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt * - * See Documentation/vm/page_owner.rst + * See Documentation/mm/page_owner.rst */ #include -- cgit v1.2.3 From 3d6c1dfb1f75d856cb6247aab7c2dcca45de5cd4 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Fri, 10 Jun 2022 15:12:44 +0800 Subject: userfaultfd/selftests: Fix typo in comment Delete the redundant word 'in'. Signed-off-by: Xiang wangx Signed-off-by: Shuah Khan --- tools/testing/selftests/vm/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 0bdfc1955229..4bc24581760d 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -860,7 +860,7 @@ static int stress(struct uffd_stats *uffd_stats) /* * Be strict and immediately zap area_src, the whole area has * been transferred already by the background treads. 
The - * area_src could then be faulted in in a racy way by still + * area_src could then be faulted in a racy way by still * running uffdio_threads reading zeropages after we zapped * area_src (but they're guaranteed to get -EEXIST from * UFFDIO_COPY without writing zero pages into area_dst -- cgit v1.2.3 From 3297a4df805d4263506b6dfec4d1bbeff8862dd8 Mon Sep 17 00:00:00 2001 From: Gautam Date: Sat, 25 Jun 2022 19:24:55 +0530 Subject: kselftests: Enable the echo command to print newlines in Makefile In the install section of the main Makefile of kselftests, the echo command is used with the -n flag, which suppresses the trailing newline; since -e is not passed, the "\n" escape sequences are printed literally, so the output contains "\n" chars as follows: Emit Tests for alsa\nSkipping non-existent dir: arm64 Emit Tests for breakpoints\nEmit Tests for capabilities\n This patch fixes the above bug by also passing the -e flag. Signed-off-by: Gautam Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index de11992dc577..52e31437f1a3 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -253,7 +253,7 @@ ifdef INSTALL_PATH for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ [ ! -d $(INSTALL_PATH)/$$TARGET ] && echo "Skipping non-existent dir: $$TARGET" && continue; \ - echo -n "Emit Tests for $$TARGET\n"; \ + echo -ne "Emit Tests for $$TARGET\n"; \ $(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET COLLECTION=$$TARGET \ -C $$TARGET emit_tests >> $(TEST_LIST); \ done; -- cgit v1.2.3 From 18afe1bf05a11c1b186bcf696e7a06bfed0e5ba2 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Thu, 23 Jun 2022 09:18:32 +0530 Subject: selftests: Make the usage formatting consistent in kselftest_deps.sh Add a colon in the "Optional" test usage message to ensure consistency with the "Default" test usage message. Signed-off-by: Gautam Menghani Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index 00e60d6eb16b..708cb5429633 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -26,7 +26,7 @@ echo " main Makefile when optional -p is specified." echo "- Prints pass/fail dependency check for each tests/sub-test." echo "- Prints pass/fail targets and libraries." echo "- Default: runs dependency checks on all tests." -echo "- Optional test name can be specified to check dependencies for it." +echo "- Optional: test name can be specified to check dependencies for it." exit 1 } -- cgit v1.2.3 From 43fe0cc46b6206b25f0f13bb249f0078441ae15a Mon Sep 17 00:00:00 2001 From: Gautam Date: Sun, 26 Jun 2022 01:22:45 +0530 Subject: kselftests/damon: add support for cases where debugfs cannot be read The kernel is in lockdown mode when secureboot is enabled and hence debugfs cannot be used. Add support for this and other general cases where debugfs cannot be read, and communicate this to the user before running the tests.
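For illustration only (not part of the patch): the same readability probe expressed in C. The DAMON debugfs path and the assumption that a locked-down debugfs read fails with EPERM are illustrative, not taken from the patch.

    #include <errno.h>
    #include <stdio.h>

    /* Probe one debugfs file; skip (kselftest exit code 4) if reads are blocked. */
    int main(void)
    {
            FILE *f = fopen("/sys/kernel/debug/damon/attrs", "r"); /* illustrative path */

            if (!f && (errno == EPERM || errno == EACCES)) {
                    fprintf(stderr, "debugfs not readable; maybe secureboot enabled?\n");
                    return 4; /* matches $ksft_skip in the shell helpers */
            }
            if (f)
                    fclose(f);
            return 0;
    }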
Signed-off-by: Gautam Reviewed-by: SeongJae Park Signed-off-by: Shuah Khan --- tools/testing/selftests/damon/_chk_dependency.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh index 0189db81550b..0328ac0b5a5e 100644 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -26,3 +26,13 @@ do exit 1 fi done + +permission_error="Operation not permitted" +for f in attrs target_ids monitor_on +do + status=$( cat "$DBGFS/$f" 2>&1 ) + if [ "${status#*$permission_error}" != "$status" ]; then + echo "Permission for reading $DBGFS/$f denied; maybe secureboot enabled?" + exit $ksft_skip + fi +done -- cgit v1.2.3 From 4f3394924358fe04ced0411c72fc7eeb0d3be652 Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Tue, 14 Jun 2022 11:48:28 -0400 Subject: selftests/rseq: riscv: use rseq_get_abi() helper Make the RISC-V rseq selftests compatible with glibc-2.35 by using the rseq_get_abi() helper. Signed-off-by: Michael Jeanson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20220614154830.1367382-2-mjeanson@efficios.com --- tools/testing/selftests/rseq/rseq-riscv.h | 36 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h index b86642f90d7f..6f8a605b75c0 100644 --- a/tools/testing/selftests/rseq/rseq-riscv.h +++ b/tools/testing/selftests/rseq/rseq-riscv.h @@ -194,8 +194,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [expect] "r" (expect), [newv] "r" (newv) @@ -251,8 +251,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [expectnot] "r" (expectnot), [load] "m" (*load), @@ -301,8 +301,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [count] "r" (count) RSEQ_INJECT_INPUT @@ -352,8 +352,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -411,8 +411,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" 
(__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -472,8 +472,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [expect] "r" (expect), [v2] "m" (*v2), @@ -532,8 +532,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -593,8 +593,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -651,8 +651,8 @@ int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [ptr] "r" (ptr), [off] "er" (off), [inc] "er" (inc) -- cgit v1.2.3 From d47c0cc94a86b9098930523a9e68180bef6b26cf Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Tue, 14 Jun 2022 11:48:29 -0400 Subject: selftests/rseq: riscv: fix 'literal-suffix' warning This header is also used in librseq where it can be included in C++ code, add a space between literals and string macros. 
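For background (illustrative, not part of the patch): in C++11 a string literal immediately followed by an identifier is lexed as a user-defined literal with that identifier as its suffix, so a macro pasted directly after the closing quote is never expanded. Adding the space restores ordinary macro expansion plus literal concatenation. A minimal sketch with a stand-in macro name:

    #define TMP_REG "t6"    /* stand-in for RSEQ_ASM_TMP_REG_1 */

    /* OK in both C and C++: after expansion the adjacent literals
     * concatenate to "mv t6, zero\n". */
    static const char ok[] = "mv " TMP_REG ", zero\n";

    /* Without the space, C++11 lexes "mv "TMP_REG as a string literal
     * carrying the user-defined suffix TMP_REG, which fails to build
     * (GCC: -Wliteral-suffix); C accepts it. Hence the added spaces. */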
Signed-off-by: Michael Jeanson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20220614154830.1367382-3-mjeanson@efficios.com --- tools/testing/selftests/rseq/rseq-riscv.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h index 6f8a605b75c0..3a391c9bf468 100644 --- a/tools/testing/selftests/rseq/rseq-riscv.h +++ b/tools/testing/selftests/rseq/rseq-riscv.h @@ -86,7 +86,7 @@ do { \ #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \ RSEQ_INJECT_ASM(1) \ - "la "RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \ + "la " RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \ REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(rseq_cs) "]\n" \ __rseq_str(label) ":\n" @@ -103,17 +103,17 @@ do { \ #define RSEQ_ASM_OP_CMPEQ(var, expect, label) \ REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ - "bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ + "bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ __rseq_str(label) "\n" #define RSEQ_ASM_OP_CMPEQ32(var, expect, label) \ - "lw "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ - "bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ + "lw " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ + "bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ __rseq_str(label) "\n" #define RSEQ_ASM_OP_CMPNE(var, expect, label) \ REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ - "beq "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ + "beq " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ __rseq_str(label) "\n" #define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \ @@ -127,12 +127,12 @@ do { \ REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" #define RSEQ_ASM_OP_R_LOAD_OFF(offset) \ - "add "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \ RSEQ_ASM_TMP_REG_1 "\n" \ + "add " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \ RSEQ_ASM_TMP_REG_1 "\n" \ REG_L RSEQ_ASM_TMP_REG_1 ", (" RSEQ_ASM_TMP_REG_1 ")\n" #define RSEQ_ASM_OP_R_ADD(count) \ - "add "RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \ + "add " RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \ ", %[" __rseq_str(count) "]\n" #define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label) \ -- cgit v1.2.3 From d1a997ba4c1bf65497d956aea90de42a6398f73a Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Tue, 14 Jun 2022 11:48:30 -0400 Subject: selftests/rseq: check if libc rseq support is registered When checking for libc rseq support in the library constructor, don't depend only on the presence of the symbols; check that the registration was actually completed. This targets a scenario where the libc has rseq support but it is not wired up for the current architecture in 'bits/rseq.h'; in that case we want to fall back to our internal registration mechanism.
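Illustrative aside, not part of the patch: with glibc 2.35+ the distinction between "symbols exist" and "registration completed" can be observed directly, since a glibc that is not wired up for the architecture still exports the symbols but leaves __rseq_size at 0. A sketch, assuming the sys/rseq.h declarations:

    #include <stdio.h>
    #include <sys/rseq.h>   /* glibc 2.35+: __rseq_size, __rseq_offset */

    int main(void)
    {
            /* Symbol presence alone is not enough; a completed
             * registration is signalled by a non-zero __rseq_size. */
            printf("libc rseq registration: %s (size=%u, offset=%ld)\n",
                   __rseq_size ? "active" : "absent",
                   __rseq_size, (long)__rseq_offset);
            return 0;
    }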
Signed-off-by: Michael Jeanson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20220614154830.1367382-4-mjeanson@efficios.com --- tools/testing/selftests/rseq/rseq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 986b9458efb2..4177f9507bbe 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -111,7 +111,8 @@ void rseq_init(void) libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); - if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p) { + if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p && + *libc_rseq_size_p != 0) { /* rseq registration owned by glibc */ rseq_offset = *libc_rseq_offset_p; rseq_size = *libc_rseq_size_p; -- cgit v1.2.3 From 42e0576eec75479fa7709c41e5c3b9ec556b8f4d Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:10:39 +0530 Subject: selftests/powerpc/pmu: Add mask/shift bits for extracting threshold compare field In power10, the threshold compare field is not part of the raw event code and is instead provided via the event attribute config1. Hence add the mask and shift bits, based on event attribute config1, to extract the threshold compare value for power10. Also add a new function called get_thresh_cmp_val to compute and return the threshold compare field for a given platform, since in case of power10 the threshold compare value provided is decimal. Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-2-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/misc.c | 44 ++++++++++++++++++++++ .../selftests/powerpc/pmu/sampling_tests/misc.h | 1 + 2 files changed, 45 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c index c01a31d5f4ee..b984d1e162ac 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c @@ -60,6 +60,8 @@ static void init_ev_encodes(void) switch (pvr) { case POWER10: + ev_mask_thd_cmp = 0x3ffff; + ev_shift_thd_cmp = 0; ev_mask_rsq = 1; ev_shift_rsq = 9; ev_mask_comb = 3; @@ -410,3 +412,45 @@ u64 get_reg_value(u64 *intr_regs, char *register_name) return *(intr_regs + register_bit_position); } + +int get_thresh_cmp_val(struct event event) +{ + int exp = 0; + u64 result = 0; + u64 value; + + if (!have_hwcap2(PPC_FEATURE2_ARCH_3_1)) + return EV_CODE_EXTRACT(event.attr.config, thd_cmp); + + value = EV_CODE_EXTRACT(event.attr.config1, thd_cmp); + + if (!value) + return value; + + /* + * In case of P10, the thresh_cmp value is not part of the raw event code + * and is provided via the attr.config1 parameter. To program the threshold in MMCRA, + * take an 18 bit number N and shift right 2 places and increment + * the exponent E by 1 until the upper 10 bits of N are zero. + * Write E to the threshold exponent and write the lower 8 bits of N + * to the threshold mantissa. + * The max threshold that can be written is 261120. + */ + if (value > 261120) + value = 261120; + while ((64 - __builtin_clzl(value)) > 8) { + exp++; + value >>= 2; + } + + /* + * Note that it is invalid to write a mantissa with the + * upper 2 bits of mantissa being zero, unless the + * exponent is also zero.
+ */ + if (!(value & 0xC0) && exp) + result = -1; + else + result = (exp << 8) | value; + return result; +} diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h index 7675f3177725..078120883fde 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h @@ -52,6 +52,7 @@ void *__event_read_samples(void *sample_buff, size_t *size, u64 *sample_count); int collect_samples(void *sample_buff); u64 *get_intr_regs(struct event *event, void *sample_buff); u64 get_reg_value(u64 *intr_regs, char *register_name); +int get_thresh_cmp_val(struct event event); static inline int get_mmcr0_fc56(u64 mmcr0, int pmc) { -- cgit v1.2.3 From a069b5f980e3b65b64b6322b71d5819f90dbb42b Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 10 Jun 2022 19:10:40 +0530 Subject: selftests/powerpc: Add support to fetch "platform" and "base platform" from auxv to detect platform. The /proc/self/auxv contains information about the "platform" on any system. It also contains "base platform", which indicates the platform string corresponding to the real PVR. When systems are booted in compat mode, say, power10 booted in power9 mode, "platform" will point to power9 whereas base platform will point to power10. In case the distro doesn't support the platform indicated by the real PVR, base platform will have a default value. A mismatch of platform/base platform is an indication of a system booted in compat mode. In such cases, the distro will have a Generic Compat PMU registered which supports basic features for performance monitoring. Some of the selftests need to be handled differently (e.g. generic events, alternative events, bhrb filter map) on a Generic Compat PMU. Hence the selftest framework needs utility functions to identify such cases. One way is to make use of the auxv information. The below condition can be used to detect if a Generic Compat PMU is registered, i.e.: if ((AT_PLATFORM != AT_BASE_PLATFORM) && (AT_BASE_PLATFORM != PVR)) this indicates a Generic Compat PMU. Add utility functions in "include/utils.h" to return AT_PLATFORM and AT_BASE_PLATFORM from auxv. Also update misc.c in the "sampling_tests" folder to add a function that uses the above check to determine the presence of a generic compat pmu. On other architectures (like x86), pmu_name is exposed via "/sys/bus/event_source/devices/cpu/caps". The same could be used on powerpc in the future. Since we currently don't have "caps" support on powerpc, the patch uses auxv information to detect the platform type and compat mode. But as a placeholder, the utility function is added considering the possibility of getting "caps" information via sysfs; if that doesn't exist, it falls back to using the auxv information.
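For illustration only (not part of the patch): the same platform/base-platform comparison in a standalone program, assuming glibc's getauxval() and a kernel that populates both auxv entries (powerpc does):

    #include <stdio.h>
    #include <string.h>
    #include <sys/auxv.h>

    int main(void)
    {
            /* getauxval() returns the raw auxv value; for these two
             * entries it is a pointer to a NUL-terminated string. */
            const char *plat = (const char *)getauxval(AT_PLATFORM);
            const char *base = (const char *)getauxval(AT_BASE_PLATFORM);

            if (!plat || !base)
                    return 1; /* entry missing on this kernel/arch */

            printf("platform=%s base_platform=%s compat_mode=%s\n",
                   plat, base, strcmp(plat, base) ? "yes" : "no");
            return 0;
    }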
Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-3-atrajeev@linux.vnet.ibm.com --- tools/testing/selftests/powerpc/include/utils.h | 10 +++++ .../selftests/powerpc/pmu/sampling_tests/misc.c | 50 ++++++++++++++++++++++ .../selftests/powerpc/pmu/sampling_tests/misc.h | 3 ++ 3 files changed, 63 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h index b9fa9cd709df..e222a5858450 100644 --- a/tools/testing/selftests/powerpc/include/utils.h +++ b/tools/testing/selftests/powerpc/include/utils.h @@ -74,6 +74,16 @@ static inline bool have_hwcap2(unsigned long ftr2) } #endif +static inline char *auxv_base_platform(void) +{ + return ((char *)get_auxv_entry(AT_BASE_PLATFORM)); +} + +static inline char *auxv_platform(void) +{ + return ((char *)get_auxv_entry(AT_PLATFORM)); +} + bool is_ppc64le(void); int using_hash_mmu(bool *using_hash); diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c index b984d1e162ac..6e30b455cbd6 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c @@ -454,3 +454,53 @@ int get_thresh_cmp_val(struct event event) result = (exp << 8) | value; return result; } + +/* + * Utility function to check for generic compat PMU + * by comparing base_platform value from auxv and real + * PVR value. + */ +static bool auxv_generic_compat_pmu(void) +{ + int base_pvr = 0; + + if (!strcmp(auxv_base_platform(), "power9")) + base_pvr = POWER9; + else if (!strcmp(auxv_base_platform(), "power10")) + base_pvr = POWER10; + + return (!base_pvr); +} + +/* + * Check for generic compat PMU. + * First check for presence of pmu_name from + * "/sys/bus/event_source/devices/cpu/caps". + * If doesn't exist, fallback to using value + * auxv. + */ +bool check_for_generic_compat_pmu(void) +{ + char pmu_name[256]; + + memset(pmu_name, 0, sizeof(pmu_name)); + if (read_sysfs_file("bus/event_source/devices/cpu/caps/pmu_name", + pmu_name, sizeof(pmu_name)) < 0) + return auxv_generic_compat_pmu(); + + if (!strcmp(pmu_name, "ISAv3")) + return true; + else + return false; +} + +/* + * Check if system is booted in compat mode. + */ +bool check_for_compat_mode(void) +{ + char *platform = auxv_platform(); + char *base_platform = auxv_base_platform(); + + return strcmp(platform, base_platform); +} diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h index 078120883fde..c0e923f38793 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h @@ -5,6 +5,7 @@ * Copyright 2022, Kajol Jain, IBM Corp. 
*/ +#include #include "../event.h" #define POWER10 0x80 @@ -53,6 +54,8 @@ int collect_samples(void *sample_buff); u64 *get_intr_regs(struct event *event, void *sample_buff); u64 get_reg_value(u64 *intr_regs, char *register_name); int get_thresh_cmp_val(struct event event); +bool check_for_generic_compat_pmu(void); +bool check_for_compat_mode(void); static inline int get_mmcr0_fc56(u64 mmcr0, int pmc) { -- cgit v1.2.3 From 50d9c30a685c14e41e44d48a08a08703c680d861 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:10:41 +0530 Subject: selftests/powerpc/pmu: Add interface test for mmcra_thresh_cmp fields The testcase uses event code 0x35340401e0 for load only sampling, to verify the settings of thresh compare field in Monitor Mode Control Register A (MMCRA: 9-18 bits for power9 and MMCRA: 8-18 bits for power10). Testcase checks if the thresh compare field is programmed correctly via perf interface to MMCRA register. Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-4-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 2 +- .../pmu/sampling_tests/mmcra_thresh_cmp_test.c | 74 ++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_thresh_cmp_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index a785c6a173b9..6508e6074bac 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -4,7 +4,7 @@ CFLAGS += -m64 TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test \ mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ - mmcr3_src_test mmcra_thresh_marked_sample_test + mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_thresh_cmp_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_thresh_cmp_test.c new file mode 100644 index 000000000000..904362f172c9 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_thresh_cmp_test.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Kajol Jain, IBM Corp. 
+ */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +/* + * Primary PMU event used here is PM_MRK_INST_CMPL (0x401e0) + * Threshold event selection used is issue to complete for cycles + * Sampling criteria is Load only sampling + */ +#define p9_EventCode 0x13E35340401e0 +#define p10_EventCode 0x35340401e0 + +extern void thirty_two_instruction_loop_with_ll_sc(u64 loops, u64 *ll_sc_target); + +/* A perf sampling test to test mmcra fields */ +static int mmcra_thresh_cmp(void) +{ + struct event event; + u64 *intr_regs; + u64 dummy; + + /* Check for platform support for the test */ + SKIP_IF(check_pvr_for_sampling_tests()); + + /* Skip for compat mode */ + SKIP_IF(check_for_compat_mode()); + + /* Init the event for the sampling test */ + if (!have_hwcap2(PPC_FEATURE2_ARCH_3_1)) { + event_init_sampling(&event, p9_EventCode); + } else { + event_init_sampling(&event, p10_EventCode); + event.attr.config1 = 1000; + } + + event.attr.sample_regs_intr = platform_extended_mask; + FAIL_IF(event_open(&event)); + event.mmap_buffer = event_sample_buf_mmap(event.fd, 1); + + FAIL_IF(event_enable(&event)); + + /* workload to make the event overflow */ + thirty_two_instruction_loop_with_ll_sc(1000000, &dummy); + + FAIL_IF(event_disable(&event)); + + /* Check for sample count */ + FAIL_IF(!collect_samples(event.mmap_buffer)); + + intr_regs = get_intr_regs(&event, event.mmap_buffer); + + /* Check for intr_regs */ + FAIL_IF(!intr_regs); + + /* Verify that thresh cmp matches the corresponding event code fields */ + FAIL_IF(get_thresh_cmp_val(event) != + get_mmcra_thd_cmp(get_reg_value(intr_regs, "MMCRA"), 4)); + + event_close(&event); + return 0; +} + +int main(void) +{ + FAIL_IF(test_harness(mmcra_thresh_cmp, "mmcra_thresh_cmp")); +} -- cgit v1.2.3 From 61d89900315aa25f6da0c1bc800ce295d74d69f1 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:10:42 +0530 Subject: selftests/powerpc/pmu: Add support for branch sampling in get_intr_regs function Add support for sample type as PERF_SAMPLE_BRANCH_STACK in sampling tests. This change is a precursor/helper for sampling testcases that test the branch stack feature in the perf interface.
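Illustrative aside, not part of the patch: with PERF_SAMPLE_BRANCH_STACK the sample body begins with a branch count followed by that many three-u64 perf_branch_entry records, so the interrupt registers only start after them. A sketch of the pointer arithmetic, assuming p points at the first u64 of the sample body:

    /* Sample body with PERF_SAMPLE_BRANCH_STACK | PERF_SAMPLE_REGS_INTR:
     *   u64 nr;                          - number of branch records
     *   struct perf_branch_entry e[nr];  - three u64s each (from, to, flags)
     *   u64 regs[];                      - interrupt registers follow
     */
    u64 nr = *p;
    u64 *regs = p + 1 + nr * 3;  /* skip the count plus nr 3-u64 records */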
Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-5-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/misc.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c index 6e30b455cbd6..5a26fc3a9706 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c @@ -259,13 +259,32 @@ u64 *get_intr_regs(struct event *event, void *sample_buff) u64 *intr_regs; size_t size = 0; - if ((type ^ PERF_SAMPLE_REGS_INTR)) + if ((type ^ (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_BRANCH_STACK)) && + (type ^ PERF_SAMPLE_REGS_INTR)) return NULL; intr_regs = (u64 *)perf_read_first_sample(sample_buff, &size); if (!intr_regs) return NULL; + if (type & PERF_SAMPLE_BRANCH_STACK) { + /* + * PERF_RECORD_SAMPLE and PERF_SAMPLE_BRANCH_STACK: + * struct { + * struct perf_event_header hdr; + * u64 number_of_branches; + * struct perf_branch_entry[number_of_branches]; + * u64 data[]; + * }; + * struct perf_branch_entry { + * u64 from; + * u64 to; + * u64 misc; + * }; + */ + intr_regs += ((*intr_regs) * 3) + 1; + } + /* * First entry in the sample buffer used to specify * PERF_SAMPLE_REGS_ABI_64, skip perf regs abi to access -- cgit v1.2.3 From c55dabc6d577a864cd618107ea6aaa6cad8c987b Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:10:43 +0530 Subject: selftests/powerpc/pmu: Add interface test for mmcra_ifm field of indirect call type The testcase uses "instructions" event to check if the Instruction filtering mode(IFM) bits are programmed correctly for indirect branch type. Testcase checks if IFM bits are programmed correctly to Monitor Mode Control Register A (MMCRA) via perf interface for ISA v3.1 platform. Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-6-atrajeev@linux.vnet.ibm.com --- tools/testing/selftests/powerpc/pmu/branch_loops.S | 28 +++++++++ .../selftests/powerpc/pmu/sampling_tests/Makefile | 5 +- .../pmu/sampling_tests/mmcra_bhrb_ind_call_test.c | 69 ++++++++++++++++++++++ 3 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/powerpc/pmu/branch_loops.S create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_ind_call_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/branch_loops.S b/tools/testing/selftests/powerpc/pmu/branch_loops.S new file mode 100644 index 000000000000..de758dd3cecf --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/branch_loops.S @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2022, Kajol Jain, IBM Corp. 
+ */ + +#include + + .text + +#define ITER_SHIFT 31 + +FUNC_START(indirect_branch_loop) + li r3, 1 + sldi r3, r3, ITER_SHIFT + +1: cmpdi r3, 0 + beqlr + + addi r3, r3, -1 + + ld r4, 2f@got(%r2) + mtctr r4 + bctr + + .balign 32 +2: b 1b + +FUNC_END(indirect_branch_loop) diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 6508e6074bac..89def6e706c8 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -4,9 +4,10 @@ CFLAGS += -m64 TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test \ mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ - mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test + mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ + mmcra_bhrb_ind_call_test top_srcdir = ../../../../../.. include ../../../lib.mk -$(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c misc.c misc.h ../loop.S +$(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c misc.c misc.h ../loop.S ../branch_loops.S diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_ind_call_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_ind_call_test.c new file mode 100644 index 000000000000..f0706730c099 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_ind_call_test.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Kajol Jain, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +extern void indirect_branch_loop(void); + +/* Instructions */ +#define EventCode 0x500fa + +/* ifm field for indirect branch mode */ +#define IFM_IND_BRANCH 0x2 + +/* + * A perf sampling test for mmcra + * field: ifm for bhrb ind_call. + */ +static int mmcra_bhrb_ind_call_test(void) +{ + struct event event; + u64 *intr_regs; + + /* + * Check for platform support for the test. 
+ * This test is only aplicable on power10 + */ + SKIP_IF(check_pvr_for_sampling_tests()); + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1)); + + /* Init the event for the sampling test */ + event_init_sampling(&event, EventCode); + event.attr.sample_regs_intr = platform_extended_mask; + event.attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; + event.attr.branch_sample_type = PERF_SAMPLE_BRANCH_IND_CALL; + event.attr.exclude_kernel = 1; + + FAIL_IF(event_open(&event)); + event.mmap_buffer = event_sample_buf_mmap(event.fd, 1); + + FAIL_IF(event_enable(&event)); + + /* workload to make the event overflow */ + indirect_branch_loop(); + + FAIL_IF(event_disable(&event)); + + intr_regs = get_intr_regs(&event, event.mmap_buffer); + + /* Check for intr_regs */ + FAIL_IF(!intr_regs); + + /* Verify that ifm bit is set properly in MMCRA */ + FAIL_IF(get_mmcra_ifm(get_reg_value(intr_regs, "MMCRA"), 5) != IFM_IND_BRANCH); + + event_close(&event); + return 0; +} + +int main(void) +{ + return test_harness(mmcra_bhrb_ind_call_test, "mmcra_bhrb_ind_call_test"); +} -- cgit v1.2.3 From faa64ddc1e398131e7eaadc8f03cb7bd3904eff2 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:10:44 +0530 Subject: selftests/powerpc/pmu: Add interface test for mmcra_ifm field for any branch type The testcase uses "instructions" event to check if the Instruction filtering mode(IFM) bits are programmed correctly for type any branch. Testcase checks if IFM bits is programmed correctly to Monitor Mode Control Register A (MMCRA) via perf interface. Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-7-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 2 +- .../pmu/sampling_tests/mmcra_bhrb_any_test.c | 65 ++++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_any_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 89def6e706c8..63b084f66dbf 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -5,7 +5,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ - mmcra_bhrb_ind_call_test + mmcra_bhrb_ind_call_test mmcra_bhrb_any_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_any_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_any_test.c new file mode 100644 index 000000000000..14854694af62 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_any_test.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Kajol Jain, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +extern void thirty_two_instruction_loop(int loops); + +/* Instructions */ +#define EventCode 0x500fa + +/* ifm field for any branch mode */ +#define IFM_ANY_BRANCH 0x0 + +/* + * A perf sampling test for mmcra + * field: ifm for bhrb any call. 
+ */ +static int mmcra_bhrb_any_test(void) +{ + struct event event; + u64 *intr_regs; + + /* Check for platform support for the test */ + SKIP_IF(check_pvr_for_sampling_tests()); + + /* Init the event for the sampling test */ + event_init_sampling(&event, EventCode); + event.attr.sample_regs_intr = platform_extended_mask; + event.attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; + event.attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY; + event.attr.exclude_kernel = 1; + + FAIL_IF(event_open(&event)); + event.mmap_buffer = event_sample_buf_mmap(event.fd, 1); + + FAIL_IF(event_enable(&event)); + + /* workload to make the event overflow */ + thirty_two_instruction_loop(10000); + + FAIL_IF(event_disable(&event)); + + intr_regs = get_intr_regs(&event, event.mmap_buffer); + + /* Check for intr_regs */ + FAIL_IF(!intr_regs); + + /* Verify that ifm bit is set properly in MMCRA */ + FAIL_IF(get_mmcra_ifm(get_reg_value(intr_regs, "MMCRA"), 5) != IFM_ANY_BRANCH); + + event_close(&event); + return 0; +} + +int main(void) +{ + return test_harness(mmcra_bhrb_any_test, "mmcra_bhrb_any_test"); +} -- cgit v1.2.3 From 014fb4a3ae746276f4320f7010d03157485051cb Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:10:45 +0530 Subject: selftests/powerpc/pmu: Add interface test for mmcra_ifm field for conditional branch type The testcase uses the "instructions" event to check if the Instruction filtering mode (IFM) bits are programmed correctly for the conditional branch type. The testcase checks if the IFM bits are programmed correctly into Monitor Mode Control Register A (MMCRA) via the perf interface for an ISA v3.1 platform. Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-8-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 2 +- .../pmu/sampling_tests/mmcra_bhrb_cond_test.c | 69 ++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_cond_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 63b084f66dbf..53569fbb1cda 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -5,7 +5,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ - mmcra_bhrb_ind_call_test mmcra_bhrb_any_test + mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_cond_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_cond_test.c new file mode 100644 index 000000000000..3e08176eb7f8 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_cond_test.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Kajol Jain, IBM Corp.
+ */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +extern void thirty_two_instruction_loop(int loops); + +/* Instructions */ +#define EventCode 0x500fa + +/* ifm field for conditional branch mode */ +#define IFM_COND_BRANCH 0x3 + +/* + * A perf sampling test for mmcra + * field: ifm for bhrb cond call. + */ +static int mmcra_bhrb_cond_test(void) +{ + struct event event; + u64 *intr_regs; + + /* + * Check for platform support for the test. + * This test is only applicable on power10 + */ + SKIP_IF(check_pvr_for_sampling_tests()); + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1)); + + /* Init the event for the sampling test */ + event_init_sampling(&event, EventCode); + event.attr.sample_regs_intr = platform_extended_mask; + event.attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; + event.attr.branch_sample_type = PERF_SAMPLE_BRANCH_COND; + event.attr.exclude_kernel = 1; + + FAIL_IF(event_open(&event)); + event.mmap_buffer = event_sample_buf_mmap(event.fd, 1); + + FAIL_IF(event_enable(&event)); + + /* workload to make the event overflow */ + thirty_two_instruction_loop(10000); + + FAIL_IF(event_disable(&event)); + + intr_regs = get_intr_regs(&event, event.mmap_buffer); + + /* Check for intr_regs */ + FAIL_IF(!intr_regs); + + /* Verify that ifm bit is set properly in MMCRA */ + FAIL_IF(get_mmcra_ifm(get_reg_value(intr_regs, "MMCRA"), 5) != IFM_COND_BRANCH); + + event_close(&event); + return 0; +} + +int main(void) +{ + return test_harness(mmcra_bhrb_cond_test, "mmcra_bhrb_cond_test"); +} -- cgit v1.2.3 From 84cc4e66d90f6624f821df381073813dd502f657 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:10:46 +0530 Subject: selftests/powerpc/pmu: Add interface test for bhrb disable field The testcase uses the "instructions" event to generate samples and fetch Monitor Mode Control Register A (MMCRA) on overflow. The Branch History Rolling Buffer (BHRB) disable bit is part of MMCRA and needs to be verified via the perf interface. The testcase checks if the bhrb disable bit of the MMCRA register is programmed correctly via the perf interface for an ISA v3.1 platform. Also make the get_mmcra_bhrb_disable return type u64. Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-9-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 3 +- .../selftests/powerpc/pmu/sampling_tests/misc.h | 2 +- .../pmu/sampling_tests/mmcra_bhrb_disable_test.c | 66 ++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 53569fbb1cda..f4da49d55d57 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -5,7 +5,8 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ - mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test + mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test \ + mmcra_bhrb_disable_test top_srcdir = ../../../../../..
include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h index c0e923f38793..874a1596add8 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h @@ -188,7 +188,7 @@ static inline int get_mmcra_sm(u64 mmcra, int pmc) return ((mmcra >> 42) & 0x3); } -static inline int get_mmcra_bhrb_disable(u64 mmcra, int pmc) +static inline u64 get_mmcra_bhrb_disable(u64 mmcra, int pmc) { if (pvr == POWER10) return mmcra & BHRB_DISABLE; diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_test.c new file mode 100644 index 000000000000..186a853c0f62 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_test.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Kajol Jain, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +extern void thirty_two_instruction_loop(int loops); + +/* Instructions */ +#define EventCode 0x500fa + +/* + * A perf sampling test for mmcra + * field: bhrb_disable. + */ +static int mmcra_bhrb_disable_test(void) +{ + struct event event; + u64 *intr_regs; + + /* + * Check for platform support for the test. + * This test is only applicable on power10 + */ + SKIP_IF(check_pvr_for_sampling_tests()); + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1)); + + /* Init the event for the sampling test */ + event_init_sampling(&event, EventCode); + event.attr.sample_regs_intr = platform_extended_mask; + event.attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; + event.attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY; + event.attr.exclude_kernel = 1; + + FAIL_IF(event_open(&event)); + event.mmap_buffer = event_sample_buf_mmap(event.fd, 1); + + FAIL_IF(event_enable(&event)); + + /* workload to make the event overflow */ + thirty_two_instruction_loop(10000); + + FAIL_IF(event_disable(&event)); + + intr_regs = get_intr_regs(&event, event.mmap_buffer); + + /* Check for intr_regs */ + FAIL_IF(!intr_regs); + + /* Verify that the bhrb_disable bit is clear in MMCRA, since BHRB is in use */ + FAIL_IF(get_mmcra_bhrb_disable(get_reg_value(intr_regs, "MMCRA"), 5)); + + event_close(&event); + return 0; +} + +int main(void) +{ + return test_harness(mmcra_bhrb_disable_test, "mmcra_bhrb_disable_test"); +} -- cgit v1.2.3 From 94dfc73e7cf4a31da66b8843f0b9283ddd6b8381 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 6 Apr 2022 19:36:51 -0500 Subject: treewide: uapi: Replace zero-length arrays with flexible-array members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. This code was transformed with the help of Coccinelle: (linux-5.19-rc2$ spatch --jobs $(getconf _NPROCESSORS_ONLN) --sp-file script.cocci --include-headers --dir . > output.patch) @@ identifier S, member, array; type T1, T2; @@ struct S { ...
T1 member; T2 array[ - 0 ]; }; -fstrict-flex-arrays=3 is coming and we need to land these changes to prevent issues like these in the short future: ../fs/minix/dir.c:337:3: warning: 'strcpy' will always overflow; destination buffer has size 0, but the source string has length 2 (including NUL byte) [-Wfortify-source] strcpy(de3->name, "."); ^ Since these are all [0] to [] changes, the risk to UAPI is nearly zero. If this breaks anything, we can use a union with a new member name. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/78 Build-tested-by: kernel test robot Link: https://lore.kernel.org/lkml/62b675ec.wKX6AOZ6cbE71vtF%25lkp@intel.com/ Acked-by: Dan Williams # For ndctl.h Signed-off-by: Gustavo A. R. Silva --- arch/m68k/include/uapi/asm/bootinfo.h | 4 +- arch/mips/include/uapi/asm/ucontext.h | 2 +- arch/s390/include/uapi/asm/hwctrset.h | 6 +-- arch/x86/include/uapi/asm/bootparam.h | 2 +- arch/x86/include/uapi/asm/kvm.h | 12 ++--- include/uapi/drm/i915_drm.h | 6 +-- include/uapi/linux/blkzoned.h | 2 +- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/btrfs.h | 10 ++-- include/uapi/linux/btrfs_tree.h | 2 +- include/uapi/linux/can/bcm.h | 2 +- include/uapi/linux/connector.h | 2 +- include/uapi/linux/cycx_cfm.h | 2 +- include/uapi/linux/dm-ioctl.h | 8 +-- include/uapi/linux/dm-log-userspace.h | 2 +- include/uapi/linux/ethtool.h | 28 +++++----- include/uapi/linux/fanotify.h | 2 +- include/uapi/linux/fiemap.h | 2 +- include/uapi/linux/firewire-cdev.h | 12 ++--- include/uapi/linux/fs.h | 2 +- include/uapi/linux/if_alg.h | 2 +- include/uapi/linux/if_arcnet.h | 6 +-- include/uapi/linux/if_pppox.h | 4 +- include/uapi/linux/if_tun.h | 2 +- include/uapi/linux/igmp.h | 6 +-- include/uapi/linux/inet_diag.h | 2 +- include/uapi/linux/inotify.h | 2 +- include/uapi/linux/ip.h | 4 +- include/uapi/linux/ip_vs.h | 4 +- include/uapi/linux/iso_fs.h | 4 +- include/uapi/linux/jffs2.h | 8 +-- include/uapi/linux/kcov.h | 2 +- include/uapi/linux/kvm.h | 8 +-- include/uapi/linux/minix_fs.h | 4 +- include/uapi/linux/mmc/ioctl.h | 2 +- include/uapi/linux/ndctl.h | 10 ++-- include/uapi/linux/net_dropmon.h | 4 +- include/uapi/linux/netfilter/x_tables.h | 4 +- include/uapi/linux/netfilter_arp/arp_tables.h | 6 +-- include/uapi/linux/netfilter_bridge/ebt_among.h | 2 +- include/uapi/linux/netfilter_ipv4/ip_tables.h | 6 +-- include/uapi/linux/netfilter_ipv6/ip6_tables.h | 4 +- include/uapi/linux/perf_event.h | 2 +- include/uapi/linux/pkt_cls.h | 4 +- include/uapi/linux/raid/md_p.h | 2 +- include/uapi/linux/random.h | 2 +- include/uapi/linux/romfs_fs.h | 4 +- include/uapi/linux/rtnetlink.h | 2 +- include/uapi/linux/sctp.h | 10 ++-- include/uapi/linux/seg6.h | 2 +- include/uapi/linux/seg6_iptunnel.h | 2 +- include/uapi/linux/stm.h | 2 +- include/uapi/linux/target_core_user.h | 2 +- include/uapi/linux/usb/audio.h | 2 +- include/uapi/linux/usb/cdc.h | 6 +-- include/uapi/linux/usb/ch9.h | 2 +- include/uapi/linux/usb/raw_gadget.h | 4 +- include/uapi/linux/usbdevice_fs.h | 4 +- include/uapi/linux/vhost_types.h | 4 +- include/uapi/linux/virtio_9p.h | 2 +- include/uapi/linux/xfrm.h | 10 ++-- include/uapi/rdma/hfi/hfi1_user.h | 2 +- include/uapi/rdma/ib_user_verbs.h | 72 ++++++++++++------------- include/uapi/rdma/rdma_user_cm.h | 2 +- include/uapi/rdma/rdma_user_ioctl_cmds.h | 2 +- include/uapi/scsi/fc/fc_els.h | 18 +++---- include/uapi/scsi/scsi_bsg_fc.h | 2 +- 
include/uapi/sound/asound.h | 2 +- include/uapi/sound/firewire.h | 6 +-- include/uapi/sound/skl-tplg-interface.h | 2 +- include/uapi/sound/sof/header.h | 2 +- include/uapi/sound/usb_stream.h | 2 +- tools/arch/x86/include/uapi/asm/kvm.h | 12 ++--- tools/include/uapi/drm/i915_drm.h | 6 +-- tools/include/uapi/linux/fs.h | 2 +- tools/include/uapi/linux/if_tun.h | 2 +- tools/include/uapi/linux/kvm.h | 8 +-- tools/include/uapi/linux/perf_event.h | 2 +- tools/include/uapi/linux/pkt_cls.h | 4 +- tools/include/uapi/linux/seg6.h | 4 +- tools/include/uapi/linux/usbdevice_fs.h | 4 +- tools/include/uapi/sound/asound.h | 2 +- 82 files changed, 216 insertions(+), 216 deletions(-) (limited to 'tools') diff --git a/arch/m68k/include/uapi/asm/bootinfo.h b/arch/m68k/include/uapi/asm/bootinfo.h index 203d9cbf9630..95ecf3ae4c49 100644 --- a/arch/m68k/include/uapi/asm/bootinfo.h +++ b/arch/m68k/include/uapi/asm/bootinfo.h @@ -34,7 +34,7 @@ struct bi_record { __be16 tag; /* tag ID */ __be16 size; /* size of record (in bytes) */ - __be32 data[0]; /* data */ + __be32 data[]; /* data */ }; @@ -168,7 +168,7 @@ struct bootversion { struct { __be32 machtype; __be32 version; - } machversions[0]; + } machversions[]; } __packed; #endif /* __ASSEMBLY__ */ diff --git a/arch/mips/include/uapi/asm/ucontext.h b/arch/mips/include/uapi/asm/ucontext.h index 2d3bf8eebf1f..6122ef97c6ff 100644 --- a/arch/mips/include/uapi/asm/ucontext.h +++ b/arch/mips/include/uapi/asm/ucontext.h @@ -60,7 +60,7 @@ struct ucontext { sigset_t uc_sigmask; /* Extended context structures may follow ucontext */ - unsigned long long uc_extcontext[0]; + unsigned long long uc_extcontext[]; }; #endif /* __MIPS_UAPI_ASM_UCONTEXT_H */ diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h index 3d8284b95f87..e56b9dd23a4b 100644 --- a/arch/s390/include/uapi/asm/hwctrset.h +++ b/arch/s390/include/uapi/asm/hwctrset.h @@ -30,18 +30,18 @@ struct s390_ctrset_start { /* Set CPUs to operate on */ struct s390_ctrset_setdata { /* Counter set data */ __u32 set; /* Counter set number */ __u32 no_cnts; /* # of counters stored in cv[] */ - __u64 cv[0]; /* Counter values (variable length) */ + __u64 cv[]; /* Counter values (variable length) */ }; struct s390_ctrset_cpudata { /* Counter set data per CPU */ __u32 cpu_nr; /* CPU number */ __u32 no_sets; /* # of counters sets in data[] */ - struct s390_ctrset_setdata data[0]; + struct s390_ctrset_setdata data[]; }; struct s390_ctrset_read { /* Structure to get all ctr sets */ __u64 no_cpus; /* Total # of CPUs data taken from */ - struct s390_ctrset_cpudata data[0]; + struct s390_ctrset_cpudata data[]; }; #define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index bea5cdcdf532..cdd6c7f6cfa6 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -52,7 +52,7 @@ struct setup_data { __u64 next; __u32 type; __u32 len; - __u8 data[0]; + __u8 data[]; }; /* extensible setup indirect data node */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 21614807a2cb..ec53c9fa1da9 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -198,13 +198,13 @@ struct kvm_msrs { __u32 nmsrs; /* number of msrs in entries */ __u32 pad; - struct kvm_msr_entry entries[0]; + struct kvm_msr_entry entries[]; }; /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { __u32 nmsrs; /* number of msrs in entries */ - __u32 
indices[0]; + __u32 indices[]; }; /* Maximum size of any access bitmap in bytes */ @@ -241,7 +241,7 @@ struct kvm_cpuid_entry { struct kvm_cpuid { __u32 nent; __u32 padding; - struct kvm_cpuid_entry entries[0]; + struct kvm_cpuid_entry entries[]; }; struct kvm_cpuid_entry2 { @@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 { struct kvm_cpuid2 { __u32 nent; __u32 padding; - struct kvm_cpuid_entry2 entries[0]; + struct kvm_cpuid_entry2 entries[]; }; /* for KVM_GET_PIT and KVM_SET_PIT */ @@ -389,7 +389,7 @@ struct kvm_xsave { * the contents of CPUID leaf 0xD on the host. */ __u32 region[1024]; - __u32 extra[0]; + __u32 extra[]; }; #define KVM_MAX_XCRS 16 @@ -516,7 +516,7 @@ struct kvm_pmu_event_filter { __u32 fixed_counter_bitmap; __u32 flags; __u32 pad[4]; - __u64 events[0]; + __u64 events[]; }; #define KVM_PMU_EVENT_ALLOW 0 diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index a2def7b27009..b28ff5d88145 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2123,7 +2123,7 @@ struct i915_context_engines_load_balance { __u64 mbz64; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \ @@ -2161,7 +2161,7 @@ struct i915_context_engines_bond { __u64 flags; /* all undefined flags must be zero */ __u64 mbz64[4]; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \ @@ -2288,7 +2288,7 @@ struct i915_context_engines_parallel_submit { * length = width (i) * num_siblings (j) * index = j + i * num_siblings */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __packed; diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 656a326821a2..b80fcc9ea525 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -130,7 +130,7 @@ struct blk_zone_report { __u64 sector; __u32 nr_zones; __u32 flags; - struct blk_zone zones[0]; + struct blk_zone zones[]; }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4009dbdf62d..e4b33ba06f00 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -79,7 +79,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index d956b2993970..3d0edbe3b991 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -93,7 +93,7 @@ struct btrfs_qgroup_inherit { __u64 num_ref_copies; __u64 num_excl_copies; struct btrfs_qgroup_limit lim; - __u64 qgroups[0]; + __u64 qgroups[]; }; struct btrfs_ioctl_qgroup_limit_args { @@ -561,7 +561,7 @@ struct btrfs_ioctl_search_args_v2 { __u64 buf_size; /* in - size of buffer * out - on EOVERFLOW: needed size * to store item */ - __u64 buf[0]; /* out - found items */ + __u64 buf[]; /* out - found items */ }; struct btrfs_ioctl_clone_range_args { @@ -632,7 +632,7 @@ struct btrfs_ioctl_same_args { __u16 dest_count; /* in - total elements in info array */ __u16 reserved1; __u32 reserved2; - struct btrfs_ioctl_same_extent_info info[0]; + 
struct btrfs_ioctl_same_extent_info info[]; }; struct btrfs_ioctl_space_info { @@ -644,7 +644,7 @@ struct btrfs_ioctl_space_info { struct btrfs_ioctl_space_args { __u64 space_slots; __u64 total_spaces; - struct btrfs_ioctl_space_info spaces[0]; + struct btrfs_ioctl_space_info spaces[]; }; struct btrfs_data_container { @@ -652,7 +652,7 @@ struct btrfs_data_container { __u32 bytes_missing; /* out -- additional bytes needed for result */ __u32 elem_cnt; /* out */ __u32 elem_missed; /* out */ - __u64 val[0]; /* out */ + __u64 val[]; /* out */ }; struct btrfs_ioctl_ino_path_args { diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index d4117152d907..5f32a2a495dc 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -575,7 +575,7 @@ struct btrfs_inode_extref { __le64 parent_objectid; __le64 index; __le16 name_len; - __u8 name[0]; + __u8 name[]; /* name goes here */ } __attribute__ ((__packed__)); diff --git a/include/uapi/linux/can/bcm.h b/include/uapi/linux/can/bcm.h index dd2b925b09ac..f1e45f533a72 100644 --- a/include/uapi/linux/can/bcm.h +++ b/include/uapi/linux/can/bcm.h @@ -71,7 +71,7 @@ struct bcm_msg_head { struct bcm_timeval ival1, ival2; canid_t can_id; __u32 nframes; - struct can_frame frames[0]; + struct can_frame frames[]; }; enum { diff --git a/include/uapi/linux/connector.h b/include/uapi/linux/connector.h index 3738936149a2..5ae131c3f145 100644 --- a/include/uapi/linux/connector.h +++ b/include/uapi/linux/connector.h @@ -75,7 +75,7 @@ struct cn_msg { __u16 len; /* Length of the following data */ __u16 flags; - __u8 data[0]; + __u8 data[]; }; #endif /* _UAPI__CONNECTOR_H */ diff --git a/include/uapi/linux/cycx_cfm.h b/include/uapi/linux/cycx_cfm.h index 51f541942ff9..91778c8024b1 100644 --- a/include/uapi/linux/cycx_cfm.h +++ b/include/uapi/linux/cycx_cfm.h @@ -91,7 +91,7 @@ struct cycx_firmware { unsigned short reserved[6]; char descr[CFM_DESCR_LEN]; struct cycx_fw_info info; - unsigned char image[0]; + unsigned char image[]; }; struct cycx_fw_header { diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 2e9550fef90f..8c97d75f3104 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -182,7 +182,7 @@ struct dm_target_spec { struct dm_target_deps { __u32 count; /* Array size */ __u32 padding; /* unused */ - __u64 dev[0]; /* out */ + __u64 dev[]; /* out */ }; /* @@ -192,7 +192,7 @@ struct dm_name_list { __u64 dev; __u32 next; /* offset to the next record from the _start_ of this */ - char name[0]; + char name[]; /* * The following members can be accessed by taking a pointer that @@ -216,7 +216,7 @@ struct dm_target_versions { __u32 next; __u32 version[3]; - char name[0]; + char name[]; }; /* @@ -225,7 +225,7 @@ struct dm_target_versions { struct dm_target_msg { __u64 sector; /* Device sector */ - char message[0]; + char message[]; }; /* diff --git a/include/uapi/linux/dm-log-userspace.h b/include/uapi/linux/dm-log-userspace.h index 5c47a8603376..23dad9565e46 100644 --- a/include/uapi/linux/dm-log-userspace.h +++ b/include/uapi/linux/dm-log-userspace.h @@ -426,7 +426,7 @@ struct dm_ulog_request { __u32 request_type; /* DM_ULOG_* defined above */ __u32 data_size; /* How much data (not including this struct) */ - char data[0]; + char data[]; }; #endif /* __DM_LOG_USERSPACE_H__ */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index e0f0ee9bc89e..2d5741fd44bb 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -257,7 
+257,7 @@ struct ethtool_tunable { __u32 id; __u32 type_id; __u32 len; - void *data[0]; + void *data[]; }; #define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff @@ -322,7 +322,7 @@ struct ethtool_regs { __u32 cmd; __u32 version; __u32 len; - __u8 data[0]; + __u8 data[]; }; /** @@ -348,7 +348,7 @@ struct ethtool_eeprom { __u32 magic; __u32 offset; __u32 len; - __u8 data[0]; + __u8 data[]; }; /** @@ -752,7 +752,7 @@ struct ethtool_gstrings { __u32 cmd; __u32 string_set; __u32 len; - __u8 data[0]; + __u8 data[]; }; /** @@ -777,7 +777,7 @@ struct ethtool_sset_info { __u32 cmd; __u32 reserved; __u64 sset_mask; - __u32 data[0]; + __u32 data[]; }; /** @@ -817,7 +817,7 @@ struct ethtool_test { __u32 flags; __u32 reserved; __u32 len; - __u64 data[0]; + __u64 data[]; }; /** @@ -834,7 +834,7 @@ struct ethtool_test { struct ethtool_stats { __u32 cmd; __u32 n_stats; - __u64 data[0]; + __u64 data[]; }; /** @@ -851,7 +851,7 @@ struct ethtool_stats { struct ethtool_perm_addr { __u32 cmd; __u32 size; - __u8 data[0]; + __u8 data[]; }; /* boolean flags controlling per-interface behavior characteristics. @@ -1160,7 +1160,7 @@ struct ethtool_rxnfc { struct ethtool_rxfh_indir { __u32 cmd; __u32 size; - __u32 ring_index[0]; + __u32 ring_index[]; }; /** @@ -1201,7 +1201,7 @@ struct ethtool_rxfh { __u8 hfunc; __u8 rsvd8[3]; __u32 rsvd32; - __u32 rss_config[0]; + __u32 rss_config[]; }; #define ETH_RXFH_CONTEXT_ALLOC 0xffffffff #define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff @@ -1286,7 +1286,7 @@ struct ethtool_dump { __u32 version; __u32 flag; __u32 len; - __u8 data[0]; + __u8 data[]; }; #define ETH_FW_DUMP_DISABLE 0 @@ -1318,7 +1318,7 @@ struct ethtool_get_features_block { struct ethtool_gfeatures { __u32 cmd; __u32 size; - struct ethtool_get_features_block features[0]; + struct ethtool_get_features_block features[]; }; /** @@ -1340,7 +1340,7 @@ struct ethtool_set_features_block { struct ethtool_sfeatures { __u32 cmd; __u32 size; - struct ethtool_set_features_block features[0]; + struct ethtool_set_features_block features[]; }; /** @@ -2087,7 +2087,7 @@ struct ethtool_link_settings { __u8 master_slave_state; __u8 reserved1[1]; __u32 reserved[7]; - __u32 link_mode_masks[0]; + __u32 link_mode_masks[]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; * __u32 map_advertising[link_mode_masks_nwords]; diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index f1f89132d60e..197df344307d 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -162,7 +162,7 @@ struct fanotify_event_info_fid { * Following is an opaque struct file_handle that can be passed as * an argument to open_by_handle_at(2). 
*/ - unsigned char handle[0]; + unsigned char handle[]; }; /* diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 07c1cdcb715e..24ca0c00cae3 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -34,7 +34,7 @@ struct fiemap { __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ __u32 fm_extent_count; /* size of fm_extents array (in) */ __u32 fm_reserved; - struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ + struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ }; #define FIEMAP_MAX_OFFSET (~0ULL) diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 5effa9832802..92be3ea3c6e0 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -118,7 +118,7 @@ struct fw_cdev_event_response { __u32 type; __u32 rcode; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -142,7 +142,7 @@ struct fw_cdev_event_request { __u64 offset; __u32 handle; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -205,7 +205,7 @@ struct fw_cdev_event_request2 { __u32 generation; __u32 handle; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -265,7 +265,7 @@ struct fw_cdev_event_iso_interrupt { __u32 type; __u32 cycle; __u32 header_length; - __u32 header[0]; + __u32 header[]; }; /** @@ -355,7 +355,7 @@ struct fw_cdev_event_phy_packet { __u32 type; __u32 rcode; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -803,7 +803,7 @@ struct fw_cdev_set_iso_channels { */ struct fw_cdev_iso_packet { __u32 control; - __u32 header[0]; + __u32 header[]; }; /** diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index bdf7b404b3e7..b7b56871029c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -90,7 +90,7 @@ struct file_dedupe_range { __u16 dest_count; /* in - total elements in info array */ __u16 reserved1; /* must be zero */ __u32 reserved2; /* must be zero */ - struct file_dedupe_range_info info[0]; + struct file_dedupe_range_info info[]; }; /* And dynamically-tunable limits and defaults: */ diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index dc52a11ba6d1..578b18aab821 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -42,7 +42,7 @@ struct sockaddr_alg_new { struct af_alg_iv { __u32 ivlen; - __u8 iv[0]; + __u8 iv[]; }; /* Socket options */ diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index 683878036d76..b122cfac7128 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -60,7 +60,7 @@ struct arc_rfc1201 { __u8 proto; /* protocol ID field - varies */ __u8 split_flag; /* for use with split packets */ __be16 sequence; /* sequence number */ - __u8 payload[0]; /* space remaining in packet (504 bytes)*/ + __u8 payload[]; /* space remaining in packet (504 bytes)*/ }; #define RFC1201_HDR_SIZE 4 @@ -69,7 +69,7 @@ struct arc_rfc1201 { */ struct arc_rfc1051 { __u8 proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ - __u8 payload[0]; /* 507 bytes */ + __u8 payload[]; /* 507 bytes */ }; #define RFC1051_HDR_SIZE 1 @@ -80,7 +80,7 @@ struct arc_rfc1051 { struct arc_eth_encap { __u8 proto; /* Always ARC_P_ETHER */ struct ethhdr eth; /* standard ethernet header (yuck!) 
*/ - __u8 payload[0]; /* 493 bytes */ + __u8 payload[]; /* 493 bytes */ }; #define ETH_ENCAP_HDR_SIZE 14 diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index e7a693c28f16..9abd80dcc46f 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -122,7 +122,7 @@ struct sockaddr_pppol2tpv3in6 { struct pppoe_tag { __be16 tag_type; __be16 tag_len; - char tag_data[0]; + char tag_data[]; } __attribute__ ((packed)); /* Tag identifiers */ @@ -150,7 +150,7 @@ struct pppoe_hdr { __u8 code; __be16 sid; __be16 length; - struct pppoe_tag tag[0]; + struct pppoe_tag tag[]; } __packed; /* Length of entire PPPoE + PPP header */ diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 454ae31b93c7..2ec07de1d73b 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -108,7 +108,7 @@ struct tun_pi { struct tun_filter { __u16 flags; /* TUN_FLT_ flags see above */ __u16 count; /* Number of addresses */ - __u8 addr[0][ETH_ALEN]; + __u8 addr[][ETH_ALEN]; }; #endif /* _UAPI__IF_TUN_H */ diff --git a/include/uapi/linux/igmp.h b/include/uapi/linux/igmp.h index 90c28bc466c6..5930f2437cd1 100644 --- a/include/uapi/linux/igmp.h +++ b/include/uapi/linux/igmp.h @@ -48,7 +48,7 @@ struct igmpv3_grec { __u8 grec_auxwords; __be16 grec_nsrcs; __be32 grec_mca; - __be32 grec_src[0]; + __be32 grec_src[]; }; struct igmpv3_report { @@ -57,7 +57,7 @@ struct igmpv3_report { __sum16 csum; __be16 resv2; __be16 ngrec; - struct igmpv3_grec grec[0]; + struct igmpv3_grec grec[]; }; struct igmpv3_query { @@ -78,7 +78,7 @@ struct igmpv3_query { #endif __u8 qqic; __be16 nsrcs; - __be32 srcs[0]; + __be32 srcs[]; }; #define IGMP_HOST_MEMBERSHIP_QUERY 0x11 /* From RFC1112 */ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 20ee93f0f876..50655de04c9b 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -104,7 +104,7 @@ struct inet_diag_hostcond { __u8 family; __u8 prefix_len; int port; - __be32 addr[0]; + __be32 addr[]; }; struct inet_diag_markcond { diff --git a/include/uapi/linux/inotify.h b/include/uapi/linux/inotify.h index 884b4846b630..b3e165853d5b 100644 --- a/include/uapi/linux/inotify.h +++ b/include/uapi/linux/inotify.h @@ -23,7 +23,7 @@ struct inotify_event { __u32 mask; /* watch mask */ __u32 cookie; /* cookie to synchronize two events */ __u32 len; /* length (including nulls) of name */ - char name[0]; /* stub for possible name */ + char name[]; /* stub for possible name */ }; /* the following are legal, implemented events that user-space can watch for */ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index e00bbb9c47bb..961ec16a26b8 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -112,13 +112,13 @@ struct ip_auth_hdr { __be16 reserved; __be32 spi; __be32 seq_no; /* Sequence number */ - __u8 auth_data[0]; /* Variable len but >=4. Mind the 64 bit alignment! */ + __u8 auth_data[]; /* Variable len but >=4. Mind the 64 bit alignment! */ }; struct ip_esp_hdr { __be32 spi; __be32 seq_no; /* Sequence number */ - __u8 enc_data[0]; /* Variable len but >=8. Mind the 64 bit alignment! */ + __u8 enc_data[]; /* Variable len but >=8. Mind the 64 bit alignment! 
*/ }; struct ip_comp_hdr { diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 4102ddcb4e14..1ed234e7f251 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -254,7 +254,7 @@ struct ip_vs_get_dests { unsigned int num_dests; /* the real servers */ - struct ip_vs_dest_entry entrytable[0]; + struct ip_vs_dest_entry entrytable[]; }; @@ -264,7 +264,7 @@ struct ip_vs_get_services { unsigned int num_services; /* service table */ - struct ip_vs_service_entry entrytable[0]; + struct ip_vs_service_entry entrytable[]; }; diff --git a/include/uapi/linux/iso_fs.h b/include/uapi/linux/iso_fs.h index a2555176f6d1..758178f5b52d 100644 --- a/include/uapi/linux/iso_fs.h +++ b/include/uapi/linux/iso_fs.h @@ -137,7 +137,7 @@ struct iso_path_table{ __u8 name_len[2]; /* 721 */ __u8 extent[4]; /* 731 */ __u8 parent[2]; /* 721 */ - char name[0]; + char name[]; } __attribute__((packed)); /* high sierra is identical to iso, except that the date is only 6 bytes, and @@ -154,7 +154,7 @@ struct iso_directory_record { __u8 interleave [ISODCL (28, 28)]; /* 711 */ __u8 volume_sequence_number [ISODCL (29, 32)]; /* 723 */ __u8 name_len [ISODCL (33, 33)]; /* 711 */ - char name [0]; + char name []; } __attribute__((packed)); #define ISOFS_BLOCK_BITS 11 diff --git a/include/uapi/linux/jffs2.h b/include/uapi/linux/jffs2.h index 784ba0b9690a..637ee4a793cf 100644 --- a/include/uapi/linux/jffs2.h +++ b/include/uapi/linux/jffs2.h @@ -123,7 +123,7 @@ struct jffs2_raw_dirent __u8 unused[2]; jint32_t node_crc; jint32_t name_crc; - __u8 name[0]; + __u8 name[]; }; /* The JFFS2 raw inode structure: Used for storage on physical media. */ @@ -155,7 +155,7 @@ struct jffs2_raw_inode jint16_t flags; /* See JFFS2_INO_FLAG_* */ jint32_t data_crc; /* CRC for the (compressed) data. 
*/ jint32_t node_crc; /* CRC for the raw inode (excluding data) */ - __u8 data[0]; + __u8 data[]; }; struct jffs2_raw_xattr { @@ -170,7 +170,7 @@ struct jffs2_raw_xattr { jint16_t value_len; jint32_t data_crc; jint32_t node_crc; - __u8 data[0]; + __u8 data[]; } __attribute__((packed)); struct jffs2_raw_xref @@ -196,7 +196,7 @@ struct jffs2_raw_summary jint32_t padded; /* sum of the size of padding nodes */ jint32_t sum_crc; /* summary information crc */ jint32_t node_crc; /* node crc */ - jint32_t sum[0]; /* inode summary info */ + jint32_t sum[]; /* inode summary info */ }; union jffs2_node_union diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 1d0350e44ae3..ed95dba9fa37 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -13,7 +13,7 @@ struct kcov_remote_arg { __u32 area_size; /* Length of coverage buffer in words */ __u32 num_handles; /* Size of handles array */ __aligned_u64 common_handle; - __aligned_u64 handles[0]; + __aligned_u64 handles[]; }; #define KCOV_REMOTE_MAX_HANDLES 0x100 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..74dc8bafcb9e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -542,7 +542,7 @@ struct kvm_coalesced_mmio { struct kvm_coalesced_mmio_ring { __u32 first, last; - struct kvm_coalesced_mmio coalesced_mmio[0]; + struct kvm_coalesced_mmio coalesced_mmio[]; }; #define KVM_COALESCED_MMIO_MAX \ @@ -621,7 +621,7 @@ struct kvm_clear_dirty_log { /* for KVM_SET_SIGNAL_MASK */ struct kvm_signal_mask { __u32 len; - __u8 sigset[0]; + __u8 sigset[]; }; /* for KVM_TPR_ACCESS_REPORTING */ @@ -1221,7 +1221,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing { __u32 nr; __u32 flags; - struct kvm_irq_routing_entry entries[0]; + struct kvm_irq_routing_entry entries[]; }; #endif @@ -1341,7 +1341,7 @@ struct kvm_dirty_tlb { struct kvm_reg_list { __u64 n; /* number of regs */ - __u64 reg[0]; + __u64 reg[]; }; struct kvm_one_reg { diff --git a/include/uapi/linux/minix_fs.h b/include/uapi/linux/minix_fs.h index 95dbcb17eacd..8d9ca8b2c357 100644 --- a/include/uapi/linux/minix_fs.h +++ b/include/uapi/linux/minix_fs.h @@ -97,11 +97,11 @@ struct minix3_super_block { struct minix_dir_entry { __u16 inode; - char name[0]; + char name[]; }; struct minix3_dir_entry { __u32 inode; - char name[0]; + char name[]; }; #endif diff --git a/include/uapi/linux/mmc/ioctl.h b/include/uapi/linux/mmc/ioctl.h index 27a39847d55c..e7401ade6822 100644 --- a/include/uapi/linux/mmc/ioctl.h +++ b/include/uapi/linux/mmc/ioctl.h @@ -58,7 +58,7 @@ struct mmc_ioc_cmd { */ struct mmc_ioc_multi_cmd { __u64 num_of_cmds; - struct mmc_ioc_cmd cmds[0]; + struct mmc_ioc_cmd cmds[]; }; #define MMC_IOC_CMD _IOWR(MMC_BLOCK_MAJOR, 0, struct mmc_ioc_cmd) diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 17e02b64ea2e..73516e263627 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -30,25 +30,25 @@ struct nd_cmd_get_config_data_hdr { __u32 in_offset; __u32 in_length; __u32 status; - __u8 out_buf[0]; + __u8 out_buf[]; } __packed; struct nd_cmd_set_config_hdr { __u32 in_offset; __u32 in_length; - __u8 in_buf[0]; + __u8 in_buf[]; } __packed; struct nd_cmd_vendor_hdr { __u32 opcode; __u32 in_length; - __u8 in_buf[0]; + __u8 in_buf[]; } __packed; struct nd_cmd_vendor_tail { __u32 status; __u32 out_length; - __u8 out_buf[0]; + __u8 out_buf[]; } __packed; struct nd_cmd_ars_cap { @@ -86,7 +86,7 @@ struct nd_cmd_ars_status { __u32 reserved; __u64 err_address; __u64 length; - } 
__packed records[0]; + } __packed records[]; } __packed; struct nd_cmd_clear_error { diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 1bbea8f0681e..84f622a66a7a 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -29,12 +29,12 @@ struct net_dm_config_entry { struct net_dm_config_msg { __u32 entries; - struct net_dm_config_entry options[0]; + struct net_dm_config_entry options[]; }; struct net_dm_alert_msg { __u32 entries; - struct net_dm_drop_point points[0]; + struct net_dm_drop_point points[]; }; struct net_dm_user_msg { diff --git a/include/uapi/linux/netfilter/x_tables.h b/include/uapi/linux/netfilter/x_tables.h index b8c6bb233ac1..796af83a963a 100644 --- a/include/uapi/linux/netfilter/x_tables.h +++ b/include/uapi/linux/netfilter/x_tables.h @@ -28,7 +28,7 @@ struct xt_entry_match { __u16 match_size; } u; - unsigned char data[0]; + unsigned char data[]; }; struct xt_entry_target { @@ -119,7 +119,7 @@ struct xt_counters_info { unsigned int num_counters; /* The counters (actually `number' of these). */ - struct xt_counters counters[0]; + struct xt_counters counters[]; }; #define XT_INV_PROTO 0x40 /* Invert the sense of PROTO. */ diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h index bbf5af2b67a8..a6ac2463f787 100644 --- a/include/uapi/linux/netfilter_arp/arp_tables.h +++ b/include/uapi/linux/netfilter_arp/arp_tables.h @@ -109,7 +109,7 @@ struct arpt_entry struct xt_counters counters; /* The matches (if any), then the target. */ - unsigned char elems[0]; + unsigned char elems[]; }; /* @@ -181,7 +181,7 @@ struct arpt_replace { struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ - struct arpt_entry entries[0]; + struct arpt_entry entries[]; }; /* The argument to ARPT_SO_GET_ENTRIES. */ @@ -193,7 +193,7 @@ struct arpt_get_entries { unsigned int size; /* The entries. */ - struct arpt_entry entrytable[0]; + struct arpt_entry entrytable[]; }; /* Helper functions */ diff --git a/include/uapi/linux/netfilter_bridge/ebt_among.h b/include/uapi/linux/netfilter_bridge/ebt_among.h index 9acf757bc1f7..73b26a280c4f 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_among.h +++ b/include/uapi/linux/netfilter_bridge/ebt_among.h @@ -40,7 +40,7 @@ struct ebt_mac_wormhash_tuple { struct ebt_mac_wormhash { int table[257]; int poolsize; - struct ebt_mac_wormhash_tuple pool[0]; + struct ebt_mac_wormhash_tuple pool[]; }; #define ebt_mac_wormhash_size(x) ((x) ? sizeof(struct ebt_mac_wormhash) \ diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index 50c7fee625ae..1485df28b239 100644 --- a/include/uapi/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h @@ -121,7 +121,7 @@ struct ipt_entry { struct xt_counters counters; /* The matches (if any), then the target. */ - unsigned char elems[0]; + unsigned char elems[]; }; /* @@ -203,7 +203,7 @@ struct ipt_replace { struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ - struct ipt_entry entries[0]; + struct ipt_entry entries[]; }; /* The argument to IPT_SO_GET_ENTRIES. */ @@ -215,7 +215,7 @@ struct ipt_get_entries { unsigned int size; /* The entries. 
*/ - struct ipt_entry entrytable[0]; + struct ipt_entry entrytable[]; }; /* Helper functions */ diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index d9e364f96a5c..766e8e0bcc68 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h @@ -243,7 +243,7 @@ struct ip6t_replace { struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ - struct ip6t_entry entries[0]; + struct ip6t_entry entries[]; }; /* The argument to IP6T_SO_GET_ENTRIES. */ @@ -255,7 +255,7 @@ struct ip6t_get_entries { unsigned int size; /* The entries. */ - struct ip6t_entry entrytable[0]; + struct ip6t_entry entrytable[]; }; /* Helper functions */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d37629dbad72..4653834f078f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -491,7 +491,7 @@ struct perf_event_query_bpf { /* * User provided buffer to store program ids */ - __u32 ids[0]; + __u32 ids[]; }; /* diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9a2ee1e39fad..ffbe230ef90b 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -256,7 +256,7 @@ struct tc_u32_sel { short hoff; __be32 hmask; - struct tc_u32_key keys[0]; + struct tc_u32_key keys[]; }; struct tc_u32_mark { @@ -268,7 +268,7 @@ struct tc_u32_mark { struct tc_u32_pcnt { __u64 rcnt; __u64 rhit; - __u64 kcnts[0]; + __u64 kcnts[]; }; /* Flags */ diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index e5a98a16f9b0..6c0aa577730f 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -303,7 +303,7 @@ struct mdp_superblock_1 { * into the 'roles' value. If a device is spare or faulty, then it doesn't * have a meaningful role. 
*/ - __le16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ + __le16 dev_roles[]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ }; /* feature_map bits */ diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h index dcc1b3e6106f..e744c23582eb 100644 --- a/include/uapi/linux/random.h +++ b/include/uapi/linux/random.h @@ -41,7 +41,7 @@ struct rand_pool_info { int entropy_count; int buf_size; - __u32 buf[0]; + __u32 buf[]; }; /* diff --git a/include/uapi/linux/romfs_fs.h b/include/uapi/linux/romfs_fs.h index a7f1585accef..6aa05e792454 100644 --- a/include/uapi/linux/romfs_fs.h +++ b/include/uapi/linux/romfs_fs.h @@ -27,7 +27,7 @@ struct romfs_super_block { __be32 word1; __be32 size; __be32 checksum; - char name[0]; /* volume name */ + char name[]; /* volume name */ }; /* On disk inode */ @@ -37,7 +37,7 @@ struct romfs_inode { __be32 spec; __be32 size; __be32 checksum; - char name[0]; + char name[]; }; #define ROMFH_TYPE 7 diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 83849a37db5b..eb2747d58a81 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -440,7 +440,7 @@ struct rtnexthop { /* RTA_VIA */ struct rtvia { __kernel_sa_family_t rtvia_family; - __u8 rtvia_addr[0]; + __u8 rtvia_addr[]; }; /* RTM_CACHEINFO */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index c4ff1ebd8bcc..ed7d4ecbf53d 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -365,7 +365,7 @@ struct sctp_assoc_change { __u16 sac_outbound_streams; __u16 sac_inbound_streams; sctp_assoc_t sac_assoc_id; - __u8 sac_info[0]; + __u8 sac_info[]; }; /* @@ -436,7 +436,7 @@ struct sctp_remote_error { __u32 sre_length; __be16 sre_error; sctp_assoc_t sre_assoc_id; - __u8 sre_data[0]; + __u8 sre_data[]; }; @@ -453,7 +453,7 @@ struct sctp_send_failed { __u32 ssf_error; struct sctp_sndrcvinfo ssf_info; sctp_assoc_t ssf_assoc_id; - __u8 ssf_data[0]; + __u8 ssf_data[]; }; struct sctp_send_failed_event { @@ -463,7 +463,7 @@ struct sctp_send_failed_event { __u32 ssf_error; struct sctp_sndinfo ssfe_info; sctp_assoc_t ssf_assoc_id; - __u8 ssf_data[0]; + __u8 ssf_data[]; }; /* @@ -1029,7 +1029,7 @@ struct sctp_getaddrs_old { struct sctp_getaddrs { sctp_assoc_t assoc_id; /*input*/ __u32 addr_num; /*output*/ - __u8 addrs[0]; /*output, variable size*/ + __u8 addrs[]; /*output, variable size*/ }; /* A socket user request obtained via SCTP_GET_ASSOC_STATS that retrieves diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h index 286e8d6a8e98..13bcbc8bba32 100644 --- a/include/uapi/linux/seg6.h +++ b/include/uapi/linux/seg6.h @@ -30,7 +30,7 @@ struct ipv6_sr_hdr { __u8 flags; __u16 tag; - struct in6_addr segments[0]; + struct in6_addr segments[]; }; #define SR6_FLAG1_PROTECTED (1 << 6) diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index eb815e0d0ac3..a74294211290 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -26,7 +26,7 @@ enum { struct seg6_iptunnel_encap { int mode; - struct ipv6_sr_hdr srh[0]; + struct ipv6_sr_hdr srh[]; }; #define SEG6_IPTUN_ENCAP_SIZE(x) ((sizeof(*x)) + (((x)->srh->hdrlen + 1) << 3)) diff --git a/include/uapi/linux/stm.h b/include/uapi/linux/stm.h index 7bac318b4440..de3579c2cff0 100644 --- a/include/uapi/linux/stm.h +++ b/include/uapi/linux/stm.h @@ -36,7 +36,7 @@ struct stp_policy_id { /* padding */ __u16 __reserved_0; __u32 __reserved_1; - char id[0]; + char id[]; }; 
#define STP_POLICY_ID_SET _IOWR('%', 0, struct stp_policy_id) diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 27ace512babd..fbd8ca67e107 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -152,7 +152,7 @@ struct tcmu_tmr_entry { __u32 cmd_cnt; __u64 __pad3; __u64 __pad4; - __u16 cmd_ids[0]; + __u16 cmd_ids[]; } __packed; #define TCMU_OP_ALIGN_SIZE sizeof(__u64) diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h index 76b7c3f6cd0d..c917c53070d5 100644 --- a/include/uapi/linux/usb/audio.h +++ b/include/uapi/linux/usb/audio.h @@ -341,7 +341,7 @@ struct uac_feature_unit_descriptor { __u8 bUnitID; __u8 bSourceID; __u8 bControlSize; - __u8 bmaControls[0]; /* variable length */ + __u8 bmaControls[]; /* variable length */ } __attribute__((packed)); static inline __u8 uac_feature_unit_iFeature(struct uac_feature_unit_descriptor *desc) diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h index 6d61550959ef..acf3852bb676 100644 --- a/include/uapi/linux/usb/cdc.h +++ b/include/uapi/linux/usb/cdc.h @@ -171,7 +171,7 @@ struct usb_cdc_mdlm_detail_desc { /* type is associated with mdlm_desc.bGUID */ __u8 bGuidDescriptorType; - __u8 bDetailData[0]; + __u8 bDetailData[]; } __attribute__ ((packed)); /* "OBEX Control Model Functional Descriptor" */ @@ -379,7 +379,7 @@ struct usb_cdc_ncm_ndp16 { __le32 dwSignature; __le16 wLength; __le16 wNextNdpIndex; - struct usb_cdc_ncm_dpe16 dpe16[0]; + struct usb_cdc_ncm_dpe16 dpe16[]; } __attribute__ ((packed)); /* 32-bit NCM Datagram Pointer Entry */ @@ -395,7 +395,7 @@ struct usb_cdc_ncm_ndp32 { __le16 wReserved6; __le32 dwNextNdpIndex; __le32 dwReserved12; - struct usb_cdc_ncm_dpe32 dpe32[0]; + struct usb_cdc_ncm_dpe32 dpe32[]; } __attribute__ ((packed)); /* CDC NCM subclass 3.2.1 and 3.2.2 */ diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 17ce56198c9a..31fcfa084e63 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -818,7 +818,7 @@ struct usb_key_descriptor { __u8 tTKID[3]; __u8 bReserved; - __u8 bKeyData[0]; + __u8 bKeyData[]; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ diff --git a/include/uapi/linux/usb/raw_gadget.h b/include/uapi/linux/usb/raw_gadget.h index 0be685272eb1..c7d2199134d7 100644 --- a/include/uapi/linux/usb/raw_gadget.h +++ b/include/uapi/linux/usb/raw_gadget.h @@ -60,7 +60,7 @@ enum usb_raw_event_type { struct usb_raw_event { __u32 type; __u32 length; - __u8 data[0]; + __u8 data[]; }; #define USB_RAW_IO_FLAGS_ZERO 0x0001 @@ -90,7 +90,7 @@ struct usb_raw_ep_io { __u16 ep; __u16 flags; __u32 length; - __u8 data[0]; + __u8 data[]; }; /* Maximum number of non-control endpoints in struct usb_raw_eps_info. */ diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h index cf525cddeb94..74a84e02422a 100644 --- a/include/uapi/linux/usbdevice_fs.h +++ b/include/uapi/linux/usbdevice_fs.h @@ -131,7 +131,7 @@ struct usbdevfs_urb { unsigned int signr; /* signal to be sent on completion, or 0 if none should be sent. 
*/ void __user *usercontext; - struct usbdevfs_iso_packet_desc iso_frame_desc[0]; + struct usbdevfs_iso_packet_desc iso_frame_desc[]; }; /* ioctls for talking directly to drivers */ @@ -176,7 +176,7 @@ struct usbdevfs_disconnect_claim { struct usbdevfs_streams { unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */ unsigned int num_eps; - unsigned char eps[0]; + unsigned char eps[]; }; /* diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 634cee485abb..391331a10879 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -107,7 +107,7 @@ struct vhost_memory_region { struct vhost_memory { __u32 nregions; __u32 padding; - struct vhost_memory_region regions[0]; + struct vhost_memory_region regions[]; }; /* VHOST_SCSI specific definitions */ @@ -135,7 +135,7 @@ struct vhost_scsi_target { struct vhost_vdpa_config { __u32 off; __u32 len; - __u8 buf[0]; + __u8 buf[]; }; /* vhost vdpa IOVA range diff --git a/include/uapi/linux/virtio_9p.h b/include/uapi/linux/virtio_9p.h index 441047432258..374b68f8ac6e 100644 --- a/include/uapi/linux/virtio_9p.h +++ b/include/uapi/linux/virtio_9p.h @@ -38,7 +38,7 @@ struct virtio_9p_config { /* length of the tag name */ __virtio16 tag_len; /* non-NULL terminated tag name */ - __u8 tag[0]; + __u8 tag[]; } __attribute__((packed)); #endif /* _LINUX_VIRTIO_9P_H */ diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 65e13a099b1a..e8191e0c3b56 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -33,7 +33,7 @@ struct xfrm_sec_ctx { __u8 ctx_alg; __u16 ctx_len; __u32 ctx_sid; - char ctx_str[0]; + char ctx_str[]; }; /* Security Context Domains of Interpretation */ @@ -96,27 +96,27 @@ struct xfrm_replay_state_esn { __u32 oseq_hi; __u32 seq_hi; __u32 replay_window; - __u32 bmp[0]; + __u32 bmp[]; }; struct xfrm_algo { char alg_name[64]; unsigned int alg_key_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_algo_auth { char alg_name[64]; unsigned int alg_key_len; /* in bits */ unsigned int alg_trunc_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_algo_aead { char alg_name[64]; unsigned int alg_key_len; /* in bits */ unsigned int alg_icv_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_stats { diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index d95ef9a2b032..1106a7c90b29 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -180,7 +180,7 @@ struct hfi1_sdma_comp_entry { struct hfi1_status { __aligned_u64 dev; /* device/hw status bits */ __aligned_u64 port; /* port state and status bits */ - char freezemsg[0]; + char freezemsg[]; }; enum sdma_req_opcode { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 7dd903d932e5..43672cb1fd57 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -158,18 +158,18 @@ struct ib_uverbs_ex_cmd_hdr { struct ib_uverbs_get_context { __aligned_u64 response; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_get_context_resp { __u32 async_fd; __u32 num_comp_vectors; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_device { __aligned_u64 response; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_device_resp { @@ -278,7 +278,7 @@ struct ib_uverbs_query_port { __aligned_u64 response; __u8 port_num; __u8 
reserved[7]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_port_resp { @@ -308,12 +308,12 @@ struct ib_uverbs_query_port_resp { struct ib_uverbs_alloc_pd { __aligned_u64 response; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_alloc_pd_resp { __u32 pd_handle; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_dealloc_pd { @@ -324,12 +324,12 @@ struct ib_uverbs_open_xrcd { __aligned_u64 response; __u32 fd; __u32 oflags; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_open_xrcd_resp { __u32 xrcd_handle; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_close_xrcd { @@ -343,14 +343,14 @@ struct ib_uverbs_reg_mr { __aligned_u64 hca_va; __u32 pd_handle; __u32 access_flags; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_reg_mr_resp { __u32 mr_handle; __u32 lkey; __u32 rkey; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_rereg_mr { @@ -362,13 +362,13 @@ struct ib_uverbs_rereg_mr { __aligned_u64 hca_va; __u32 pd_handle; __u32 access_flags; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_rereg_mr_resp { __u32 lkey; __u32 rkey; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_dereg_mr { @@ -380,13 +380,13 @@ struct ib_uverbs_alloc_mw { __u32 pd_handle; __u8 mw_type; __u8 reserved[3]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_alloc_mw_resp { __u32 mw_handle; __u32 rkey; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_dealloc_mw { @@ -408,7 +408,7 @@ struct ib_uverbs_create_cq { __u32 comp_vector; __s32 comp_channel; __u32 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; enum ib_uverbs_ex_create_cq_flags { @@ -442,13 +442,13 @@ struct ib_uverbs_resize_cq { __aligned_u64 response; __u32 cq_handle; __u32 cqe; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_resize_cq_resp { __u32 cqe; __u32 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_poll_cq { @@ -492,7 +492,7 @@ struct ib_uverbs_wc { struct ib_uverbs_poll_cq_resp { __u32 count; __u32 reserved; - struct ib_uverbs_wc wc[0]; + struct ib_uverbs_wc wc[]; }; struct ib_uverbs_req_notify_cq { @@ -585,7 +585,7 @@ struct ib_uverbs_create_qp { __u8 qp_type; __u8 is_srq; __u8 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; enum ib_uverbs_create_qp_mask { @@ -624,7 +624,7 @@ struct ib_uverbs_open_qp { __u32 qpn; __u8 qp_type; __u8 reserved[7]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; /* also used for open response */ @@ -669,7 +669,7 @@ struct ib_uverbs_query_qp { __aligned_u64 response; __u32 qp_handle; __u32 attr_mask; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_qp_resp { @@ -703,7 +703,7 @@ struct ib_uverbs_query_qp_resp { __u8 alt_timeout; __u8 sq_sig_all; __u8 reserved[5]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_modify_qp { @@ -824,7 +824,7 @@ struct ib_uverbs_post_send { __u32 wr_count; __u32 sge_count; __u32 wqe_size; - struct ib_uverbs_send_wr send_wr[0]; + struct ib_uverbs_send_wr send_wr[]; }; struct ib_uverbs_post_send_resp { @@ -843,7 +843,7 @@ struct ib_uverbs_post_recv { __u32 wr_count; __u32 sge_count; __u32 wqe_size; - struct ib_uverbs_recv_wr recv_wr[0]; + 
struct ib_uverbs_recv_wr recv_wr[]; }; struct ib_uverbs_post_recv_resp { @@ -856,7 +856,7 @@ struct ib_uverbs_post_srq_recv { __u32 wr_count; __u32 sge_count; __u32 wqe_size; - struct ib_uverbs_recv_wr recv[0]; + struct ib_uverbs_recv_wr recv[]; }; struct ib_uverbs_post_srq_recv_resp { @@ -869,12 +869,12 @@ struct ib_uverbs_create_ah { __u32 pd_handle; __u32 reserved; struct ib_uverbs_ah_attr attr; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_create_ah_resp { __u32 ah_handle; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_destroy_ah { @@ -886,7 +886,7 @@ struct ib_uverbs_attach_mcast { __u32 qp_handle; __u16 mlid; __u16 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_detach_mcast { @@ -894,7 +894,7 @@ struct ib_uverbs_detach_mcast { __u32 qp_handle; __u16 mlid; __u16 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_flow_spec_hdr { @@ -1135,7 +1135,7 @@ struct ib_uverbs_flow_attr { * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ - struct ib_uverbs_flow_spec_hdr flow_specs[0]; + struct ib_uverbs_flow_spec_hdr flow_specs[]; }; struct ib_uverbs_create_flow { @@ -1161,7 +1161,7 @@ struct ib_uverbs_create_srq { __u32 max_wr; __u32 max_sge; __u32 srq_limit; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_create_xsrq { @@ -1175,7 +1175,7 @@ struct ib_uverbs_create_xsrq { __u32 max_num_tags; __u32 xrcd_handle; __u32 cq_handle; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_create_srq_resp { @@ -1183,7 +1183,7 @@ struct ib_uverbs_create_srq_resp { __u32 max_wr; __u32 max_sge; __u32 srqn; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_modify_srq { @@ -1191,14 +1191,14 @@ struct ib_uverbs_modify_srq { __u32 attr_mask; __u32 max_wr; __u32 srq_limit; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_srq { __aligned_u64 response; __u32 srq_handle; __u32 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_srq_resp { @@ -1269,7 +1269,7 @@ struct ib_uverbs_ex_create_rwq_ind_table { * wq_handle1 * wq_handle2 */ - __u32 wq_handles[0]; + __u32 wq_handles[]; }; struct ib_uverbs_ex_create_rwq_ind_table_resp { diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index ed5a514305c1..7cea03581f79 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -184,7 +184,7 @@ struct rdma_ucm_query_addr_resp { struct rdma_ucm_query_path_resp { __u32 num_paths; __u32 reserved; - struct ib_path_rec_data path_data[0]; + struct ib_path_rec_data path_data[]; }; struct rdma_ucm_conn_param { diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 38ab7accb7be..ab1aef17feb1 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -81,7 +81,7 @@ struct ib_uverbs_ioctl_hdr { __aligned_u64 reserved1; __u32 driver_id; __u32 reserved2; - struct ib_uverbs_attr attrs[0]; + struct ib_uverbs_attr attrs[]; }; #endif diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h index c9812c5c2fc4..16782c360de3 100644 --- a/include/uapi/scsi/fc/fc_els.h +++ b/include/uapi/scsi/fc/fc_els.h @@ -264,7 +264,7 @@ struct fc_tlv_desc { * Size of descriptor excluding * desc_tag and desc_len fields. 
*/ - __u8 desc_value[0]; /* Descriptor Value */ + __u8 desc_value[]; /* Descriptor Value */ }; /* Descriptor tag and len fields are considered the mandatory header @@ -1027,7 +1027,7 @@ struct fc_fn_li_desc { * threshold to caause the LI event */ __be32 pname_count; /* number of portname_list elements */ - __be64 pname_list[0]; /* list of N_Port_Names accessible + __be64 pname_list[]; /* list of N_Port_Names accessible * through the attached port */ }; @@ -1069,7 +1069,7 @@ struct fc_fn_peer_congn_desc { * congestion event */ __be32 pname_count; /* number of portname_list elements */ - __be64 pname_list[0]; /* list of N_Port_Names accessible + __be64 pname_list[]; /* list of N_Port_Names accessible * through the attached port */ }; @@ -1104,7 +1104,7 @@ struct fc_els_fpin { * Size of ELS excluding fpin_cmd, * fpin_zero and desc_len fields. */ - struct fc_tlv_desc fpin_desc[0]; /* Descriptor list */ + struct fc_tlv_desc fpin_desc[]; /* Descriptor list */ }; /* Diagnostic Function Descriptor - FPIN Registration */ @@ -1115,7 +1115,7 @@ struct fc_df_desc_fpin_reg { * desc_tag and desc_len fields. */ __be32 count; /* Number of desc_tags elements */ - __be32 desc_tags[0]; /* Array of Descriptor Tags. + __be32 desc_tags[]; /* Array of Descriptor Tags. * Each tag indicates a function * supported by the N_Port (request) * or by the N_Port and Fabric @@ -1135,7 +1135,7 @@ struct fc_els_rdf { * Size of ELS excluding fpin_cmd, * fpin_zero and desc_len fields. */ - struct fc_tlv_desc desc[0]; /* Descriptor list */ + struct fc_tlv_desc desc[]; /* Descriptor list */ }; /* @@ -1148,7 +1148,7 @@ struct fc_els_rdf_resp { * and desc_list_len fields. */ struct fc_els_lsri_desc lsri; - struct fc_tlv_desc desc[0]; /* Supported Descriptor list */ + struct fc_tlv_desc desc[]; /* Supported Descriptor list */ }; @@ -1231,7 +1231,7 @@ struct fc_els_edc { * Size of ELS excluding edc_cmd, * edc_zero and desc_len fields. */ - struct fc_tlv_desc desc[0]; + struct fc_tlv_desc desc[]; /* Diagnostic Descriptor list */ }; @@ -1245,7 +1245,7 @@ struct fc_els_edc_resp { * and desc_list_len fields. 
*/ struct fc_els_lsri_desc lsri; - struct fc_tlv_desc desc[0]; + struct fc_tlv_desc desc[]; /* Supported Diagnostic Descriptor list */ }; diff --git a/include/uapi/scsi/scsi_bsg_fc.h b/include/uapi/scsi/scsi_bsg_fc.h index 3ae65e93235c..7f5930801f72 100644 --- a/include/uapi/scsi/scsi_bsg_fc.h +++ b/include/uapi/scsi/scsi_bsg_fc.h @@ -209,7 +209,7 @@ struct fc_bsg_host_vendor { __u64 vendor_id; /* start of vendor command area */ - __u32 vendor_cmd[0]; + __u32 vendor_cmd[]; }; /* Response: diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 2d3e5df39a59..3974a2a911cc 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -1106,7 +1106,7 @@ struct snd_ctl_elem_value { struct snd_ctl_tlv { unsigned int numid; /* control element numeric identification */ unsigned int length; /* in bytes aligned to 4 */ - unsigned int tlv[0]; /* first TLV */ + unsigned int tlv[]; /* first TLV */ }; #define SNDRV_CTL_IOCTL_PVERSION _IOR('U', 0x00, int) diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h index 39cf6eb75940..3532ac7046d7 100644 --- a/include/uapi/sound/firewire.h +++ b/include/uapi/sound/firewire.h @@ -38,11 +38,11 @@ struct snd_efw_transaction { __be32 category; __be32 command; __be32 status; - __be32 params[0]; + __be32 params[]; }; struct snd_firewire_event_efw_response { unsigned int type; - __be32 response[0]; /* some responses */ + __be32 response[]; /* some responses */ }; struct snd_firewire_event_digi00x_message { @@ -63,7 +63,7 @@ struct snd_firewire_tascam_change { struct snd_firewire_event_tascam_control { unsigned int type; - struct snd_firewire_tascam_change changes[0]; + struct snd_firewire_tascam_change changes[]; }; struct snd_firewire_event_motu_register_dsp_change { diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h index a93c0decfdd5..f29899b179a6 100644 --- a/include/uapi/sound/skl-tplg-interface.h +++ b/include/uapi/sound/skl-tplg-interface.h @@ -151,7 +151,7 @@ struct skl_dfw_algo_data { __u32 rsvd:30; __u32 param_id; __u32 max; - char params[0]; + char params[]; } __packed; enum skl_tkn_dir { diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h index 5f4518e7a972..dbf137516522 100644 --- a/include/uapi/sound/sof/header.h +++ b/include/uapi/sound/sof/header.h @@ -23,7 +23,7 @@ struct sof_abi_hdr { __u32 size; /**< size in bytes of data excl. 
this struct */ __u32 abi; /**< SOF ABI version */ __u32 reserved[4]; /**< reserved for future use */ - __u32 data[0]; /**< Component data - opaque to core */ + __u32 data[]; /**< Component data - opaque to core */ } __packed; #endif diff --git a/include/uapi/sound/usb_stream.h b/include/uapi/sound/usb_stream.h index 95419d8bbc16..ffdd3ea1e31d 100644 --- a/include/uapi/sound/usb_stream.h +++ b/include/uapi/sound/usb_stream.h @@ -61,7 +61,7 @@ struct usb_stream { unsigned inpacket_split_at; unsigned next_inpacket_split; unsigned next_inpacket_split_at; - struct usb_stream_packet inpacket[0]; + struct usb_stream_packet inpacket[]; }; enum usb_stream_state { diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index bf6e96011dfe..e135f4dcb19d 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -198,13 +198,13 @@ struct kvm_msrs { __u32 nmsrs; /* number of msrs in entries */ __u32 pad; - struct kvm_msr_entry entries[0]; + struct kvm_msr_entry entries[]; }; /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; + __u32 indices[]; }; /* Maximum size of any access bitmap in bytes */ @@ -241,7 +241,7 @@ struct kvm_cpuid_entry { struct kvm_cpuid { __u32 nent; __u32 padding; - struct kvm_cpuid_entry entries[0]; + struct kvm_cpuid_entry entries[]; }; struct kvm_cpuid_entry2 { @@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 { struct kvm_cpuid2 { __u32 nent; __u32 padding; - struct kvm_cpuid_entry2 entries[0]; + struct kvm_cpuid_entry2 entries[]; }; /* for KVM_GET_PIT and KVM_SET_PIT */ @@ -389,7 +389,7 @@ struct kvm_xsave { * the contents of CPUID leaf 0xD on the host. */ __u32 region[1024]; - __u32 extra[0]; + __u32 extra[]; }; #define KVM_MAX_XCRS 16 @@ -515,7 +515,7 @@ struct kvm_pmu_event_filter { __u32 fixed_counter_bitmap; __u32 flags; __u32 pad[4]; - __u64 events[0]; + __u64 events[]; }; #define KVM_PMU_EVENT_ALLOW 0 diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 05c3642aaece..239b91b13c60 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -2060,7 +2060,7 @@ struct i915_context_engines_load_balance { __u64 mbz64; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \ @@ -2098,7 +2098,7 @@ struct i915_context_engines_bond { __u64 flags; /* all undefined flags must be zero */ __u64 mbz64[4]; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \ @@ -2225,7 +2225,7 @@ struct i915_context_engines_parallel_submit { * length = width (i) * num_siblings (j) * index = j + i * num_siblings */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __packed; diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index bdf7b404b3e7..b7b56871029c 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -90,7 +90,7 @@ struct file_dedupe_range { __u16 dest_count; /* in - total elements in info array */ __u16 reserved1; /* must be zero */ __u32 reserved2; /* must be zero */ - struct file_dedupe_range_info info[0]; + struct file_dedupe_range_info 
info[]; }; /* And dynamically-tunable limits and defaults: */ diff --git a/tools/include/uapi/linux/if_tun.h b/tools/include/uapi/linux/if_tun.h index 454ae31b93c7..2ec07de1d73b 100644 --- a/tools/include/uapi/linux/if_tun.h +++ b/tools/include/uapi/linux/if_tun.h @@ -108,7 +108,7 @@ struct tun_pi { struct tun_filter { __u16 flags; /* TUN_FLT_ flags see above */ __u16 count; /* Number of addresses */ - __u8 addr[0][ETH_ALEN]; + __u8 addr[][ETH_ALEN]; }; #endif /* _UAPI__IF_TUN_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6a184d260c7f..37ce8cbac322 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -539,7 +539,7 @@ struct kvm_coalesced_mmio { struct kvm_coalesced_mmio_ring { __u32 first, last; - struct kvm_coalesced_mmio coalesced_mmio[0]; + struct kvm_coalesced_mmio coalesced_mmio[]; }; #define KVM_COALESCED_MMIO_MAX \ @@ -618,7 +618,7 @@ struct kvm_clear_dirty_log { /* for KVM_SET_SIGNAL_MASK */ struct kvm_signal_mask { __u32 len; - __u8 sigset[0]; + __u8 sigset[]; }; /* for KVM_TPR_ACCESS_REPORTING */ @@ -1216,7 +1216,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing { __u32 nr; __u32 flags; - struct kvm_irq_routing_entry entries[0]; + struct kvm_irq_routing_entry entries[]; }; #endif @@ -1335,7 +1335,7 @@ struct kvm_dirty_tlb { struct kvm_reg_list { __u64 n; /* number of regs */ - __u64 reg[0]; + __u64 reg[]; }; struct kvm_one_reg { diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index d37629dbad72..4653834f078f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -491,7 +491,7 @@ struct perf_event_query_bpf { /* * User provided buffer to store program ids */ - __u32 ids[0]; + __u32 ids[]; }; /* diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h index 12153771396a..3faee0199a9b 100644 --- a/tools/include/uapi/linux/pkt_cls.h +++ b/tools/include/uapi/linux/pkt_cls.h @@ -180,7 +180,7 @@ struct tc_u32_sel { short hoff; __be32 hmask; - struct tc_u32_key keys[0]; + struct tc_u32_key keys[]; }; struct tc_u32_mark { @@ -192,7 +192,7 @@ struct tc_u32_mark { struct tc_u32_pcnt { __u64 rcnt; __u64 rhit; - __u64 kcnts[0]; + __u64 kcnts[]; }; /* Flags */ diff --git a/tools/include/uapi/linux/seg6.h b/tools/include/uapi/linux/seg6.h index 286e8d6a8e98..f94baf154c47 100644 --- a/tools/include/uapi/linux/seg6.h +++ b/tools/include/uapi/linux/seg6.h @@ -30,7 +30,7 @@ struct ipv6_sr_hdr { __u8 flags; __u16 tag; - struct in6_addr segments[0]; + struct in6_addr segments[]; }; #define SR6_FLAG1_PROTECTED (1 << 6) @@ -49,7 +49,7 @@ struct ipv6_sr_hdr { struct sr6_tlv { __u8 type; __u8 len; - __u8 data[0]; + __u8 data[]; }; #endif diff --git a/tools/include/uapi/linux/usbdevice_fs.h b/tools/include/uapi/linux/usbdevice_fs.h index cf525cddeb94..74a84e02422a 100644 --- a/tools/include/uapi/linux/usbdevice_fs.h +++ b/tools/include/uapi/linux/usbdevice_fs.h @@ -131,7 +131,7 @@ struct usbdevfs_urb { unsigned int signr; /* signal to be sent on completion, or 0 if none should be sent. 
From 6b56f5f1ef93ec1b34dee1ef72da33ec046f82be Mon Sep 17 00:00:00 2001 From: Johannes Holland Date: Tue, 28 Jun 2022 16:39:27 +0200 Subject: selftests/tpm2: increase timeout for kselftests Due to CreatePrimary commands which need to create RSA keys of increasing size, the timeout value needs to be raised as well. The default is 45s. Fixed 'git am' whitespace warnings: Shuah Khan Signed-off-by: Johannes Holland Signed-off-by: Stefan Mahnke-Hartmann Signed-off-by: Shuah Khan --- tools/testing/selftests/tpm2/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/tpm2/settings (limited to 'tools') diff --git a/tools/testing/selftests/tpm2/settings b/tools/testing/selftests/tpm2/settings new file mode 100644 index 000000000000..a62d2fa1275c --- /dev/null +++ b/tools/testing/selftests/tpm2/settings @@ -0,0 +1 @@ +timeout=600 -- cgit v1.2.3
From 8587f3732b37a39de205d8c42f5448dbe6aa6b55 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Tue, 28 Jun 2022 11:16:03 +0530 Subject: selftests/drivers/gpu: Add error messages to drm_mm.sh Add error messages for when the module test-drm_mm is not found or cannot be removed, to make the test output more readable. Signed-off-by: Gautam Menghani Signed-off-by: Shuah Khan --- tools/testing/selftests/drivers/gpu/drm_mm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/gpu/drm_mm.sh b/tools/testing/selftests/drivers/gpu/drm_mm.sh index b789dc8257e6..09c76cd7661d 100755 --- a/tools/testing/selftests/drivers/gpu/drm_mm.sh +++ b/tools/testing/selftests/drivers/gpu/drm_mm.sh @@ -3,7 +3,7 @@ # Runs API tests for struct drm_mm (DRM range manager) if ! /sbin/modprobe -n -q test-drm_mm; then - echo "drivers/gpu/drm_mm: [skip]" + echo "drivers/gpu/drm_mm: module test-drm_mm is not found in /lib/modules/`uname -r` [skip]" exit 77 fi @@ -11,6 +11,6 @@ if /sbin/modprobe -q test-drm_mm; then /sbin/modprobe -q -r test-drm_mm echo "drivers/gpu/drm_mm: ok" else - echo "drivers/gpu/drm_mm: [FAIL]" + echo "drivers/gpu/drm_mm: module test-drm_mm could not be removed [FAIL]" exit 1 fi -- cgit v1.2.3
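The next patch removes libbpf's deprecated AF_XDP (xsk) convenience API and moves it into the BPF selftests. As orientation for the large diff that follows, here is a minimal, hypothetical receive loop built on the ring helpers that the moved xsk.h provides; it assumes the socket and umem were already set up with the (deprecated) xsk_socket__create() and xsk_umem__create(), and that umem_area points at the registered umem memory:

#include <linux/if_xdp.h>
#include "xsk.h"

/* Sketch only: drain up to 64 received descriptors from an AF_XDP RX
 * ring. "rx" and "umem_area" are assumed to come from an earlier
 * xsk_socket__create()/xsk_umem__create() setup. */
static void rx_drain(struct xsk_ring_cons *rx, void *umem_area)
{
	__u32 idx = 0, i;
	__u32 rcvd = xsk_ring_cons__peek(rx, 64, &idx);

	for (i = 0; i < rcvd; i++) {
		const struct xdp_desc *desc = xsk_ring_cons__rx_desc(rx, idx + i);
		void *pkt = xsk_umem__get_data(umem_area, desc->addr);

		/* ... process desc->len bytes of packet data at pkt ... */
		(void)pkt;
	}

	if (rcvd)
		/* Publish the consumer pointer so the kernel can reuse
		 * the ring slots. */
		xsk_ring_cons__release(rx, rcvd);
}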
From f36600634282a519e1b0abea609acdc8731515d7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:13 -0700 Subject: libbpf: move xsk.{c,h} into selftests/bpf Remove the deprecated xsk APIs from libbpf. But given that we have selftests relying on them, move those files (with minimal adjustments to make them compilable) under selftests/bpf. We also remove all the removed APIs from libbpf.map, while keeping the overall version inheritance chain, as most APIs are backwards compatible and there is no need to reassign them as LIBBPF_1.0.0 versions. Cc: Magnus Karlsson Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/Makefile | 2 +- tools/lib/bpf/libbpf.map | 12 - tools/lib/bpf/xsk.c | 1260 ----------------------------- tools/lib/bpf/xsk.h | 336 -------- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/xdpxceiver.c | 2 +- tools/testing/selftests/bpf/xsk.c | 1264 ++++++++++++++++++++++++++++++ tools/testing/selftests/bpf/xsk.h | 315 ++++++++ 9 files changed, 1584 insertions(+), 1611 deletions(-) delete mode 100644 tools/lib/bpf/xsk.c delete mode 100644 tools/lib/bpf/xsk.h create mode 100644 tools/testing/selftests/bpf/xsk.c create mode 100644 tools/testing/selftests/bpf/xsk.h (limited to 'tools') diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 31a1a9015902..5a3dfb56d78f 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,4 +1,4 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ - netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ + netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \ btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ usdt.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index a1265b152027..4c904ef0b47e 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -237,7 +237,7 @@ install_lib: all_cmd $(call do_install_mkdir,$(libdir_SQ)); \ cp -fpR $(LIB_FILE) $(DESTDIR)$(libdir_SQ) -SRC_HDRS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h xsk.h \ +SRC_HDRS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h \ bpf_helpers.h bpf_tracing.h bpf_endian.h bpf_core_read.h \ skel_internal.h libbpf_version.h usdt.bpf.h GEN_HDRS := $(BPF_GENERATED) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 116a2a8ee7c2..da7a4f928452 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -147,12 +147,6 @@ LIBBPF_0.0.2 { btf_ext__new; btf_ext__reloc_func_info; btf_ext__reloc_line_info; - xsk_umem__create; - xsk_socket__create; - xsk_umem__delete; - xsk_socket__delete; - xsk_umem__fd; - xsk_socket__fd; bpf_program__get_prog_info_linear; bpf_program__bpil_addr_to_offs; bpf_program__bpil_offs_to_addr; @@ -183,7 +177,6 @@ LIBBPF_0.0.4 { perf_buffer__new; perf_buffer__new_raw; perf_buffer__poll; - xsk_umem__create; } LIBBPF_0.0.3; LIBBPF_0.0.5 { @@ -336,7 +329,6 @@ LIBBPF_0.2.0 { perf_buffer__buffer_fd; perf_buffer__epoll_fd; perf_buffer__consume_buffer; - xsk_socket__create_shared; } LIBBPF_0.1.0; LIBBPF_0.3.0 { @@ -348,8 +340,6 @@ LIBBPF_0.3.0 { btf__new_empty_split; btf__new_split; ring_buffer__epoll_fd; - xsk_setup_xdp_prog; - xsk_socket__update_xskmap; } LIBBPF_0.2.0; LIBBPF_0.4.0 { @@ -468,6 +458,4 @@ LIBBPF_1.0.0 { libbpf_bpf_link_type_str; libbpf_bpf_map_type_str; libbpf_bpf_prog_type_str; - - local: *; }; diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c deleted file mode 100644 index af136f73b09d..000000000000 --- a/tools/lib/bpf/xsk.c +++ /dev/null @@ -1,1260 +0,0 @@ -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) - -/* - * AF_XDP user-space access library. - * - * Copyright(c) 2018 - 2019 Intel Corporation. 
- * - * Author(s): Magnus Karlsson - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "bpf.h" -#include "libbpf.h" -#include "libbpf_internal.h" -#include "xsk.h" - -/* entire xsk.h and xsk.c is going away in libbpf 1.0, so ignore all internal - * uses of deprecated APIs - */ -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -#ifndef SOL_XDP - #define SOL_XDP 283 -#endif - -#ifndef AF_XDP - #define AF_XDP 44 -#endif - -#ifndef PF_XDP - #define PF_XDP AF_XDP -#endif - -enum xsk_prog { - XSK_PROG_FALLBACK, - XSK_PROG_REDIRECT_FLAGS, -}; - -struct xsk_umem { - struct xsk_ring_prod *fill_save; - struct xsk_ring_cons *comp_save; - char *umem_area; - struct xsk_umem_config config; - int fd; - int refcount; - struct list_head ctx_list; - bool rx_ring_setup_done; - bool tx_ring_setup_done; -}; - -struct xsk_ctx { - struct xsk_ring_prod *fill; - struct xsk_ring_cons *comp; - __u32 queue_id; - struct xsk_umem *umem; - int refcount; - int ifindex; - struct list_head list; - int prog_fd; - int link_fd; - int xsks_map_fd; - char ifname[IFNAMSIZ]; - bool has_bpf_link; -}; - -struct xsk_socket { - struct xsk_ring_cons *rx; - struct xsk_ring_prod *tx; - __u64 outstanding_tx; - struct xsk_ctx *ctx; - struct xsk_socket_config config; - int fd; -}; - -struct xsk_nl_info { - bool xdp_prog_attached; - int ifindex; - int fd; -}; - -/* Up until and including Linux 5.3 */ -struct xdp_ring_offset_v1 { - __u64 producer; - __u64 consumer; - __u64 desc; -}; - -/* Up until and including Linux 5.3 */ -struct xdp_mmap_offsets_v1 { - struct xdp_ring_offset_v1 rx; - struct xdp_ring_offset_v1 tx; - struct xdp_ring_offset_v1 fr; - struct xdp_ring_offset_v1 cr; -}; - -int xsk_umem__fd(const struct xsk_umem *umem) -{ - return umem ? umem->fd : -EINVAL; -} - -int xsk_socket__fd(const struct xsk_socket *xsk) -{ - return xsk ? 
xsk->fd : -EINVAL; -} - -static bool xsk_page_aligned(void *buffer) -{ - unsigned long addr = (unsigned long)buffer; - - return !(addr & (getpagesize() - 1)); -} - -static void xsk_set_umem_config(struct xsk_umem_config *cfg, - const struct xsk_umem_config *usr_cfg) -{ - if (!usr_cfg) { - cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; - cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; - cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; - cfg->flags = XSK_UMEM__DEFAULT_FLAGS; - return; - } - - cfg->fill_size = usr_cfg->fill_size; - cfg->comp_size = usr_cfg->comp_size; - cfg->frame_size = usr_cfg->frame_size; - cfg->frame_headroom = usr_cfg->frame_headroom; - cfg->flags = usr_cfg->flags; -} - -static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, - const struct xsk_socket_config *usr_cfg) -{ - if (!usr_cfg) { - cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; - cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - cfg->libbpf_flags = 0; - cfg->xdp_flags = 0; - cfg->bind_flags = 0; - return 0; - } - - if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD) - return -EINVAL; - - cfg->rx_size = usr_cfg->rx_size; - cfg->tx_size = usr_cfg->tx_size; - cfg->libbpf_flags = usr_cfg->libbpf_flags; - cfg->xdp_flags = usr_cfg->xdp_flags; - cfg->bind_flags = usr_cfg->bind_flags; - - return 0; -} - -static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off) -{ - struct xdp_mmap_offsets_v1 off_v1; - - /* getsockopt on a kernel <= 5.3 has no flags fields. - * Copy over the offsets to the correct places in the >=5.4 format - * and put the flags where they would have been on that kernel. - */ - memcpy(&off_v1, off, sizeof(off_v1)); - - off->rx.producer = off_v1.rx.producer; - off->rx.consumer = off_v1.rx.consumer; - off->rx.desc = off_v1.rx.desc; - off->rx.flags = off_v1.rx.consumer + sizeof(__u32); - - off->tx.producer = off_v1.tx.producer; - off->tx.consumer = off_v1.tx.consumer; - off->tx.desc = off_v1.tx.desc; - off->tx.flags = off_v1.tx.consumer + sizeof(__u32); - - off->fr.producer = off_v1.fr.producer; - off->fr.consumer = off_v1.fr.consumer; - off->fr.desc = off_v1.fr.desc; - off->fr.flags = off_v1.fr.consumer + sizeof(__u32); - - off->cr.producer = off_v1.cr.producer; - off->cr.consumer = off_v1.cr.consumer; - off->cr.desc = off_v1.cr.desc; - off->cr.flags = off_v1.cr.consumer + sizeof(__u32); -} - -static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off) -{ - socklen_t optlen; - int err; - - optlen = sizeof(*off); - err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen); - if (err) - return err; - - if (optlen == sizeof(*off)) - return 0; - - if (optlen == sizeof(struct xdp_mmap_offsets_v1)) { - xsk_mmap_offsets_v1(off); - return 0; - } - - return -EINVAL; -} - -static int xsk_create_umem_rings(struct xsk_umem *umem, int fd, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp) -{ - struct xdp_mmap_offsets off; - void *map; - int err; - - err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, - &umem->config.fill_size, - sizeof(umem->config.fill_size)); - if (err) - return -errno; - - err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, - &umem->config.comp_size, - sizeof(umem->config.comp_size)); - if (err) - return -errno; - - err = xsk_get_mmap_offsets(fd, &off); - if (err) - return -errno; - - map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - XDP_UMEM_PGOFF_FILL_RING); - if (map == MAP_FAILED) - return 
-errno; - - fill->mask = umem->config.fill_size - 1; - fill->size = umem->config.fill_size; - fill->producer = map + off.fr.producer; - fill->consumer = map + off.fr.consumer; - fill->flags = map + off.fr.flags; - fill->ring = map + off.fr.desc; - fill->cached_cons = umem->config.fill_size; - - map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - XDP_UMEM_PGOFF_COMPLETION_RING); - if (map == MAP_FAILED) { - err = -errno; - goto out_mmap; - } - - comp->mask = umem->config.comp_size - 1; - comp->size = umem->config.comp_size; - comp->producer = map + off.cr.producer; - comp->consumer = map + off.cr.consumer; - comp->flags = map + off.cr.flags; - comp->ring = map + off.cr.desc; - - return 0; - -out_mmap: - munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64)); - return err; -} - -DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4) -int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area, - __u64 size, struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *usr_config) -{ - struct xdp_umem_reg mr; - struct xsk_umem *umem; - int err; - - if (!umem_area || !umem_ptr || !fill || !comp) - return -EFAULT; - if (!size && !xsk_page_aligned(umem_area)) - return -EINVAL; - - umem = calloc(1, sizeof(*umem)); - if (!umem) - return -ENOMEM; - - umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0); - if (umem->fd < 0) { - err = -errno; - goto out_umem_alloc; - } - - umem->umem_area = umem_area; - INIT_LIST_HEAD(&umem->ctx_list); - xsk_set_umem_config(&umem->config, usr_config); - - memset(&mr, 0, sizeof(mr)); - mr.addr = (uintptr_t)umem_area; - mr.len = size; - mr.chunk_size = umem->config.frame_size; - mr.headroom = umem->config.frame_headroom; - mr.flags = umem->config.flags; - - err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); - if (err) { - err = -errno; - goto out_socket; - } - - err = xsk_create_umem_rings(umem, umem->fd, fill, comp); - if (err) - goto out_socket; - - umem->fill_save = fill; - umem->comp_save = comp; - *umem_ptr = umem; - return 0; - -out_socket: - close(umem->fd); -out_umem_alloc: - free(umem); - return err; -} - -struct xsk_umem_config_v1 { - __u32 fill_size; - __u32 comp_size; - __u32 frame_size; - __u32 frame_headroom; -}; - -COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2) -int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area, - __u64 size, struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *usr_config) -{ - struct xsk_umem_config config; - - memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1)); - config.flags = 0; - - return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp, - &config); -} - -static enum xsk_prog get_xsk_prog(void) -{ - enum xsk_prog detected = XSK_PROG_FALLBACK; - __u32 size_out, retval, duration; - char data_in = 0, data_out; - struct bpf_insn insns[] = { - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_MOV64_IMM(BPF_REG_3, XDP_PASS), - BPF_EMIT_CALL(BPF_FUNC_redirect_map), - BPF_EXIT_INSN(), - }; - int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns); - - map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL); - if (map_fd < 0) - return detected; - - insns[0].imm = map_fd; - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL); - if (prog_fd < 0) { - close(map_fd); - return detected; - } - - ret = 
bpf_prog_test_run(prog_fd, 0, &data_in, 1, &data_out, &size_out, &retval, &duration); - if (!ret && retval == XDP_PASS) - detected = XSK_PROG_REDIRECT_FLAGS; - close(prog_fd); - close(map_fd); - return detected; -} - -static int xsk_load_xdp_prog(struct xsk_socket *xsk) -{ - static const int log_buf_size = 16 * 1024; - struct xsk_ctx *ctx = xsk->ctx; - char log_buf[log_buf_size]; - int prog_fd; - - /* This is the fallback C-program: - * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) - * { - * int ret, index = ctx->rx_queue_index; - * - * // A set entry here means that the correspnding queue_id - * // has an active AF_XDP socket bound to it. - * ret = bpf_redirect_map(&xsks_map, index, XDP_PASS); - * if (ret > 0) - * return ret; - * - * // Fallback for pre-5.3 kernels, not supporting default - * // action in the flags parameter. - * if (bpf_map_lookup_elem(&xsks_map, &index)) - * return bpf_redirect_map(&xsks_map, index, 0); - * return XDP_PASS; - * } - */ - struct bpf_insn prog[] = { - /* r2 = *(u32 *)(r1 + 16) */ - BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16), - /* *(u32 *)(r10 - 4) = r2 */ - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4), - /* r1 = xskmap[] */ - BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), - /* r3 = XDP_PASS */ - BPF_MOV64_IMM(BPF_REG_3, 2), - /* call bpf_redirect_map */ - BPF_EMIT_CALL(BPF_FUNC_redirect_map), - /* if w0 != 0 goto pc+13 */ - BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13), - /* r2 = r10 */ - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - /* r2 += -4 */ - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), - /* r1 = xskmap[] */ - BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), - /* call bpf_map_lookup_elem */ - BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), - /* r1 = r0 */ - BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), - /* r0 = XDP_PASS */ - BPF_MOV64_IMM(BPF_REG_0, 2), - /* if r1 == 0 goto pc+5 */ - BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), - /* r2 = *(u32 *)(r10 - 4) */ - BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4), - /* r1 = xskmap[] */ - BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), - /* r3 = 0 */ - BPF_MOV64_IMM(BPF_REG_3, 0), - /* call bpf_redirect_map */ - BPF_EMIT_CALL(BPF_FUNC_redirect_map), - /* The jumps are to this instruction */ - BPF_EXIT_INSN(), - }; - - /* This is the post-5.3 kernel C-program: - * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) - * { - * return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS); - * } - */ - struct bpf_insn prog_redirect_flags[] = { - /* r2 = *(u32 *)(r1 + 16) */ - BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16), - /* r1 = xskmap[] */ - BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), - /* r3 = XDP_PASS */ - BPF_MOV64_IMM(BPF_REG_3, 2), - /* call bpf_redirect_map */ - BPF_EMIT_CALL(BPF_FUNC_redirect_map), - BPF_EXIT_INSN(), - }; - size_t insns_cnt[] = {ARRAY_SIZE(prog), - ARRAY_SIZE(prog_redirect_flags), - }; - struct bpf_insn *progs[] = {prog, prog_redirect_flags}; - enum xsk_prog option = get_xsk_prog(); - LIBBPF_OPTS(bpf_prog_load_opts, opts, - .log_buf = log_buf, - .log_size = log_buf_size, - ); - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause", - progs[option], insns_cnt[option], &opts); - if (prog_fd < 0) { - pr_warn("BPF log buffer:\n%s", log_buf); - return prog_fd; - } - - ctx->prog_fd = prog_fd; - return 0; -} - -static int xsk_create_bpf_link(struct xsk_socket *xsk) -{ - DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); - struct xsk_ctx *ctx = xsk->ctx; - __u32 prog_id = 0; - int link_fd; - int err; - - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); - if (err) { - 
pr_warn("getting XDP prog id failed\n"); - return err; - } - - /* if there's a netlink-based XDP prog loaded on interface, bail out - * and ask user to do the removal by himself - */ - if (prog_id) { - pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n"); - return -EINVAL; - } - - opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE); - - link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts); - if (link_fd < 0) { - pr_warn("bpf_link_create failed: %s\n", strerror(errno)); - return link_fd; - } - - ctx->link_fd = link_fd; - return 0; -} - -static int xsk_get_max_queues(struct xsk_socket *xsk) -{ - struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; - struct xsk_ctx *ctx = xsk->ctx; - struct ifreq ifr = {}; - int fd, err, ret; - - fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0); - if (fd < 0) - return -errno; - - ifr.ifr_data = (void *)&channels; - libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ); - err = ioctl(fd, SIOCETHTOOL, &ifr); - if (err && errno != EOPNOTSUPP) { - ret = -errno; - goto out; - } - - if (err) { - /* If the device says it has no channels, then all traffic - * is sent to a single stream, so max queues = 1. - */ - ret = 1; - } else { - /* Take the max of rx, tx, combined. Drivers return - * the number of channels in different ways. - */ - ret = max(channels.max_rx, channels.max_tx); - ret = max(ret, (int)channels.max_combined); - } - -out: - close(fd); - return ret; -} - -static int xsk_create_bpf_maps(struct xsk_socket *xsk) -{ - struct xsk_ctx *ctx = xsk->ctx; - int max_queues; - int fd; - - max_queues = xsk_get_max_queues(xsk); - if (max_queues < 0) - return max_queues; - - fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map", - sizeof(int), sizeof(int), max_queues, NULL); - if (fd < 0) - return fd; - - ctx->xsks_map_fd = fd; - - return 0; -} - -static void xsk_delete_bpf_maps(struct xsk_socket *xsk) -{ - struct xsk_ctx *ctx = xsk->ctx; - - bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id); - close(ctx->xsks_map_fd); -} - -static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) -{ - __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info); - __u32 map_len = sizeof(struct bpf_map_info); - struct bpf_prog_info prog_info = {}; - struct xsk_ctx *ctx = xsk->ctx; - struct bpf_map_info map_info; - int fd, err; - - err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len); - if (err) - return err; - - num_maps = prog_info.nr_map_ids; - - map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); - if (!map_ids) - return -ENOMEM; - - memset(&prog_info, 0, prog_len); - prog_info.nr_map_ids = num_maps; - prog_info.map_ids = (__u64)(unsigned long)map_ids; - - err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len); - if (err) - goto out_map_ids; - - ctx->xsks_map_fd = -1; - - for (i = 0; i < prog_info.nr_map_ids; i++) { - fd = bpf_map_get_fd_by_id(map_ids[i]); - if (fd < 0) - continue; - - memset(&map_info, 0, map_len); - err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); - if (err) { - close(fd); - continue; - } - - if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) { - ctx->xsks_map_fd = fd; - break; - } - - close(fd); - } - - if (ctx->xsks_map_fd == -1) - err = -ENOENT; - -out_map_ids: - free(map_ids); - return err; -} - -static int xsk_set_bpf_maps(struct xsk_socket *xsk) -{ - struct xsk_ctx *ctx = xsk->ctx; - - return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, - &xsk->fd, 0); -} - -static int 
xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd) -{ - struct bpf_link_info link_info; - __u32 link_len; - __u32 id = 0; - int err; - int fd; - - while (true) { - err = bpf_link_get_next_id(id, &id); - if (err) { - if (errno == ENOENT) { - err = 0; - break; - } - pr_warn("can't get next link: %s\n", strerror(errno)); - break; - } - - fd = bpf_link_get_fd_by_id(id); - if (fd < 0) { - if (errno == ENOENT) - continue; - pr_warn("can't get link by id (%u): %s\n", id, strerror(errno)); - err = -errno; - break; - } - - link_len = sizeof(struct bpf_link_info); - memset(&link_info, 0, link_len); - err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len); - if (err) { - pr_warn("can't get link info: %s\n", strerror(errno)); - close(fd); - break; - } - if (link_info.type == BPF_LINK_TYPE_XDP) { - if (link_info.xdp.ifindex == ifindex) { - *link_fd = fd; - if (prog_id) - *prog_id = link_info.prog_id; - break; - } - } - close(fd); - } - - return err; -} - -static bool xsk_probe_bpf_link(void) -{ - LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE); - struct bpf_insn insns[2] = { - BPF_MOV64_IMM(BPF_REG_0, XDP_PASS), - BPF_EXIT_INSN() - }; - int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns); - int ifindex_lo = 1; - bool ret = false; - int err; - - err = xsk_link_lookup(ifindex_lo, NULL, &link_fd); - if (err) - return ret; - - if (link_fd >= 0) - return true; - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL); - if (prog_fd < 0) - return ret; - - link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts); - close(prog_fd); - - if (link_fd >= 0) { - ret = true; - close(link_fd); - } - - return ret; -} - -static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) -{ - char ifname[IFNAMSIZ]; - struct xsk_ctx *ctx; - char *interface; - - ctx = calloc(1, sizeof(*ctx)); - if (!ctx) - return -ENOMEM; - - interface = if_indextoname(ifindex, &ifname[0]); - if (!interface) { - free(ctx); - return -errno; - } - - ctx->ifindex = ifindex; - libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ); - - xsk->ctx = ctx; - xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); - - return 0; -} - -static int xsk_init_xdp_res(struct xsk_socket *xsk, - int *xsks_map_fd) -{ - struct xsk_ctx *ctx = xsk->ctx; - int err; - - err = xsk_create_bpf_maps(xsk); - if (err) - return err; - - err = xsk_load_xdp_prog(xsk); - if (err) - goto err_load_xdp_prog; - - if (ctx->has_bpf_link) - err = xsk_create_bpf_link(xsk); - else - err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd, - xsk->config.xdp_flags); - - if (err) - goto err_attach_xdp_prog; - - if (!xsk->rx) - return err; - - err = xsk_set_bpf_maps(xsk); - if (err) - goto err_set_bpf_maps; - - return err; - -err_set_bpf_maps: - if (ctx->has_bpf_link) - close(ctx->link_fd); - else - bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); -err_attach_xdp_prog: - close(ctx->prog_fd); -err_load_xdp_prog: - xsk_delete_bpf_maps(xsk); - return err; -} - -static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id) -{ - struct xsk_ctx *ctx = xsk->ctx; - int err; - - ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); - if (ctx->prog_fd < 0) { - err = -errno; - goto err_prog_fd; - } - err = xsk_lookup_bpf_maps(xsk); - if (err) - goto err_lookup_maps; - - if (!xsk->rx) - return err; - - err = xsk_set_bpf_maps(xsk); - if (err) - goto err_set_maps; - - return err; - -err_set_maps: - close(ctx->xsks_map_fd); -err_lookup_maps: - close(ctx->prog_fd); -err_prog_fd: - if (ctx->has_bpf_link) - close(ctx->link_fd); - return err; 
-} - -static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) -{ - struct xsk_socket *xsk = _xdp; - struct xsk_ctx *ctx = xsk->ctx; - __u32 prog_id = 0; - int err; - - if (ctx->has_bpf_link) - err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd); - else - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); - - if (err) - return err; - - err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) : - xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id); - - if (!err && xsks_map_fd) - *xsks_map_fd = ctx->xsks_map_fd; - - return err; -} - -static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, - __u32 queue_id) -{ - struct xsk_ctx *ctx; - - if (list_empty(&umem->ctx_list)) - return NULL; - - list_for_each_entry(ctx, &umem->ctx_list, list) { - if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) { - ctx->refcount++; - return ctx; - } - } - - return NULL; -} - -static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap) -{ - struct xsk_umem *umem = ctx->umem; - struct xdp_mmap_offsets off; - int err; - - if (--ctx->refcount) - return; - - if (!unmap) - goto out_free; - - err = xsk_get_mmap_offsets(umem->fd, &off); - if (err) - goto out_free; - - munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size * - sizeof(__u64)); - munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size * - sizeof(__u64)); - -out_free: - list_del(&ctx->list); - free(ctx); -} - -static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, - struct xsk_umem *umem, int ifindex, - const char *ifname, __u32 queue_id, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp) -{ - struct xsk_ctx *ctx; - int err; - - ctx = calloc(1, sizeof(*ctx)); - if (!ctx) - return NULL; - - if (!umem->fill_save) { - err = xsk_create_umem_rings(umem, xsk->fd, fill, comp); - if (err) { - free(ctx); - return NULL; - } - } else if (umem->fill_save != fill || umem->comp_save != comp) { - /* Copy over rings to new structs. 
*/ - memcpy(fill, umem->fill_save, sizeof(*fill)); - memcpy(comp, umem->comp_save, sizeof(*comp)); - } - - ctx->ifindex = ifindex; - ctx->refcount = 1; - ctx->umem = umem; - ctx->queue_id = queue_id; - libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ); - - ctx->fill = fill; - ctx->comp = comp; - list_add(&ctx->list, &umem->ctx_list); - return ctx; -} - -static void xsk_destroy_xsk_struct(struct xsk_socket *xsk) -{ - free(xsk->ctx); - free(xsk); -} - -int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd) -{ - xsk->ctx->xsks_map_fd = fd; - return xsk_set_bpf_maps(xsk); -} - -int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd) -{ - struct xsk_socket *xsk; - int res; - - xsk = calloc(1, sizeof(*xsk)); - if (!xsk) - return -ENOMEM; - - res = xsk_create_xsk_struct(ifindex, xsk); - if (res) { - free(xsk); - return -EINVAL; - } - - res = __xsk_setup_xdp_prog(xsk, xsks_map_fd); - - xsk_destroy_xsk_struct(xsk); - - return res; -} - -int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, - const char *ifname, - __u32 queue_id, struct xsk_umem *umem, - struct xsk_ring_cons *rx, - struct xsk_ring_prod *tx, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_socket_config *usr_config) -{ - bool unmap, rx_setup_done = false, tx_setup_done = false; - void *rx_map = NULL, *tx_map = NULL; - struct sockaddr_xdp sxdp = {}; - struct xdp_mmap_offsets off; - struct xsk_socket *xsk; - struct xsk_ctx *ctx; - int err, ifindex; - - if (!umem || !xsk_ptr || !(rx || tx)) - return -EFAULT; - - unmap = umem->fill_save != fill; - - xsk = calloc(1, sizeof(*xsk)); - if (!xsk) - return -ENOMEM; - - err = xsk_set_xdp_socket_config(&xsk->config, usr_config); - if (err) - goto out_xsk_alloc; - - xsk->outstanding_tx = 0; - ifindex = if_nametoindex(ifname); - if (!ifindex) { - err = -errno; - goto out_xsk_alloc; - } - - if (umem->refcount++ > 0) { - xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0); - if (xsk->fd < 0) { - err = -errno; - goto out_xsk_alloc; - } - } else { - xsk->fd = umem->fd; - rx_setup_done = umem->rx_ring_setup_done; - tx_setup_done = umem->tx_ring_setup_done; - } - - ctx = xsk_get_ctx(umem, ifindex, queue_id); - if (!ctx) { - if (!fill || !comp) { - err = -EFAULT; - goto out_socket; - } - - ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id, - fill, comp); - if (!ctx) { - err = -ENOMEM; - goto out_socket; - } - } - xsk->ctx = ctx; - xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); - - if (rx && !rx_setup_done) { - err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, - &xsk->config.rx_size, - sizeof(xsk->config.rx_size)); - if (err) { - err = -errno; - goto out_put_ctx; - } - if (xsk->fd == umem->fd) - umem->rx_ring_setup_done = true; - } - if (tx && !tx_setup_done) { - err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, - &xsk->config.tx_size, - sizeof(xsk->config.tx_size)); - if (err) { - err = -errno; - goto out_put_ctx; - } - if (xsk->fd == umem->fd) - umem->tx_ring_setup_done = true; - } - - err = xsk_get_mmap_offsets(xsk->fd, &off); - if (err) { - err = -errno; - goto out_put_ctx; - } - - if (rx) { - rx_map = mmap(NULL, off.rx.desc + - xsk->config.rx_size * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, - xsk->fd, XDP_PGOFF_RX_RING); - if (rx_map == MAP_FAILED) { - err = -errno; - goto out_put_ctx; - } - - rx->mask = xsk->config.rx_size - 1; - rx->size = xsk->config.rx_size; - rx->producer = rx_map + off.rx.producer; - rx->consumer = rx_map + off.rx.consumer; - rx->flags = rx_map + off.rx.flags; - rx->ring = rx_map + off.rx.desc; - 
rx->cached_prod = *rx->producer; - rx->cached_cons = *rx->consumer; - } - xsk->rx = rx; - - if (tx) { - tx_map = mmap(NULL, off.tx.desc + - xsk->config.tx_size * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, - xsk->fd, XDP_PGOFF_TX_RING); - if (tx_map == MAP_FAILED) { - err = -errno; - goto out_mmap_rx; - } - - tx->mask = xsk->config.tx_size - 1; - tx->size = xsk->config.tx_size; - tx->producer = tx_map + off.tx.producer; - tx->consumer = tx_map + off.tx.consumer; - tx->flags = tx_map + off.tx.flags; - tx->ring = tx_map + off.tx.desc; - tx->cached_prod = *tx->producer; - /* cached_cons is r->size bigger than the real consumer pointer - * See xsk_prod_nb_free - */ - tx->cached_cons = *tx->consumer + xsk->config.tx_size; - } - xsk->tx = tx; - - sxdp.sxdp_family = PF_XDP; - sxdp.sxdp_ifindex = ctx->ifindex; - sxdp.sxdp_queue_id = ctx->queue_id; - if (umem->refcount > 1) { - sxdp.sxdp_flags |= XDP_SHARED_UMEM; - sxdp.sxdp_shared_umem_fd = umem->fd; - } else { - sxdp.sxdp_flags = xsk->config.bind_flags; - } - - err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp)); - if (err) { - err = -errno; - goto out_mmap_tx; - } - - ctx->prog_fd = -1; - - if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { - err = __xsk_setup_xdp_prog(xsk, NULL); - if (err) - goto out_mmap_tx; - } - - *xsk_ptr = xsk; - umem->fill_save = NULL; - umem->comp_save = NULL; - return 0; - -out_mmap_tx: - if (tx) - munmap(tx_map, off.tx.desc + - xsk->config.tx_size * sizeof(struct xdp_desc)); -out_mmap_rx: - if (rx) - munmap(rx_map, off.rx.desc + - xsk->config.rx_size * sizeof(struct xdp_desc)); -out_put_ctx: - xsk_put_ctx(ctx, unmap); -out_socket: - if (--umem->refcount) - close(xsk->fd); -out_xsk_alloc: - free(xsk); - return err; -} - -int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, - __u32 queue_id, struct xsk_umem *umem, - struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, - const struct xsk_socket_config *usr_config) -{ - if (!umem) - return -EFAULT; - - return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem, - rx, tx, umem->fill_save, - umem->comp_save, usr_config); -} - -int xsk_umem__delete(struct xsk_umem *umem) -{ - struct xdp_mmap_offsets off; - int err; - - if (!umem) - return 0; - - if (umem->refcount) - return -EBUSY; - - err = xsk_get_mmap_offsets(umem->fd, &off); - if (!err && umem->fill_save && umem->comp_save) { - munmap(umem->fill_save->ring - off.fr.desc, - off.fr.desc + umem->config.fill_size * sizeof(__u64)); - munmap(umem->comp_save->ring - off.cr.desc, - off.cr.desc + umem->config.comp_size * sizeof(__u64)); - } - - close(umem->fd); - free(umem); - - return 0; -} - -void xsk_socket__delete(struct xsk_socket *xsk) -{ - size_t desc_sz = sizeof(struct xdp_desc); - struct xdp_mmap_offsets off; - struct xsk_umem *umem; - struct xsk_ctx *ctx; - int err; - - if (!xsk) - return; - - ctx = xsk->ctx; - umem = ctx->umem; - if (ctx->prog_fd != -1) { - xsk_delete_bpf_maps(xsk); - close(ctx->prog_fd); - if (ctx->has_bpf_link) - close(ctx->link_fd); - } - - err = xsk_get_mmap_offsets(xsk->fd, &off); - if (!err) { - if (xsk->rx) { - munmap(xsk->rx->ring - off.rx.desc, - off.rx.desc + xsk->config.rx_size * desc_sz); - } - if (xsk->tx) { - munmap(xsk->tx->ring - off.tx.desc, - off.tx.desc + xsk->config.tx_size * desc_sz); - } - } - - xsk_put_ctx(ctx, true); - - umem->refcount--; - /* Do not close an fd that also has an associated umem connected - * to it. 
- */ - if (xsk->fd != umem->fd) - close(xsk->fd); - free(xsk); -} diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h deleted file mode 100644 index 64e9c57fd792..000000000000 --- a/tools/lib/bpf/xsk.h +++ /dev/null @@ -1,336 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ - -/* - * AF_XDP user-space access library. - * - * Copyright (c) 2018 - 2019 Intel Corporation. - * Copyright (c) 2019 Facebook - * - * Author(s): Magnus Karlsson - */ - -#ifndef __LIBBPF_XSK_H -#define __LIBBPF_XSK_H - -#include -#include -#include -#include - -#include "libbpf.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* This whole API has been deprecated and moved to libxdp that can be found at - * https://github.com/xdp-project/xdp-tools. The APIs are exactly the same so - * it should just be linking with libxdp instead of libbpf for this set of - * functionality. If not, please submit a bug report on the aforementioned page. - */ - -/* Load-Acquire Store-Release barriers used by the XDP socket - * library. The following macros should *NOT* be considered part of - * the xsk.h API, and is subject to change anytime. - * - * LIBRARY INTERNAL - */ - -#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x) -#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) - -#if defined(__i386__) || defined(__x86_64__) -# define libbpf_smp_store_release(p, v) \ - do { \ - asm volatile("" : : : "memory"); \ - __XSK_WRITE_ONCE(*p, v); \ - } while (0) -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ - asm volatile("" : : : "memory"); \ - ___p1; \ - }) -#elif defined(__aarch64__) -# define libbpf_smp_store_release(p, v) \ - asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1; \ - asm volatile ("ldar %w0, %1" \ - : "=r" (___p1) : "Q" (*p) : "memory"); \ - ___p1; \ - }) -#elif defined(__riscv) -# define libbpf_smp_store_release(p, v) \ - do { \ - asm volatile ("fence rw,w" : : : "memory"); \ - __XSK_WRITE_ONCE(*p, v); \ - } while (0) -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ - asm volatile ("fence r,rw" : : : "memory"); \ - ___p1; \ - }) -#endif - -#ifndef libbpf_smp_store_release -#define libbpf_smp_store_release(p, v) \ - do { \ - __sync_synchronize(); \ - __XSK_WRITE_ONCE(*p, v); \ - } while (0) -#endif - -#ifndef libbpf_smp_load_acquire -#define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ - __sync_synchronize(); \ - ___p1; \ - }) -#endif - -/* LIBRARY INTERNAL -- END */ - -/* Do not access these members directly. Use the functions below. */ -#define DEFINE_XSK_RING(name) \ -struct name { \ - __u32 cached_prod; \ - __u32 cached_cons; \ - __u32 mask; \ - __u32 size; \ - __u32 *producer; \ - __u32 *consumer; \ - void *ring; \ - __u32 *flags; \ -} - -DEFINE_XSK_RING(xsk_ring_prod); -DEFINE_XSK_RING(xsk_ring_cons); - -/* For a detailed explanation on the memory barriers associated with the - * ring, please take a look at net/xdp/xsk_queue.h. 
- */ - -struct xsk_umem; -struct xsk_socket; - -static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, - __u32 idx) -{ - __u64 *addrs = (__u64 *)fill->ring; - - return &addrs[idx & fill->mask]; -} - -static inline const __u64 * -xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx) -{ - const __u64 *addrs = (const __u64 *)comp->ring; - - return &addrs[idx & comp->mask]; -} - -static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, - __u32 idx) -{ - struct xdp_desc *descs = (struct xdp_desc *)tx->ring; - - return &descs[idx & tx->mask]; -} - -static inline const struct xdp_desc * -xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx) -{ - const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring; - - return &descs[idx & rx->mask]; -} - -static inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r) -{ - return *r->flags & XDP_RING_NEED_WAKEUP; -} - -static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) -{ - __u32 free_entries = r->cached_cons - r->cached_prod; - - if (free_entries >= nb) - return free_entries; - - /* Refresh the local tail pointer. - * cached_cons is r->size bigger than the real consumer pointer so - * that this addition can be avoided in the more frequently - * executed code that computs free_entries in the beginning of - * this function. Without this optimization it whould have been - * free_entries = r->cached_prod - r->cached_cons + r->size. - */ - r->cached_cons = libbpf_smp_load_acquire(r->consumer); - r->cached_cons += r->size; - - return r->cached_cons - r->cached_prod; -} - -static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) -{ - __u32 entries = r->cached_prod - r->cached_cons; - - if (entries == 0) { - r->cached_prod = libbpf_smp_load_acquire(r->producer); - entries = r->cached_prod - r->cached_cons; - } - - return (entries > nb) ? nb : entries; -} - -static inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx) -{ - if (xsk_prod_nb_free(prod, nb) < nb) - return 0; - - *idx = prod->cached_prod; - prod->cached_prod += nb; - - return nb; -} - -static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) -{ - /* Make sure everything has been written to the ring before indicating - * this to the kernel by writing the producer pointer. - */ - libbpf_smp_store_release(prod->producer, *prod->producer + nb); -} - -static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) -{ - __u32 entries = xsk_cons_nb_avail(cons, nb); - - if (entries > 0) { - *idx = cons->cached_cons; - cons->cached_cons += entries; - } - - return entries; -} - -static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb) -{ - cons->cached_cons -= nb; -} - -static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) -{ - /* Make sure data has been read before indicating we are done - * with the entries by updating the consumer pointer. 
- */ - libbpf_smp_store_release(cons->consumer, *cons->consumer + nb); - -} - -static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) -{ - return &((char *)umem_area)[addr]; -} - -static inline __u64 xsk_umem__extract_addr(__u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static inline __u64 xsk_umem__extract_offset(__u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static inline __u64 xsk_umem__add_offset_to_addr(__u64 addr) -{ - return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr); -} - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_umem__fd(const struct xsk_umem *umem); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_socket__fd(const struct xsk_socket *xsk); - -#define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048 -#define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048 -#define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */ -#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT) -#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0 -#define XSK_UMEM__DEFAULT_FLAGS 0 - -struct xsk_umem_config { - __u32 fill_size; - __u32 comp_size; - __u32 frame_size; - __u32 frame_headroom; - __u32 flags; -}; - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); - -/* Flags for the libbpf_flags field. */ -#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0) - -struct xsk_socket_config { - __u32 rx_size; - __u32 tx_size; - __u32 libbpf_flags; - __u32 xdp_flags; - __u16 bind_flags; -}; - -/* Set config to NULL to get the default configuration. */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_umem__create(struct xsk_umem **umem, - void *umem_area, __u64 size, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_umem__create_v0_0_2(struct xsk_umem **umem, - void *umem_area, __u64 size, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_umem__create_v0_0_4(struct xsk_umem **umem, - void *umem_area, __u64 size, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_socket__create(struct xsk_socket **xsk, - const char *ifname, __u32 queue_id, - struct xsk_umem *umem, - struct xsk_ring_cons *rx, - struct xsk_ring_prod *tx, - const struct xsk_socket_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, - const char *ifname, - __u32 queue_id, struct xsk_umem *umem, - struct xsk_ring_cons *rx, - struct xsk_ring_prod *tx, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_socket_config *config); - -/* Returns 0 for success and -EBUSY if the umem is still in use. 
*/ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_umem__delete(struct xsk_umem *umem); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -void xsk_socket__delete(struct xsk_socket *xsk); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* __LIBBPF_XSK_H */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4fbd88a8ed9e..e32a28fe8bc1 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -230,6 +230,8 @@ $(OUTPUT)/xdping: $(TESTING_HELPERS) $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) +$(OUTPUT)/xsk.o: $(BPFOBJ) +$(OUTPUT)/xdpxceiver: $(OUTPUT)/xsk.o BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index e5992a6b5e09..019c567b6b4e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -97,7 +97,7 @@ #include #include #include -#include +#include "xsk.h" #include "xdpxceiver.h" #include "../kselftest.h" diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c new file mode 100644 index 000000000000..eb50c3f336f8 --- /dev/null +++ b/tools/testing/selftests/bpf/xsk.c @@ -0,0 +1,1264 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * AF_XDP user-space access library. + * + * Copyright(c) 2018 - 2019 Intel Corporation. + * + * Author(s): Magnus Karlsson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "xsk.h" + +#ifndef SOL_XDP + #define SOL_XDP 283 +#endif + +#ifndef AF_XDP + #define AF_XDP 44 +#endif + +#ifndef PF_XDP + #define PF_XDP AF_XDP +#endif + +#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) + +enum xsk_prog { + XSK_PROG_FALLBACK, + XSK_PROG_REDIRECT_FLAGS, +}; + +struct xsk_umem { + struct xsk_ring_prod *fill_save; + struct xsk_ring_cons *comp_save; + char *umem_area; + struct xsk_umem_config config; + int fd; + int refcount; + struct list_head ctx_list; + bool rx_ring_setup_done; + bool tx_ring_setup_done; +}; + +struct xsk_ctx { + struct xsk_ring_prod *fill; + struct xsk_ring_cons *comp; + __u32 queue_id; + struct xsk_umem *umem; + int refcount; + int ifindex; + struct list_head list; + int prog_fd; + int link_fd; + int xsks_map_fd; + char ifname[IFNAMSIZ]; + bool has_bpf_link; +}; + +struct xsk_socket { + struct xsk_ring_cons *rx; + struct xsk_ring_prod *tx; + __u64 outstanding_tx; + struct xsk_ctx *ctx; + struct xsk_socket_config config; + int fd; +}; + +struct xsk_nl_info { + bool xdp_prog_attached; + int ifindex; + int fd; +}; + +/* Up until and including Linux 5.3 */ +struct xdp_ring_offset_v1 { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +/* Up until and including Linux 5.3 */ +struct xdp_mmap_offsets_v1 { + struct xdp_ring_offset_v1 rx; + struct xdp_ring_offset_v1 tx; + struct xdp_ring_offset_v1 fr; + struct xdp_ring_offset_v1 cr; +}; + +int xsk_umem__fd(const struct xsk_umem *umem) +{ + return umem ? umem->fd : -EINVAL; +} + +int xsk_socket__fd(const struct xsk_socket *xsk) +{ + return xsk ? 
xsk->fd : -EINVAL; +} + +static bool xsk_page_aligned(void *buffer) +{ + unsigned long addr = (unsigned long)buffer; + + return !(addr & (getpagesize() - 1)); +} + +static void xsk_set_umem_config(struct xsk_umem_config *cfg, + const struct xsk_umem_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + cfg->flags = XSK_UMEM__DEFAULT_FLAGS; + return; + } + + cfg->fill_size = usr_cfg->fill_size; + cfg->comp_size = usr_cfg->comp_size; + cfg->frame_size = usr_cfg->frame_size; + cfg->frame_headroom = usr_cfg->frame_headroom; + cfg->flags = usr_cfg->flags; +} + +static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, + const struct xsk_socket_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->libbpf_flags = 0; + cfg->xdp_flags = 0; + cfg->bind_flags = 0; + return 0; + } + + if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD) + return -EINVAL; + + cfg->rx_size = usr_cfg->rx_size; + cfg->tx_size = usr_cfg->tx_size; + cfg->libbpf_flags = usr_cfg->libbpf_flags; + cfg->xdp_flags = usr_cfg->xdp_flags; + cfg->bind_flags = usr_cfg->bind_flags; + + return 0; +} + +static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off) +{ + struct xdp_mmap_offsets_v1 off_v1; + + /* getsockopt on a kernel <= 5.3 has no flags fields. + * Copy over the offsets to the correct places in the >=5.4 format + * and put the flags where they would have been on that kernel. + */ + memcpy(&off_v1, off, sizeof(off_v1)); + + off->rx.producer = off_v1.rx.producer; + off->rx.consumer = off_v1.rx.consumer; + off->rx.desc = off_v1.rx.desc; + off->rx.flags = off_v1.rx.consumer + sizeof(__u32); + + off->tx.producer = off_v1.tx.producer; + off->tx.consumer = off_v1.tx.consumer; + off->tx.desc = off_v1.tx.desc; + off->tx.flags = off_v1.tx.consumer + sizeof(__u32); + + off->fr.producer = off_v1.fr.producer; + off->fr.consumer = off_v1.fr.consumer; + off->fr.desc = off_v1.fr.desc; + off->fr.flags = off_v1.fr.consumer + sizeof(__u32); + + off->cr.producer = off_v1.cr.producer; + off->cr.consumer = off_v1.cr.consumer; + off->cr.desc = off_v1.cr.desc; + off->cr.flags = off_v1.cr.consumer + sizeof(__u32); +} + +static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off) +{ + socklen_t optlen; + int err; + + optlen = sizeof(*off); + err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen); + if (err) + return err; + + if (optlen == sizeof(*off)) + return 0; + + if (optlen == sizeof(struct xdp_mmap_offsets_v1)) { + xsk_mmap_offsets_v1(off); + return 0; + } + + return -EINVAL; +} + +static int xsk_create_umem_rings(struct xsk_umem *umem, int fd, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp) +{ + struct xdp_mmap_offsets off; + void *map; + int err; + + err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, + &umem->config.fill_size, + sizeof(umem->config.fill_size)); + if (err) + return -errno; + + err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, + &umem->config.comp_size, + sizeof(umem->config.comp_size)); + if (err) + return -errno; + + err = xsk_get_mmap_offsets(fd, &off); + if (err) + return -errno; + + map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + XDP_UMEM_PGOFF_FILL_RING); + if (map == MAP_FAILED) + return 
-errno; + + fill->mask = umem->config.fill_size - 1; + fill->size = umem->config.fill_size; + fill->producer = map + off.fr.producer; + fill->consumer = map + off.fr.consumer; + fill->flags = map + off.fr.flags; + fill->ring = map + off.fr.desc; + fill->cached_cons = umem->config.fill_size; + + map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + XDP_UMEM_PGOFF_COMPLETION_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_mmap; + } + + comp->mask = umem->config.comp_size - 1; + comp->size = umem->config.comp_size; + comp->producer = map + off.cr.producer; + comp->consumer = map + off.cr.consumer; + comp->flags = map + off.cr.flags; + comp->ring = map + off.cr.desc; + + return 0; + +out_mmap: + munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64)); + return err; +} + +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, + __u64 size, struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *usr_config) +{ + struct xdp_umem_reg mr; + struct xsk_umem *umem; + int err; + + if (!umem_area || !umem_ptr || !fill || !comp) + return -EFAULT; + if (!size && !xsk_page_aligned(umem_area)) + return -EINVAL; + + umem = calloc(1, sizeof(*umem)); + if (!umem) + return -ENOMEM; + + umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0); + if (umem->fd < 0) { + err = -errno; + goto out_umem_alloc; + } + + umem->umem_area = umem_area; + INIT_LIST_HEAD(&umem->ctx_list); + xsk_set_umem_config(&umem->config, usr_config); + + memset(&mr, 0, sizeof(mr)); + mr.addr = (uintptr_t)umem_area; + mr.len = size; + mr.chunk_size = umem->config.frame_size; + mr.headroom = umem->config.frame_headroom; + mr.flags = umem->config.flags; + + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); + if (err) { + err = -errno; + goto out_socket; + } + + err = xsk_create_umem_rings(umem, umem->fd, fill, comp); + if (err) + goto out_socket; + + umem->fill_save = fill; + umem->comp_save = comp; + *umem_ptr = umem; + return 0; + +out_socket: + close(umem->fd); +out_umem_alloc: + free(umem); + return err; +} + +struct xsk_umem_config_v1 { + __u32 fill_size; + __u32 comp_size; + __u32 frame_size; + __u32 frame_headroom; +}; + +static enum xsk_prog get_xsk_prog(void) +{ + enum xsk_prog detected = XSK_PROG_FALLBACK; + char data_in = 0, data_out; + struct bpf_insn insns[] = { + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, XDP_PASS), + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data_in, + .data_size_in = 1, + .data_out = &data_out, + ); + + int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns); + + map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL); + if (map_fd < 0) + return detected; + + insns[0].imm = map_fd; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL); + if (prog_fd < 0) { + close(map_fd); + return detected; + } + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + if (!ret && opts.retval == XDP_PASS) + detected = XSK_PROG_REDIRECT_FLAGS; + close(prog_fd); + close(map_fd); + return detected; +} + +static int xsk_load_xdp_prog(struct xsk_socket *xsk) +{ + static const int log_buf_size = 16 * 1024; + struct xsk_ctx *ctx = xsk->ctx; + char log_buf[log_buf_size]; + int prog_fd; + + /* This is the fallback C-program: + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) + * { + * int ret, index = 
ctx->rx_queue_index; + * + * // A set entry here means that the correspnding queue_id + * // has an active AF_XDP socket bound to it. + * ret = bpf_redirect_map(&xsks_map, index, XDP_PASS); + * if (ret > 0) + * return ret; + * + * // Fallback for pre-5.3 kernels, not supporting default + * // action in the flags parameter. + * if (bpf_map_lookup_elem(&xsks_map, &index)) + * return bpf_redirect_map(&xsks_map, index, 0); + * return XDP_PASS; + * } + */ + struct bpf_insn prog[] = { + /* r2 = *(u32 *)(r1 + 16) */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16), + /* *(u32 *)(r10 - 4) = r2 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4), + /* r1 = xskmap[] */ + BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), + /* r3 = XDP_PASS */ + BPF_MOV64_IMM(BPF_REG_3, 2), + /* call bpf_redirect_map */ + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + /* if w0 != 0 goto pc+13 */ + BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13), + /* r2 = r10 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + /* r2 += -4 */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + /* r1 = xskmap[] */ + BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), + /* call bpf_map_lookup_elem */ + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + /* r1 = r0 */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* r0 = XDP_PASS */ + BPF_MOV64_IMM(BPF_REG_0, 2), + /* if r1 == 0 goto pc+5 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* r2 = *(u32 *)(r10 - 4) */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4), + /* r1 = xskmap[] */ + BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), + /* r3 = 0 */ + BPF_MOV64_IMM(BPF_REG_3, 0), + /* call bpf_redirect_map */ + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + /* The jumps are to this instruction */ + BPF_EXIT_INSN(), + }; + + /* This is the post-5.3 kernel C-program: + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) + * { + * return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS); + * } + */ + struct bpf_insn prog_redirect_flags[] = { + /* r2 = *(u32 *)(r1 + 16) */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16), + /* r1 = xskmap[] */ + BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd), + /* r3 = XDP_PASS */ + BPF_MOV64_IMM(BPF_REG_3, 2), + /* call bpf_redirect_map */ + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + BPF_EXIT_INSN(), + }; + size_t insns_cnt[] = {ARRAY_SIZE(prog), + ARRAY_SIZE(prog_redirect_flags), + }; + struct bpf_insn *progs[] = {prog, prog_redirect_flags}; + enum xsk_prog option = get_xsk_prog(); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = log_buf_size, + ); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause", + progs[option], insns_cnt[option], &opts); + if (prog_fd < 0) { + pr_warn("BPF log buffer:\n%s", log_buf); + return prog_fd; + } + + ctx->prog_fd = prog_fd; + return 0; +} + +static int xsk_create_bpf_link(struct xsk_socket *xsk) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int link_fd; + int err; + + err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id); + if (err) { + pr_warn("getting XDP prog id failed\n"); + return err; + } + + /* if there's a netlink-based XDP prog loaded on interface, bail out + * and ask user to do the removal by himself + */ + if (prog_id) { + pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n"); + return -EINVAL; + } + + opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE); + + link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts); + if (link_fd < 0) { + pr_warn("bpf_link_create 
failed: %s\n", strerror(errno)); + return link_fd; + } + + ctx->link_fd = link_fd; + return 0; +} + +/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst + * is a zero-terminated string no matter what (unless sz == 0, in which case + * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs + * in what is returned. Given this is an internal helper, it's trivial to + * extend it when necessary. Use this instead of strncpy inside libbpf source code. + */ +static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz) +{ + size_t i; + + if (sz == 0) + return; + + sz--; + for (i = 0; i < sz && src[i]; i++) + dst[i] = src[i]; + dst[i] = '\0'; +} + +static int xsk_get_max_queues(struct xsk_socket *xsk) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + struct xsk_ctx *ctx = xsk->ctx; + struct ifreq ifr = {}; + int fd, err, ret; + + fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0); + if (fd < 0) + return -errno; + + ifr.ifr_data = (void *)&channels; + libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ); + err = ioctl(fd, SIOCETHTOOL, &ifr); + if (err && errno != EOPNOTSUPP) { + ret = -errno; + goto out; + } + + if (err) { + /* If the device says it has no channels, then all traffic + * is sent to a single stream, so max queues = 1. + */ + ret = 1; + } else { + /* Take the max of rx, tx, combined. Drivers return + * the number of channels in different ways. + */ + ret = max(channels.max_rx, channels.max_tx); + ret = max(ret, (int)channels.max_combined); + } + +out: + close(fd); + return ret; +} + +static int xsk_create_bpf_maps(struct xsk_socket *xsk) +{ + struct xsk_ctx *ctx = xsk->ctx; + int max_queues; + int fd; + + max_queues = xsk_get_max_queues(xsk); + if (max_queues < 0) + return max_queues; + + fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map", + sizeof(int), sizeof(int), max_queues, NULL); + if (fd < 0) + return fd; + + ctx->xsks_map_fd = fd; + + return 0; +} + +static void xsk_delete_bpf_maps(struct xsk_socket *xsk) +{ + struct xsk_ctx *ctx = xsk->ctx; + + bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id); + close(ctx->xsks_map_fd); +} + +static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) +{ + __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info); + __u32 map_len = sizeof(struct bpf_map_info); + struct bpf_prog_info prog_info = {}; + struct xsk_ctx *ctx = xsk->ctx; + struct bpf_map_info map_info; + int fd, err; + + err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len); + if (err) + return err; + + num_maps = prog_info.nr_map_ids; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); + if (!map_ids) + return -ENOMEM; + + memset(&prog_info, 0, prog_len); + prog_info.nr_map_ids = num_maps; + prog_info.map_ids = (__u64)(unsigned long)map_ids; + + err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len); + if (err) + goto out_map_ids; + + ctx->xsks_map_fd = -1; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + fd = bpf_map_get_fd_by_id(map_ids[i]); + if (fd < 0) + continue; + + memset(&map_info, 0, map_len); + err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); + if (err) { + close(fd); + continue; + } + + if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) { + ctx->xsks_map_fd = fd; + break; + } + + close(fd); + } + + if (ctx->xsks_map_fd == -1) + err = -ENOENT; + +out_map_ids: + free(map_ids); + return err; +} + +static int xsk_set_bpf_maps(struct xsk_socket *xsk) +{ + struct xsk_ctx *ctx = xsk->ctx; + + return
bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, + &xsk->fd, 0); +} + +static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd) +{ + struct bpf_link_info link_info; + __u32 link_len; + __u32 id = 0; + int err; + int fd; + + while (true) { + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) { + err = 0; + break; + } + pr_warn("can't get next link: %s\n", strerror(errno)); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + pr_warn("can't get link by id (%u): %s\n", id, strerror(errno)); + err = -errno; + break; + } + + link_len = sizeof(struct bpf_link_info); + memset(&link_info, 0, link_len); + err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len); + if (err) { + pr_warn("can't get link info: %s\n", strerror(errno)); + close(fd); + break; + } + if (link_info.type == BPF_LINK_TYPE_XDP) { + if (link_info.xdp.ifindex == ifindex) { + *link_fd = fd; + if (prog_id) + *prog_id = link_info.prog_id; + break; + } + } + close(fd); + } + + return err; +} + +static bool xsk_probe_bpf_link(void) +{ + LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE); + struct bpf_insn insns[2] = { + BPF_MOV64_IMM(BPF_REG_0, XDP_PASS), + BPF_EXIT_INSN() + }; + int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns); + int ifindex_lo = 1; + bool ret = false; + int err; + + err = xsk_link_lookup(ifindex_lo, NULL, &link_fd); + if (err) + return ret; + + if (link_fd >= 0) + return true; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL); + if (prog_fd < 0) + return ret; + + link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts); + close(prog_fd); + + if (link_fd >= 0) { + ret = true; + close(link_fd); + } + + return ret; +} + +static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) +{ + char ifname[IFNAMSIZ]; + struct xsk_ctx *ctx; + char *interface; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + interface = if_indextoname(ifindex, &ifname[0]); + if (!interface) { + free(ctx); + return -errno; + } + + ctx->ifindex = ifindex; + libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ); + + xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); + + return 0; +} + +static int xsk_init_xdp_res(struct xsk_socket *xsk, + int *xsks_map_fd) +{ + struct xsk_ctx *ctx = xsk->ctx; + int err; + + err = xsk_create_bpf_maps(xsk); + if (err) + return err; + + err = xsk_load_xdp_prog(xsk); + if (err) + goto err_load_xdp_prog; + + if (ctx->has_bpf_link) + err = xsk_create_bpf_link(xsk); + else + err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd, + xsk->config.xdp_flags, NULL); + + if (err) + goto err_attach_xdp_prog; + + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + goto err_set_bpf_maps; + + return err; + +err_set_bpf_maps: + if (ctx->has_bpf_link) + close(ctx->link_fd); + else + bpf_xdp_detach(ctx->ifindex, 0, NULL); +err_attach_xdp_prog: + close(ctx->prog_fd); +err_load_xdp_prog: + xsk_delete_bpf_maps(xsk); + return err; +} + +static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id) +{ + struct xsk_ctx *ctx = xsk->ctx; + int err; + + ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (ctx->prog_fd < 0) { + err = -errno; + goto err_prog_fd; + } + err = xsk_lookup_bpf_maps(xsk); + if (err) + goto err_lookup_maps; + + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + goto err_set_maps; + + return err; + +err_set_maps: + close(ctx->xsks_map_fd); +err_lookup_maps: + 
close(ctx->prog_fd); +err_prog_fd: + if (ctx->has_bpf_link) + close(ctx->link_fd); + return err; +} + +static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) +{ + struct xsk_socket *xsk = _xdp; + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int err; + + if (ctx->has_bpf_link) + err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd); + else + err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id); + + if (err) + return err; + + err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) : + xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id); + + if (!err && xsks_map_fd) + *xsks_map_fd = ctx->xsks_map_fd; + + return err; +} + +static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, + __u32 queue_id) +{ + struct xsk_ctx *ctx; + + if (list_empty(&umem->ctx_list)) + return NULL; + + list_for_each_entry(ctx, &umem->ctx_list, list) { + if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) { + ctx->refcount++; + return ctx; + } + } + + return NULL; +} + +static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap) +{ + struct xsk_umem *umem = ctx->umem; + struct xdp_mmap_offsets off; + int err; + + if (--ctx->refcount) + return; + + if (!unmap) + goto out_free; + + err = xsk_get_mmap_offsets(umem->fd, &off); + if (err) + goto out_free; + + munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size * + sizeof(__u64)); + munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size * + sizeof(__u64)); + +out_free: + list_del(&ctx->list); + free(ctx); +} + +static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, + struct xsk_umem *umem, int ifindex, + const char *ifname, __u32 queue_id, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp) +{ + struct xsk_ctx *ctx; + int err; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return NULL; + + if (!umem->fill_save) { + err = xsk_create_umem_rings(umem, xsk->fd, fill, comp); + if (err) { + free(ctx); + return NULL; + } + } else if (umem->fill_save != fill || umem->comp_save != comp) { + /* Copy over rings to new structs. 
*/ + memcpy(fill, umem->fill_save, sizeof(*fill)); + memcpy(comp, umem->comp_save, sizeof(*comp)); + } + + ctx->ifindex = ifindex; + ctx->refcount = 1; + ctx->umem = umem; + ctx->queue_id = queue_id; + libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ); + + ctx->fill = fill; + ctx->comp = comp; + list_add(&ctx->list, &umem->ctx_list); + return ctx; +} + +static void xsk_destroy_xsk_struct(struct xsk_socket *xsk) +{ + free(xsk->ctx); + free(xsk); +} + +int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd) +{ + xsk->ctx->xsks_map_fd = fd; + return xsk_set_bpf_maps(xsk); +} + +int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd) +{ + struct xsk_socket *xsk; + int res; + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + return -ENOMEM; + + res = xsk_create_xsk_struct(ifindex, xsk); + if (res) { + free(xsk); + return -EINVAL; + } + + res = __xsk_setup_xdp_prog(xsk, xsks_map_fd); + + xsk_destroy_xsk_struct(xsk); + + return res; +} + +int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, + const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_socket_config *usr_config) +{ + bool unmap, rx_setup_done = false, tx_setup_done = false; + void *rx_map = NULL, *tx_map = NULL; + struct sockaddr_xdp sxdp = {}; + struct xdp_mmap_offsets off; + struct xsk_socket *xsk; + struct xsk_ctx *ctx; + int err, ifindex; + + if (!umem || !xsk_ptr || !(rx || tx)) + return -EFAULT; + + unmap = umem->fill_save != fill; + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + return -ENOMEM; + + err = xsk_set_xdp_socket_config(&xsk->config, usr_config); + if (err) + goto out_xsk_alloc; + + xsk->outstanding_tx = 0; + ifindex = if_nametoindex(ifname); + if (!ifindex) { + err = -errno; + goto out_xsk_alloc; + } + + if (umem->refcount++ > 0) { + xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0); + if (xsk->fd < 0) { + err = -errno; + goto out_xsk_alloc; + } + } else { + xsk->fd = umem->fd; + rx_setup_done = umem->rx_ring_setup_done; + tx_setup_done = umem->tx_ring_setup_done; + } + + ctx = xsk_get_ctx(umem, ifindex, queue_id); + if (!ctx) { + if (!fill || !comp) { + err = -EFAULT; + goto out_socket; + } + + ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id, + fill, comp); + if (!ctx) { + err = -ENOMEM; + goto out_socket; + } + } + xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); + + if (rx && !rx_setup_done) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, + &xsk->config.rx_size, + sizeof(xsk->config.rx_size)); + if (err) { + err = -errno; + goto out_put_ctx; + } + if (xsk->fd == umem->fd) + umem->rx_ring_setup_done = true; + } + if (tx && !tx_setup_done) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, + &xsk->config.tx_size, + sizeof(xsk->config.tx_size)); + if (err) { + err = -errno; + goto out_put_ctx; + } + if (xsk->fd == umem->fd) + umem->tx_ring_setup_done = true; + } + + err = xsk_get_mmap_offsets(xsk->fd, &off); + if (err) { + err = -errno; + goto out_put_ctx; + } + + if (rx) { + rx_map = mmap(NULL, off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_RX_RING); + if (rx_map == MAP_FAILED) { + err = -errno; + goto out_put_ctx; + } + + rx->mask = xsk->config.rx_size - 1; + rx->size = xsk->config.rx_size; + rx->producer = rx_map + off.rx.producer; + rx->consumer = rx_map + off.rx.consumer; + rx->flags = rx_map + off.rx.flags; + rx->ring = rx_map + off.rx.desc; + 
rx->cached_prod = *rx->producer; + rx->cached_cons = *rx->consumer; + } + xsk->rx = rx; + + if (tx) { + tx_map = mmap(NULL, off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_TX_RING); + if (tx_map == MAP_FAILED) { + err = -errno; + goto out_mmap_rx; + } + + tx->mask = xsk->config.tx_size - 1; + tx->size = xsk->config.tx_size; + tx->producer = tx_map + off.tx.producer; + tx->consumer = tx_map + off.tx.consumer; + tx->flags = tx_map + off.tx.flags; + tx->ring = tx_map + off.tx.desc; + tx->cached_prod = *tx->producer; + /* cached_cons is r->size bigger than the real consumer pointer + * See xsk_prod_nb_free + */ + tx->cached_cons = *tx->consumer + xsk->config.tx_size; + } + xsk->tx = tx; + + sxdp.sxdp_family = PF_XDP; + sxdp.sxdp_ifindex = ctx->ifindex; + sxdp.sxdp_queue_id = ctx->queue_id; + if (umem->refcount > 1) { + sxdp.sxdp_flags |= XDP_SHARED_UMEM; + sxdp.sxdp_shared_umem_fd = umem->fd; + } else { + sxdp.sxdp_flags = xsk->config.bind_flags; + } + + err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp)); + if (err) { + err = -errno; + goto out_mmap_tx; + } + + ctx->prog_fd = -1; + + if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { + err = __xsk_setup_xdp_prog(xsk, NULL); + if (err) + goto out_mmap_tx; + } + + *xsk_ptr = xsk; + umem->fill_save = NULL; + umem->comp_save = NULL; + return 0; + +out_mmap_tx: + if (tx) + munmap(tx_map, off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc)); +out_mmap_rx: + if (rx) + munmap(rx_map, off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc)); +out_put_ctx: + xsk_put_ctx(ctx, unmap); +out_socket: + if (--umem->refcount) + close(xsk->fd); +out_xsk_alloc: + free(xsk); + return err; +} + +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, + const struct xsk_socket_config *usr_config) +{ + if (!umem) + return -EFAULT; + + return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem, + rx, tx, umem->fill_save, + umem->comp_save, usr_config); +} + +int xsk_umem__delete(struct xsk_umem *umem) +{ + struct xdp_mmap_offsets off; + int err; + + if (!umem) + return 0; + + if (umem->refcount) + return -EBUSY; + + err = xsk_get_mmap_offsets(umem->fd, &off); + if (!err && umem->fill_save && umem->comp_save) { + munmap(umem->fill_save->ring - off.fr.desc, + off.fr.desc + umem->config.fill_size * sizeof(__u64)); + munmap(umem->comp_save->ring - off.cr.desc, + off.cr.desc + umem->config.comp_size * sizeof(__u64)); + } + + close(umem->fd); + free(umem); + + return 0; +} + +void xsk_socket__delete(struct xsk_socket *xsk) +{ + size_t desc_sz = sizeof(struct xdp_desc); + struct xdp_mmap_offsets off; + struct xsk_umem *umem; + struct xsk_ctx *ctx; + int err; + + if (!xsk) + return; + + ctx = xsk->ctx; + umem = ctx->umem; + if (ctx->prog_fd != -1) { + xsk_delete_bpf_maps(xsk); + close(ctx->prog_fd); + if (ctx->has_bpf_link) + close(ctx->link_fd); + } + + err = xsk_get_mmap_offsets(xsk->fd, &off); + if (!err) { + if (xsk->rx) { + munmap(xsk->rx->ring - off.rx.desc, + off.rx.desc + xsk->config.rx_size * desc_sz); + } + if (xsk->tx) { + munmap(xsk->tx->ring - off.tx.desc, + off.tx.desc + xsk->config.tx_size * desc_sz); + } + } + + xsk_put_ctx(ctx, true); + + umem->refcount--; + /* Do not close an fd that also has an associated umem connected + * to it. 
+ */ + if (xsk->fd != umem->fd) + close(xsk->fd); + free(xsk); +} diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h new file mode 100644 index 000000000000..915e7135337c --- /dev/null +++ b/tools/testing/selftests/bpf/xsk.h @@ -0,0 +1,315 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * AF_XDP user-space access library. + * + * Copyright (c) 2018 - 2019 Intel Corporation. + * Copyright (c) 2019 Facebook + * + * Author(s): Magnus Karlsson + */ + +#ifndef __XSK_H +#define __XSK_H + +#include +#include +#include +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* This whole API has been deprecated and moved to libxdp that can be found at + * https://github.com/xdp-project/xdp-tools. The APIs are exactly the same so + * you should just link with libxdp instead of libbpf for this set of + * functionality. If not, please submit a bug report on the aforementioned page. + */ + +/* Load-Acquire Store-Release barriers used by the XDP socket + * library. The following macros should *NOT* be considered part of + * the xsk.h API, and are subject to change at any time. + * + * LIBRARY INTERNAL + */ + +#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + +#if defined(__i386__) || defined(__x86_64__) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile("" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile("" : : : "memory"); \ + ___p1; \ + }) +#elif defined(__aarch64__) +# define libbpf_smp_store_release(p, v) \ + asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1; \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + ___p1; \ + }) +#elif defined(__riscv) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile ("fence rw,w" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile ("fence r,rw" : : : "memory"); \ + ___p1; \ + }) +#endif + +#ifndef libbpf_smp_store_release +#define libbpf_smp_store_release(p, v) \ + do { \ + __sync_synchronize(); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +#endif + +#ifndef libbpf_smp_load_acquire +#define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + __sync_synchronize(); \ + ___p1; \ + }) +#endif + +/* LIBRARY INTERNAL -- END */ + +/* Do not access these members directly. Use the functions below. */ +#define DEFINE_XSK_RING(name) \ +struct name { \ + __u32 cached_prod; \ + __u32 cached_cons; \ + __u32 mask; \ + __u32 size; \ + __u32 *producer; \ + __u32 *consumer; \ + void *ring; \ + __u32 *flags; \ +} + +DEFINE_XSK_RING(xsk_ring_prod); +DEFINE_XSK_RING(xsk_ring_cons); + +/* For a detailed explanation on the memory barriers associated with the + * ring, please take a look at net/xdp/xsk_queue.h.
*/ + +struct xsk_umem; +struct xsk_socket; + +static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, + __u32 idx) +{ + __u64 *addrs = (__u64 *)fill->ring; + + return &addrs[idx & fill->mask]; +} + +static inline const __u64 * +xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx) +{ + const __u64 *addrs = (const __u64 *)comp->ring; + + return &addrs[idx & comp->mask]; +} + +static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, + __u32 idx) +{ + struct xdp_desc *descs = (struct xdp_desc *)tx->ring; + + return &descs[idx & tx->mask]; +} + +static inline const struct xdp_desc * +xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx) +{ + const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring; + + return &descs[idx & rx->mask]; +} + +static inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r) +{ + return *r->flags & XDP_RING_NEED_WAKEUP; +} + +static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) +{ + __u32 free_entries = r->cached_cons - r->cached_prod; + + if (free_entries >= nb) + return free_entries; + + /* Refresh the local tail pointer. + * cached_cons is r->size bigger than the real consumer pointer so + * that this addition can be avoided in the more frequently + * executed code that computes free_entries in the beginning of + * this function. Without this optimization it would have been + * free_entries = r->cached_cons - r->cached_prod + r->size. + */ + r->cached_cons = libbpf_smp_load_acquire(r->consumer); + r->cached_cons += r->size; + + return r->cached_cons - r->cached_prod; +} + +static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) +{ + __u32 entries = r->cached_prod - r->cached_cons; + + if (entries == 0) { + r->cached_prod = libbpf_smp_load_acquire(r->producer); + entries = r->cached_prod - r->cached_cons; + } + + return (entries > nb) ? nb : entries; +} + +static inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx) +{ + if (xsk_prod_nb_free(prod, nb) < nb) + return 0; + + *idx = prod->cached_prod; + prod->cached_prod += nb; + + return nb; +} + +static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) +{ + /* Make sure everything has been written to the ring before indicating + * this to the kernel by writing the producer pointer. + */ + libbpf_smp_store_release(prod->producer, *prod->producer + nb); +} + +static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) +{ + __u32 entries = xsk_cons_nb_avail(cons, nb); + + if (entries > 0) { + *idx = cons->cached_cons; + cons->cached_cons += entries; + } + + return entries; +} + +static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb) +{ + cons->cached_cons -= nb; +} + +static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) +{ + /* Make sure data has been read before indicating we are done + * with the entries by updating the consumer pointer.
+ */ + libbpf_smp_store_release(cons->consumer, *cons->consumer + nb); + +} + +static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) +{ + return &((char *)umem_area)[addr]; +} + +static inline __u64 xsk_umem__extract_addr(__u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline __u64 xsk_umem__extract_offset(__u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline __u64 xsk_umem__add_offset_to_addr(__u64 addr) +{ + return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr); +} + +int xsk_umem__fd(const struct xsk_umem *umem); +int xsk_socket__fd(const struct xsk_socket *xsk); + +#define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048 +#define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048 +#define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */ +#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT) +#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0 +#define XSK_UMEM__DEFAULT_FLAGS 0 + +struct xsk_umem_config { + __u32 fill_size; + __u32 comp_size; + __u32 frame_size; + __u32 frame_headroom; + __u32 flags; +}; + +int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); +int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); + +/* Flags for the libbpf_flags field. */ +#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0) + +struct xsk_socket_config { + __u32 rx_size; + __u32 tx_size; + __u32 libbpf_flags; + __u32 xdp_flags; + __u16 bind_flags; +}; + +/* Set config to NULL to get the default configuration. */ +int xsk_umem__create(struct xsk_umem **umem, + void *umem_area, __u64 size, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *config); +int xsk_socket__create(struct xsk_socket **xsk, + const char *ifname, __u32 queue_id, + struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + const struct xsk_socket_config *config); +int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, + const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_socket_config *config); + +/* Returns 0 for success and -EBUSY if the umem is still in use. */ +int xsk_umem__delete(struct xsk_umem *umem); +void xsk_socket__delete(struct xsk_socket *xsk); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __XSK_H */ -- cgit v1.2.3 From 765a34130ea506fafefe179bdfa47da296016d92 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:14 -0700 Subject: libbpf: remove deprecated low-level APIs Drop low-level APIs as well as high-level (and very confusingly named) BPF object loading bpf_prog_load_xattr() and bpf_prog_load_deprecated() APIs. 
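For callers migrating off bpf_prog_load_xattr()/bpf_prog_load_deprecated(), the replacement is the generic object API. A minimal sketch of the equivalent flow, assuming a placeholder object file "prog.bpf.o" that contains a program "my_prog" (both names illustrative, error handling abbreviated):

    struct bpf_object *obj;
    struct bpf_program *prog;
    int err, prog_fd;

    /* open and load the whole object instead of the one-shot helper */
    obj = bpf_object__open_file("prog.bpf.o", NULL);
    err = libbpf_get_error(obj);
    if (err)
        return err;
    err = bpf_object__load(obj);
    if (err) {
        bpf_object__close(obj);
        return err;
    }
    /* pick the program by name and grab its fd, much like the old
     * helper did for the first program in the object
     */
    prog = bpf_object__find_program_by_name(obj, "my_prog");
    prog_fd = bpf_program__fd(prog);
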
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 174 ++--------------------------------------------- tools/lib/bpf/bpf.h | 83 ---------------------- tools/lib/bpf/libbpf.c | 89 ------------------------ tools/lib/bpf/libbpf.h | 16 ----- tools/lib/bpf/libbpf.map | 16 ----- 5 files changed, 4 insertions(+), 374 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 240186aac8e6..2f3495d80d69 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -233,11 +233,10 @@ alloc_zero_tailing_info(const void *orecord, __u32 cnt, return info; } -DEFAULT_VERSION(bpf_prog_load_v0_6_0, bpf_prog_load, LIBBPF_0.6.0) -int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, - const char *prog_name, const char *license, - const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts) +int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + const struct bpf_prog_load_opts *opts) { void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; @@ -384,94 +383,6 @@ done: return libbpf_err_errno(fd); } -__attribute__((alias("bpf_load_program_xattr2"))) -int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); - -static int bpf_load_program_xattr2(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz) -{ - LIBBPF_OPTS(bpf_prog_load_opts, p); - - if (!load_attr || !log_buf != !log_buf_sz) - return libbpf_err(-EINVAL); - - p.expected_attach_type = load_attr->expected_attach_type; - switch (load_attr->prog_type) { - case BPF_PROG_TYPE_STRUCT_OPS: - case BPF_PROG_TYPE_LSM: - p.attach_btf_id = load_attr->attach_btf_id; - break; - case BPF_PROG_TYPE_TRACING: - case BPF_PROG_TYPE_EXT: - p.attach_btf_id = load_attr->attach_btf_id; - p.attach_prog_fd = load_attr->attach_prog_fd; - break; - default: - p.prog_ifindex = load_attr->prog_ifindex; - p.kern_version = load_attr->kern_version; - } - p.log_level = load_attr->log_level; - p.log_buf = log_buf; - p.log_size = log_buf_sz; - p.prog_btf_fd = load_attr->prog_btf_fd; - p.func_info_rec_size = load_attr->func_info_rec_size; - p.func_info_cnt = load_attr->func_info_cnt; - p.func_info = load_attr->func_info; - p.line_info_rec_size = load_attr->line_info_rec_size; - p.line_info_cnt = load_attr->line_info_cnt; - p.line_info = load_attr->line_info; - p.prog_flags = load_attr->prog_flags; - - return bpf_prog_load(load_attr->prog_type, load_attr->name, load_attr->license, - load_attr->insns, load_attr->insns_cnt, &p); -} - -int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, - size_t log_buf_sz) -{ - struct bpf_load_program_attr load_attr; - - memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); - load_attr.prog_type = type; - load_attr.expected_attach_type = 0; - load_attr.name = NULL; - load_attr.insns = insns; - load_attr.insns_cnt = insns_cnt; - load_attr.license = license; - load_attr.kern_version = kern_version; - - return bpf_load_program_xattr2(&load_attr, log_buf, log_buf_sz); -} - -int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, __u32 prog_flags, const char *license, - __u32 kern_version, char *log_buf, size_t log_buf_sz, - int log_level) -{ - union bpf_attr 
attr; - int fd; - - bump_rlimit_memlock(); - - memset(&attr, 0, sizeof(attr)); - attr.prog_type = type; - attr.insn_cnt = (__u32)insns_cnt; - attr.insns = ptr_to_u64(insns); - attr.license = ptr_to_u64(license); - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_sz; - attr.log_level = log_level; - log_buf[0] = 0; - attr.kern_version = kern_version; - attr.prog_flags = prog_flags; - - fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS); - return libbpf_err_errno(fd); -} - int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags) { @@ -910,62 +821,6 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, return libbpf_err_errno(ret); } -int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, - void *data_out, __u32 *size_out, __u32 *retval, - __u32 *duration) -{ - union bpf_attr attr; - int ret; - - memset(&attr, 0, sizeof(attr)); - attr.test.prog_fd = prog_fd; - attr.test.data_in = ptr_to_u64(data); - attr.test.data_out = ptr_to_u64(data_out); - attr.test.data_size_in = size; - attr.test.repeat = repeat; - - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); - - if (size_out) - *size_out = attr.test.data_size_out; - if (retval) - *retval = attr.test.retval; - if (duration) - *duration = attr.test.duration; - - return libbpf_err_errno(ret); -} - -int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr) -{ - union bpf_attr attr; - int ret; - - if (!test_attr->data_out && test_attr->data_size_out > 0) - return libbpf_err(-EINVAL); - - memset(&attr, 0, sizeof(attr)); - attr.test.prog_fd = test_attr->prog_fd; - attr.test.data_in = ptr_to_u64(test_attr->data_in); - attr.test.data_out = ptr_to_u64(test_attr->data_out); - attr.test.data_size_in = test_attr->data_size_in; - attr.test.data_size_out = test_attr->data_size_out; - attr.test.ctx_in = ptr_to_u64(test_attr->ctx_in); - attr.test.ctx_out = ptr_to_u64(test_attr->ctx_out); - attr.test.ctx_size_in = test_attr->ctx_size_in; - attr.test.ctx_size_out = test_attr->ctx_size_out; - attr.test.repeat = test_attr->repeat; - - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); - - test_attr->data_size_out = attr.test.data_size_out; - test_attr->ctx_size_out = attr.test.ctx_size_out; - test_attr->retval = attr.test.retval; - test_attr->duration = attr.test.duration; - - return libbpf_err_errno(ret); -} - int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) { union bpf_attr attr; @@ -1162,27 +1017,6 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_loa return libbpf_err_errno(fd); } -int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log) -{ - LIBBPF_OPTS(bpf_btf_load_opts, opts); - int fd; - -retry: - if (do_log && log_buf && log_buf_size) { - opts.log_buf = log_buf; - opts.log_size = log_buf_size; - opts.log_level = 1; - } - - fd = bpf_btf_load(btf, btf_size, &opts); - if (fd < 0 && !do_log && log_buf && log_buf_size) { - do_log = true; - goto retry; - } - - return libbpf_err_errno(fd); -} - int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index cabc03703e29..af7baf5da74d 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -103,54 +103,6 @@ LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name, const char *license, const struct bpf_insn *insns, size_t insn_cnt, 
const struct bpf_prog_load_opts *opts); -/* this "specialization" should go away in libbpf 1.0 */ -LIBBPF_API int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, - const char *prog_name, const char *license, - const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts); - -/* This is an elaborate way to not conflict with deprecated bpf_prog_load() - * API, defined in libbpf.h. Once we hit libbpf 1.0, all this will be gone. - * With this approach, if someone is calling bpf_prog_load() with - * 4 arguments, they will use the deprecated API, which keeps backwards - * compatibility (both source code and binary). If bpf_prog_load() is called - * with 6 arguments, though, it gets redirected to __bpf_prog_load. - * So looking forward to libbpf 1.0 when this hack will be gone and - * __bpf_prog_load() will be called just bpf_prog_load(). - */ -#ifndef bpf_prog_load -#define bpf_prog_load(...) ___libbpf_overload(___bpf_prog_load, __VA_ARGS__) -#define ___bpf_prog_load4(file, type, pobj, prog_fd) \ - bpf_prog_load_deprecated(file, type, pobj, prog_fd) -#define ___bpf_prog_load6(prog_type, prog_name, license, insns, insn_cnt, opts) \ - bpf_prog_load(prog_type, prog_name, license, insns, insn_cnt, opts) -#endif /* bpf_prog_load */ - -struct bpf_load_program_attr { - enum bpf_prog_type prog_type; - enum bpf_attach_type expected_attach_type; - const char *name; - const struct bpf_insn *insns; - size_t insns_cnt; - const char *license; - union { - __u32 kern_version; - __u32 attach_prog_fd; - }; - union { - __u32 prog_ifindex; - __u32 attach_btf_id; - }; - __u32 prog_btf_fd; - __u32 func_info_rec_size; - const void *func_info; - __u32 func_info_cnt; - __u32 line_info_rec_size; - const void *line_info; - __u32 line_info_cnt; - __u32 log_level; - __u32 prog_flags; -}; /* Flags to direct loading requirements */ #define MAPS_RELAX_COMPAT 0x01 @@ -158,22 +110,6 @@ struct bpf_load_program_attr { /* Recommended log buffer size */ #define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_load_program(enum bpf_prog_type type, - const struct bpf_insn *insns, size_t insns_cnt, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_verify_program(enum bpf_prog_type type, - const struct bpf_insn *insns, - size_t insns_cnt, __u32 prog_flags, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz, - int log_level); - struct bpf_btf_load_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -187,10 +123,6 @@ struct bpf_btf_load_opts { LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_load_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_btf_load() instead") -LIBBPF_API int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, - __u32 log_buf_size, bool do_log); - LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); @@ -353,10 +285,6 @@ LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd, enum bpf_attach_type type, const struct bpf_prog_attach_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 
8, "use bpf_prog_attach_opts() instead") -LIBBPF_API int bpf_prog_attach_xattr(int prog_fd, int attachable_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts); LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); @@ -422,17 +350,6 @@ struct bpf_prog_test_run_attr { * out: length of cxt_out */ }; -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead") -LIBBPF_API int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr); - -/* - * bpf_prog_test_run does not check that data_out is large enough. Consider - * using bpf_prog_test_run_opts instead. - */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead") -LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data, - __u32 size, void *data_out, __u32 *size_out, - __u32 *retval, __u32 *duration); LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 335467ece75f..b11d3e689126 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10205,95 +10205,6 @@ long libbpf_get_error(const void *ptr) return -errno; } -__attribute__((alias("bpf_prog_load_xattr2"))) -int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); - -static int bpf_prog_load_xattr2(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd) -{ - struct bpf_object_open_attr open_attr = {}; - struct bpf_program *prog, *first_prog = NULL; - struct bpf_object *obj; - struct bpf_map *map; - int err; - - if (!attr) - return libbpf_err(-EINVAL); - if (!attr->file) - return libbpf_err(-EINVAL); - - open_attr.file = attr->file; - open_attr.prog_type = attr->prog_type; - - obj = __bpf_object__open_xattr(&open_attr, 0); - err = libbpf_get_error(obj); - if (err) - return libbpf_err(-ENOENT); - - bpf_object__for_each_program(prog, obj) { - enum bpf_attach_type attach_type = attr->expected_attach_type; - /* - * to preserve backwards compatibility, bpf_prog_load treats - * attr->prog_type, if specified, as an override to whatever - * bpf_object__open guessed - */ - if (attr->prog_type != BPF_PROG_TYPE_UNSPEC) { - prog->type = attr->prog_type; - prog->expected_attach_type = attach_type; - } - if (bpf_program__type(prog) == BPF_PROG_TYPE_UNSPEC) { - /* - * we haven't guessed from section name and user - * didn't provide a fallback type, too bad... 
- */ - bpf_object__close(obj); - return libbpf_err(-EINVAL); - } - - prog->prog_ifindex = attr->ifindex; - prog->log_level = attr->log_level; - prog->prog_flags |= attr->prog_flags; - if (!first_prog) - first_prog = prog; - } - - bpf_object__for_each_map(map, obj) { - if (map->def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) - map->map_ifindex = attr->ifindex; - } - - if (!first_prog) { - pr_warn("object file doesn't contain bpf program\n"); - bpf_object__close(obj); - return libbpf_err(-ENOENT); - } - - err = bpf_object__load(obj); - if (err) { - bpf_object__close(obj); - return libbpf_err(err); - } - - *pobj = obj; - *prog_fd = bpf_program__fd(first_prog); - return 0; -} - -COMPAT_VERSION(bpf_prog_load_deprecated, bpf_prog_load, LIBBPF_0.0.1) -int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd) -{ - struct bpf_prog_load_attr attr; - - memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); - attr.file = file; - attr.prog_type = type; - attr.expected_attach_type = 0; - - return bpf_prog_load_xattr2(&attr, pobj, prog_fd); -} - /* Replace link's underlying BPF program with the new one */ int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index fa27969da0da..0b2de10b5878 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1164,22 +1164,6 @@ LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, */ LIBBPF_API long libbpf_get_error(const void *ptr); -struct bpf_prog_load_attr { - const char *file; - enum bpf_prog_type prog_type; - enum bpf_attach_type expected_attach_type; - int ifindex; - int log_level; - int prog_flags; -}; - -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open() and bpf_object__load() instead") -LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open() and bpf_object__load() instead") -LIBBPF_API int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd); - /* XDP related API */ struct xdp_link_info { __u32 prog_id; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index da7a4f928452..a480490914a4 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -1,15 +1,6 @@ LIBBPF_0.0.1 { global: bpf_btf_get_fd_by_id; - bpf_create_map; - bpf_create_map_in_map; - bpf_create_map_in_map_node; - bpf_create_map_name; - bpf_create_map_node; - bpf_create_map_xattr; - bpf_load_btf; - bpf_load_program; - bpf_load_program_xattr; bpf_map__btf_key_type_id; bpf_map__btf_value_type_id; bpf_map__def; @@ -61,11 +52,7 @@ LIBBPF_0.0.1 { bpf_prog_detach2; bpf_prog_get_fd_by_id; bpf_prog_get_next_id; - bpf_prog_load; - bpf_prog_load_xattr; bpf_prog_query; - bpf_prog_test_run; - bpf_prog_test_run_xattr; bpf_program__fd; bpf_program__is_kprobe; bpf_program__is_perf_event; @@ -106,7 +93,6 @@ LIBBPF_0.0.1 { bpf_raw_tracepoint_open; bpf_set_link_xdp_fd; bpf_task_fd_query; - bpf_verify_program; btf__fd; btf__find_by_name; btf__free; @@ -218,7 +204,6 @@ LIBBPF_0.0.7 { bpf_object__load_skeleton; bpf_object__open_skeleton; bpf_probe_large_insn_limit; - bpf_prog_attach_xattr; bpf_program__attach; bpf_program__name; bpf_program__is_extension; @@ -387,7 +372,6 @@ LIBBPF_0.6.0 { bpf_object__next_program; bpf_object__prev_map; bpf_object__prev_program; - bpf_prog_load_deprecated; bpf_prog_load; bpf_program__flags; bpf_program__insn_cnt; -- cgit v1.2.3 
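For the removed low-level test-run wrappers, the opts-based variant is the one-to-one replacement. A minimal sketch of driving bpf_prog_test_run_opts() instead of bpf_prog_test_run(); prog_fd and the pkt_in/pkt_out buffers are assumed to be set up by the caller (illustrative names):

    LIBBPF_OPTS(bpf_test_run_opts, topts,
        .data_in = pkt_in,                /* input packet buffer */
        .data_size_in = sizeof(pkt_in),
        .data_out = pkt_out,              /* receives the resulting packet */
        .data_size_out = sizeof(pkt_out),
        .repeat = 1,
    );

    int err = bpf_prog_test_run_opts(prog_fd, &topts);
    /* on success, topts.retval and topts.data_size_out carry what the
     * removed API returned through its retval/size_out output pointers
     */
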
From 53e6af3a761c6913164303f4f31da55ee2d3b134 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:15 -0700 Subject: libbpf: remove deprecated XDP APIs Get rid of deprecated bpf_set_link*() and bpf_get_link*() APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.h | 20 ---------------- tools/lib/bpf/libbpf.map | 4 ---- tools/lib/bpf/netlink.c | 62 +++++++----------------------------------------- 3 files changed, 8 insertions(+), 78 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 0b2de10b5878..d1c93a1e7d66 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1164,15 +1164,6 @@ LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, */ LIBBPF_API long libbpf_get_error(const void *ptr); -/* XDP related API */ -struct xdp_link_info { - __u32 prog_id; - __u32 drv_prog_id; - __u32 hw_prog_id; - __u32 skb_prog_id; - __u8 attach_mode; -}; - struct bpf_xdp_set_link_opts { size_t sz; int old_fd; @@ -1180,17 +1171,6 @@ struct bpf_xdp_set_link_opts { }; #define bpf_xdp_set_link_opts__last_field old_fd -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead") -LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead") -LIBBPF_API int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, - const struct bpf_xdp_set_link_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query_id() instead") -LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query() instead") -LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, - size_t info_size, __u32 flags); - struct bpf_xdp_attach_opts { size_t sz; int old_prog_fd; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index a480490914a4..713c769f125a 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -91,7 +91,6 @@ LIBBPF_0.0.1 { bpf_prog_linfo__lfind_addr_func; bpf_prog_linfo__lfind; bpf_raw_tracepoint_open; - bpf_set_link_xdp_fd; bpf_task_fd_query; btf__fd; btf__find_by_name; @@ -120,7 +119,6 @@ LIBBPF_0.0.2 { bpf_map_lookup_elem_flags; bpf_object__btf; bpf_object__find_map_fd_by_name; - bpf_get_link_xdp_id; btf__dedup; btf__get_map_kv_tids; btf__get_nr_types; @@ -172,7 +170,6 @@ LIBBPF_0.0.5 { LIBBPF_0.0.6 { global: - bpf_get_link_xdp_info; bpf_map__get_pin_path; bpf_map__is_pinned; bpf_map__set_pin_path; @@ -231,7 +228,6 @@ LIBBPF_0.0.8 { bpf_program__is_lsm; bpf_program__set_attach_target; bpf_program__set_lsm; - bpf_set_link_xdp_fd_opts; } LIBBPF_0.0.7; LIBBPF_0.0.9 { diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index cbc8967d5402..6c013168032d 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -27,6 +27,14 @@ typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, void *cookie); +struct xdp_link_info { + __u32 prog_id; + __u32 drv_prog_id; + __u32 hw_prog_id; + __u32 skb_prog_id; + __u8 attach_mode; +}; + struct xdp_id_md { int ifindex; __u32 flags; @@ -288,31 +296,6 @@ int bpf_xdp_detach(int ifindex, __u32 flags, const struct bpf_xdp_attach_opts *o return bpf_xdp_attach(ifindex, -1, flags, opts); } -int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, - const struct 
bpf_xdp_set_link_opts *opts) -{ - int old_fd = -1, ret; - - if (!OPTS_VALID(opts, bpf_xdp_set_link_opts)) - return libbpf_err(-EINVAL); - - if (OPTS_HAS(opts, old_fd)) { - old_fd = OPTS_GET(opts, old_fd, -1); - flags |= XDP_FLAGS_REPLACE; - } - - ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, old_fd, flags); - return libbpf_err(ret); -} - -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - int ret; - - ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, 0, flags); - return libbpf_err(ret); -} - static int __dump_link_nlmsg(struct nlmsghdr *nlh, libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) { @@ -413,30 +396,6 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) return 0; } -int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, - size_t info_size, __u32 flags) -{ - LIBBPF_OPTS(bpf_xdp_query_opts, opts); - size_t sz; - int err; - - if (!info_size) - return libbpf_err(-EINVAL); - - err = bpf_xdp_query(ifindex, flags, &opts); - if (err) - return libbpf_err(err); - - /* struct xdp_link_info field layout matches struct bpf_xdp_query_opts - * layout after sz field - */ - sz = min(info_size, offsetofend(struct xdp_link_info, attach_mode)); - memcpy(info, &opts.prog_id, sz); - memset((void *)info + sz, 0, info_size - sz); - - return 0; -} - int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) { LIBBPF_OPTS(bpf_xdp_query_opts, opts); @@ -463,11 +422,6 @@ int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) } -int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags) -{ - return bpf_xdp_query_id(ifindex, flags, prog_id); -} - typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); static int clsact_config(struct libbpf_nla_req *req) -- cgit v1.2.3 From d320fad217b79849d66b53d7fdb361020ab4f64c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:16 -0700 Subject: libbpf: remove deprecated probing APIs Get rid of deprecated feature-probing APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.h | 8 --- tools/lib/bpf/libbpf.map | 4 -- tools/lib/bpf/libbpf_probes.c | 125 ++---------------------------------------- 3 files changed, 5 insertions(+), 132 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index d1c93a1e7d66..71b19ae1659c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1412,14 +1412,6 @@ bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, * user, causing subsequent probes to fail. In this case, the caller may want * to adjust that limit with setrlimit(). 
*/ -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_prog_type() instead") -LIBBPF_API bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_map_type() instead") -LIBBPF_API bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_helper() instead") -LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "implement your own or use bpftool for feature detection") -LIBBPF_API bool bpf_probe_large_insn_limit(__u32 ifindex); /** * @brief **libbpf_probe_bpf_prog_type()** detects if host kernel supports diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 713c769f125a..3cea0bab95ea 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -112,9 +112,6 @@ LIBBPF_0.0.1 { LIBBPF_0.0.2 { global: - bpf_probe_helper; - bpf_probe_map_type; - bpf_probe_prog_type; bpf_map__resize; bpf_map_lookup_elem_flags; bpf_object__btf; @@ -200,7 +197,6 @@ LIBBPF_0.0.7 { bpf_object__detach_skeleton; bpf_object__load_skeleton; bpf_object__open_skeleton; - bpf_probe_large_insn_limit; bpf_program__attach; bpf_program__name; bpf_program__is_extension; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 97b06cede56f..0b5398786bf3 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -17,47 +17,14 @@ #include "libbpf.h" #include "libbpf_internal.h" -static bool grep(const char *buffer, const char *pattern) -{ - return !!strstr(buffer, pattern); -} - -static int get_vendor_id(int ifindex) -{ - char ifname[IF_NAMESIZE], path[64], buf[8]; - ssize_t len; - int fd; - - if (!if_indextoname(ifindex, ifname)) - return -1; - - snprintf(path, sizeof(path), "/sys/class/net/%s/device/vendor", ifname); - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return -1; - - len = read(fd, buf, sizeof(buf)); - close(fd); - if (len < 0) - return -1; - if (len >= (ssize_t)sizeof(buf)) - return -1; - buf[len] = '\0'; - - return strtol(buf, NULL, 0); -} - static int probe_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insns_cnt, - char *log_buf, size_t log_buf_sz, - __u32 ifindex) + char *log_buf, size_t log_buf_sz) { LIBBPF_OPTS(bpf_prog_load_opts, opts, .log_buf = log_buf, .log_size = log_buf_sz, .log_level = log_buf ? 
1 : 0, - .prog_ifindex = ifindex, ); int fd, err, exp_err = 0; const char *exp_msg = NULL; @@ -161,31 +128,10 @@ int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) if (opts) return libbpf_err(-EINVAL); - ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0, 0); + ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0); return libbpf_err(ret); } -bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex) -{ - struct bpf_insn insns[2] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN() - }; - - /* prefer libbpf_probe_bpf_prog_type() unless offload is requested */ - if (ifindex == 0) - return libbpf_probe_bpf_prog_type(prog_type, NULL) == 1; - - if (ifindex && prog_type == BPF_PROG_TYPE_SCHED_CLS) - /* nfp returns -EINVAL on exit(0) with TC offload */ - insns[0].imm = 2; - - errno = 0; - probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), NULL, 0, ifindex); - - return errno != EINVAL && errno != EOPNOTSUPP; -} - int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len) { @@ -242,15 +188,13 @@ static int load_local_storage_btf(void) strs, sizeof(strs)); } -static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) +static int probe_map_create(enum bpf_map_type map_type) { LIBBPF_OPTS(bpf_map_create_opts, opts); int key_size, value_size, max_entries; __u32 btf_key_type_id = 0, btf_value_type_id = 0; int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err; - opts.map_ifindex = ifindex; - key_size = sizeof(__u32); value_size = sizeof(__u32); max_entries = 1; @@ -326,12 +270,6 @@ static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - /* TODO: probe for device, once libbpf has a function to create - * map-in-map for offload - */ - if (ifindex) - goto cleanup; - fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(__u32), sizeof(__u32), 1, NULL); if (fd_inner < 0) @@ -370,15 +308,10 @@ int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts) if (opts) return libbpf_err(-EINVAL); - ret = probe_map_create(map_type, 0); + ret = probe_map_create(map_type); return libbpf_err(ret); } -bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) -{ - return probe_map_create(map_type, ifindex) == 1; -} - int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, const void *opts) { @@ -407,7 +340,7 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe } buf[0] = '\0'; - ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf), 0); + ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf)); if (ret < 0) return libbpf_err(ret); @@ -427,51 +360,3 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe return 0; return 1; /* assume supported */ } - -bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, - __u32 ifindex) -{ - struct bpf_insn insns[2] = { - BPF_EMIT_CALL(id), - BPF_EXIT_INSN() - }; - char buf[4096] = {}; - bool res; - - probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); - - if (ifindex) { - switch (get_vendor_id(ifindex)) { - case 0x19ee: /* Netronome specific */ - res = res && !grep(buf, "not supported by FW") && - !grep(buf, "unsupported function id"); - break; - default: - break; - } - } - - return res; -} - -/* - * Probe for 
availability of kernel commit (5.3): - * - * c04c0d2b968a ("bpf: increase complexity limit and maximum program size") - */ -bool bpf_probe_large_insn_limit(__u32 ifindex) -{ - struct bpf_insn insns[BPF_MAXINSNS + 1]; - int i; - - for (i = 0; i < BPF_MAXINSNS; i++) - insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1); - insns[BPF_MAXINSNS] = BPF_EXIT_INSN(); - - errno = 0; - probe_prog_load(BPF_PROG_TYPE_SCHED_CLS, insns, ARRAY_SIZE(insns), NULL, 0, - ifindex); - - return errno != E2BIG && errno != EINVAL; -} -- cgit v1.2.3 From aaf6886d9b53d34f0a3d2f577ddc1224026d12ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:17 -0700 Subject: libbpf: remove deprecated BTF APIs Get rid of deprecated BTF-related APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 183 +---------------------------------------------- tools/lib/bpf/btf.h | 86 +--------------------- tools/lib/bpf/btf_dump.c | 23 ++---- tools/lib/bpf/libbpf.c | 44 ++++-------- tools/lib/bpf/libbpf.map | 13 ---- 5 files changed, 24 insertions(+), 325 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index ae1520f7e1b0..2d14f1a52d7a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -448,11 +448,6 @@ static int btf_parse_type_sec(struct btf *btf) return 0; } -__u32 btf__get_nr_types(const struct btf *btf) -{ - return btf->start_id + btf->nr_types - 1; -} - __u32 btf__type_cnt(const struct btf *btf) { return btf->start_id + btf->nr_types; @@ -1408,92 +1403,6 @@ struct btf *btf__load_from_kernel_by_id(__u32 id) return btf__load_from_kernel_by_id_split(id, NULL); } -int btf__get_from_id(__u32 id, struct btf **btf) -{ - struct btf *res; - int err; - - *btf = NULL; - res = btf__load_from_kernel_by_id(id); - err = libbpf_get_error(res); - - if (err) - return libbpf_err(err); - - *btf = res; - return 0; -} - -int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, - __u32 expected_key_size, __u32 expected_value_size, - __u32 *key_type_id, __u32 *value_type_id) -{ - const struct btf_type *container_type; - const struct btf_member *key, *value; - const size_t max_name = 256; - char container_name[max_name]; - __s64 key_size, value_size; - __s32 container_id; - - if (snprintf(container_name, max_name, "____btf_map_%s", map_name) == max_name) { - pr_warn("map:%s length of '____btf_map_%s' is too long\n", - map_name, map_name); - return libbpf_err(-EINVAL); - } - - container_id = btf__find_by_name(btf, container_name); - if (container_id < 0) { - pr_debug("map:%s container_name:%s cannot be found in BTF. 
Missing BPF_ANNOTATE_KV_PAIR?\n", - map_name, container_name); - return libbpf_err(container_id); - } - - container_type = btf__type_by_id(btf, container_id); - if (!container_type) { - pr_warn("map:%s cannot find BTF type for container_id:%u\n", - map_name, container_id); - return libbpf_err(-EINVAL); - } - - if (!btf_is_struct(container_type) || btf_vlen(container_type) < 2) { - pr_warn("map:%s container_name:%s is an invalid container struct\n", - map_name, container_name); - return libbpf_err(-EINVAL); - } - - key = btf_members(container_type); - value = key + 1; - - key_size = btf__resolve_size(btf, key->type); - if (key_size < 0) { - pr_warn("map:%s invalid BTF key_type_size\n", map_name); - return libbpf_err(key_size); - } - - if (expected_key_size != key_size) { - pr_warn("map:%s btf_key_type_size:%u != map_def_key_size:%u\n", - map_name, (__u32)key_size, expected_key_size); - return libbpf_err(-EINVAL); - } - - value_size = btf__resolve_size(btf, value->type); - if (value_size < 0) { - pr_warn("map:%s invalid BTF value_type_size\n", map_name); - return libbpf_err(value_size); - } - - if (expected_value_size != value_size) { - pr_warn("map:%s btf_value_type_size:%u != map_def_value_size:%u\n", - map_name, (__u32)value_size, expected_value_size); - return libbpf_err(-EINVAL); - } - - *key_type_id = key->type; - *value_type_id = value->type; - - return 0; -} - static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) { @@ -2965,81 +2874,6 @@ const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) return btf_ext->data; } -static int btf_ext_reloc_info(const struct btf *btf, - const struct btf_ext_info *ext_info, - const char *sec_name, __u32 insns_cnt, - void **info, __u32 *cnt) -{ - __u32 sec_hdrlen = sizeof(struct btf_ext_info_sec); - __u32 i, record_size, existing_len, records_len; - struct btf_ext_info_sec *sinfo; - const char *info_sec_name; - __u64 remain_len; - void *data; - - record_size = ext_info->rec_size; - sinfo = ext_info->info; - remain_len = ext_info->len; - while (remain_len > 0) { - records_len = sinfo->num_info * record_size; - info_sec_name = btf__name_by_offset(btf, sinfo->sec_name_off); - if (strcmp(info_sec_name, sec_name)) { - remain_len -= sec_hdrlen + records_len; - sinfo = (void *)sinfo + sec_hdrlen + records_len; - continue; - } - - existing_len = (*cnt) * record_size; - data = realloc(*info, existing_len + records_len); - if (!data) - return libbpf_err(-ENOMEM); - - memcpy(data + existing_len, sinfo->data, records_len); - /* adjust insn_off only, the rest data will be passed - * to the kernel. 
- */ - for (i = 0; i < sinfo->num_info; i++) { - __u32 *insn_off; - - insn_off = data + existing_len + (i * record_size); - *insn_off = *insn_off / sizeof(struct bpf_insn) + insns_cnt; - } - *info = data; - *cnt += sinfo->num_info; - return 0; - } - - return libbpf_err(-ENOENT); -} - -int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt) -{ - return btf_ext_reloc_info(btf, &btf_ext->func_info, sec_name, - insns_cnt, func_info, cnt); -} - -int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt) -{ - return btf_ext_reloc_info(btf, &btf_ext->line_info, sec_name, - insns_cnt, line_info, cnt); -} - -__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext) -{ - return btf_ext->func_info.rec_size; -} - -__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext) -{ - return btf_ext->line_info.rec_size; -} - struct btf_dedup; static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts); @@ -3189,9 +3023,7 @@ static int btf_dedup_remap_types(struct btf_dedup *d); * deduplicating structs/unions is described in greater details in comments for * `btf_dedup_is_equiv` function. */ - -DEFAULT_VERSION(btf__dedup_v0_6_0, btf__dedup, LIBBPF_0.6.0) -int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts) +int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts) { struct btf_dedup *d; int err; @@ -3251,19 +3083,6 @@ done: return libbpf_err(err); } -COMPAT_VERSION(btf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2) -int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *unused_opts) -{ - LIBBPF_OPTS(btf_dedup_opts, opts, .btf_ext = btf_ext); - - if (unused_opts) { - pr_warn("please use new version of btf__dedup() that supports options\n"); - return libbpf_err(-ENOTSUP); - } - - return btf__dedup(btf, &opts); -} - #define BTF_UNPROCESSED_ID ((__u32)-1) #define BTF_IN_PROGRESS_ID ((__u32)-2) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 9fb416eb5644..583760df83b4 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -120,20 +120,12 @@ LIBBPF_API struct btf *libbpf_find_kernel_btf(void); LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "use btf__load_from_kernel_by_id instead") -LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "intended for internal libbpf use only") -LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "use btf__load_into_kernel instead") -LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API int btf__load_into_kernel(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, const char *type_name); LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, __u32 kind); -LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__type_cnt() instead; note that btf__get_nr_types() == btf__type_cnt() - 1") -LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); LIBBPF_API __u32 btf__type_cnt(const struct btf *btf); LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, @@ -150,29 +142,10 @@ LIBBPF_API void btf__set_fd(struct btf *btf, int fd); 
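/* Editor's note, not part of the patch: a hedged migration sketch for the
 * BTF getters this series deletes. Per the deprecation notes removed below,
 * btf__get_from_id(id, &btf) becomes btf__load_from_kernel_by_id(id), and
 * btf__get_nr_types() == btf__type_cnt() - 1 (the new count includes type
 * id 0, the void type). libbpf_get_error() reports errors in both legacy
 * and strict modes. Function and variable names here are illustrative.
 */
#include <stdio.h>
#include <bpf/btf.h>
#include <bpf/libbpf.h>

static int dump_kernel_btf(__u32 btf_id)
{
	/* old: struct btf *btf; int err = btf__get_from_id(btf_id, &btf); */
	struct btf *btf = btf__load_from_kernel_by_id(btf_id);
	long err = libbpf_get_error(btf);
	__u32 id, n;

	if (err)
		return (int)err;

	/* old: for (id = 1; id <= btf__get_nr_types(btf); id++) */
	n = btf__type_cnt(btf);
	for (id = 1; id < n; id++) {
		const struct btf_type *t = btf__type_by_id(btf, id);

		printf("[%u] kind:%u name:%s\n", id, btf_kind(t),
		       btf__name_by_offset(btf, t->name_off));
	}

	btf__free(btf);
	return 0;
}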
LIBBPF_API const void *btf__raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); -LIBBPF_DEPRECATED_SINCE(0, 7, "this API is not necessary when BTF-defined maps are used") -LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, - __u32 expected_key_size, - __u32 expected_value_size, - __u32 *key_type_id, __u32 *value_type_id); LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") -int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") -int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info is deprecated; write custom func_info parsing to fetch rec_size") -__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info is deprecated; write custom line_info parsing to fetch rec_size") -__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); @@ -259,22 +232,12 @@ struct btf_dedup_opts { LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); -LIBBPF_API int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts); - -LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__dedup() instead") -LIBBPF_API int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *opts); -#define btf__dedup(...) ___libbpf_overload(___btf_dedup, __VA_ARGS__) -#define ___btf_dedup3(btf, btf_ext, opts) btf__dedup_deprecated(btf, btf_ext, opts) -#define ___btf_dedup2(btf, opts) btf__dedup(btf, opts) - struct btf_dump; struct btf_dump_opts { - union { - size_t sz; - void *ctx; /* DEPRECATED: will be gone in v1.0 */ - }; + size_t sz; }; +#define btf_dump_opts__last_field sz typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args); @@ -283,51 +246,6 @@ LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf, void *ctx, const struct btf_dump_opts *opts); -LIBBPF_API struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf, - btf_dump_printf_fn_t printf_fn, - void *ctx, - const struct btf_dump_opts *opts); - -LIBBPF_API struct btf_dump *btf_dump__new_deprecated(const struct btf *btf, - const struct btf_ext *btf_ext, - const struct btf_dump_opts *opts, - btf_dump_printf_fn_t printf_fn); - -/* Choose either btf_dump__new() or btf_dump__new_deprecated() based on the - * type of 4th argument. If it's btf_dump's print callback, use deprecated - * API; otherwise, choose the new btf_dump__new(). ___libbpf_override() - * doesn't work here because both variants have 4 input arguments. 
- * - * (void *) casts are necessary to avoid compilation warnings about type - * mismatches, because even though __builtin_choose_expr() only ever evaluates - * one side the other side still has to satisfy type constraints (this is - * compiler implementation limitation which might be lifted eventually, - * according to the documentation). So passing struct btf_ext in place of - * btf_dump_printf_fn_t would be generating compilation warning. Casting to - * void * avoids this issue. - * - * Also, two type compatibility checks for a function and function pointer are - * required because passing function reference into btf_dump__new() as - * btf_dump__new(..., my_callback, ...) and as btf_dump__new(..., - * &my_callback, ...) (not explicit ampersand in the latter case) actually - * differs as far as __builtin_types_compatible_p() is concerned. Thus two - * checks are combined to detect callback argument. - * - * The rest works just like in case of ___libbpf_override() usage with symbol - * versioning. - * - * C++ compilers don't support __builtin_types_compatible_p(), so at least - * don't screw up compilation for them and let C++ users pick btf_dump__new - * vs btf_dump__new_deprecated explicitly. - */ -#ifndef __cplusplus -#define btf_dump__new(a1, a2, a3, a4) __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(a4), btf_dump_printf_fn_t) || \ - __builtin_types_compatible_p(typeof(a4), void(void *, const char *, va_list)), \ - btf_dump__new_deprecated((void *)a1, (void *)a2, (void *)a3, (void *)a4), \ - btf_dump__new((void *)a1, (void *)a2, (void *)a3, (void *)a4)) -#endif - LIBBPF_API void btf_dump__free(struct btf_dump *d); LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index f5275f819027..400e84fd0578 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -144,15 +144,17 @@ static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) static int btf_dump_mark_referenced(struct btf_dump *d); static int btf_dump_resize(struct btf_dump *d); -DEFAULT_VERSION(btf_dump__new_v0_6_0, btf_dump__new, LIBBPF_0.6.0) -struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf, - btf_dump_printf_fn_t printf_fn, - void *ctx, - const struct btf_dump_opts *opts) +struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts) { struct btf_dump *d; int err; + if (!OPTS_VALID(opts, btf_dump_opts)) + return libbpf_err_ptr(-EINVAL); + if (!printf_fn) return libbpf_err_ptr(-EINVAL); @@ -188,17 +190,6 @@ err: return libbpf_err_ptr(err); } -COMPAT_VERSION(btf_dump__new_deprecated, btf_dump__new, LIBBPF_0.0.4) -struct btf_dump *btf_dump__new_deprecated(const struct btf *btf, - const struct btf_ext *btf_ext, - const struct btf_dump_opts *opts, - btf_dump_printf_fn_t printf_fn) -{ - if (!printf_fn) - return libbpf_err_ptr(-EINVAL); - return btf_dump__new_v0_6_0(btf, printf_fn, opts ? 
opts->ctx : NULL, opts); -} - static int btf_dump_resize(struct btf_dump *d) { int err, last_id = btf__type_cnt(d->btf) - 1; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b11d3e689126..c0150a7dbd81 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3061,11 +3061,6 @@ static int btf_finalize_data(struct bpf_object *obj, struct btf *btf) return libbpf_err(err); } -int btf__finalize_data(struct bpf_object *obj, struct btf *btf) -{ - return btf_finalize_data(obj, btf); -} - static int bpf_object__finalize_btf(struct bpf_object *obj) { int err; @@ -4397,9 +4392,7 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Dat static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) { - struct bpf_map_def *def = &map->def; - __u32 key_type_id = 0, value_type_id = 0; - int ret; + int id; if (!obj->btf) return -ENOENT; @@ -4408,31 +4401,22 @@ static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) * For struct_ops map, it does not need btf_key_type_id and * btf_value_type_id. */ - if (map->sec_idx == obj->efile.btf_maps_shndx || - bpf_map__is_struct_ops(map)) + if (map->sec_idx == obj->efile.btf_maps_shndx || bpf_map__is_struct_ops(map)) return 0; - if (!bpf_map__is_internal(map)) { - pr_warn("Use of BPF_ANNOTATE_KV_PAIR is deprecated, use BTF-defined maps in .maps section instead\n"); -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - ret = btf__get_map_kv_tids(obj->btf, map->name, def->key_size, - def->value_size, &key_type_id, - &value_type_id); -#pragma GCC diagnostic pop - } else { - /* - * LLVM annotates global data differently in BTF, that is, - * only as '.data', '.bss' or '.rodata'. - */ - ret = btf__find_by_name(obj->btf, map->real_name); - } - if (ret < 0) - return ret; + /* + * LLVM annotates global data differently in BTF, that is, + * only as '.data', '.bss' or '.rodata'. + */ + if (!bpf_map__is_internal(map)) + return -ENOENT; + + id = btf__find_by_name(obj->btf, map->real_name); + if (id < 0) + return id; - map->btf_key_type_id = key_type_id; - map->btf_value_type_id = bpf_map__is_internal(map) ? 
- ret : value_type_id; + map->btf_key_type_id = 0; + map->btf_value_type_id = id; return 0; } diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 3cea0bab95ea..ff5fba17764b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -95,7 +95,6 @@ LIBBPF_0.0.1 { btf__fd; btf__find_by_name; btf__free; - btf__get_from_id; btf__name_by_offset; btf__new; btf__resolve_size; @@ -116,18 +115,10 @@ LIBBPF_0.0.2 { bpf_map_lookup_elem_flags; bpf_object__btf; bpf_object__find_map_fd_by_name; - btf__dedup; - btf__get_map_kv_tids; - btf__get_nr_types; btf__get_raw_data; - btf__load; btf_ext__free; - btf_ext__func_info_rec_size; btf_ext__get_raw_data; - btf_ext__line_info_rec_size; btf_ext__new; - btf_ext__reloc_func_info; - btf_ext__reloc_line_info; bpf_program__get_prog_info_linear; bpf_program__bpil_addr_to_offs; bpf_program__bpil_offs_to_addr; @@ -137,7 +128,6 @@ LIBBPF_0.0.3 { global: bpf_map__is_internal; bpf_map_freeze; - btf__finalize_data; } LIBBPF_0.0.2; LIBBPF_0.0.4 { @@ -151,7 +141,6 @@ LIBBPF_0.0.4 { bpf_program__attach_uprobe; btf_dump__dump_type; btf_dump__free; - btf_dump__new; btf__parse_elf; libbpf_num_possible_cpus; perf_buffer__free; @@ -373,11 +362,9 @@ LIBBPF_0.6.0 { btf__add_decl_tag; btf__add_type_tag; btf__dedup; - btf__dedup_deprecated; btf__raw_data; btf__type_cnt; btf_dump__new; - btf_dump__new_deprecated; libbpf_major_version; libbpf_minor_version; libbpf_version_string; -- cgit v1.2.3 From 22dd7a58b2e9965c1a612b6a09c4a6146cbb5873 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:18 -0700 Subject: libbpf: clean up perfbuf APIs Remove deprecated perfbuf APIs and clean up opts structs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 54 ++++++++---------------------------- tools/lib/bpf/libbpf.h | 71 ++++-------------------------------------------- tools/lib/bpf/libbpf.map | 5 ---- 3 files changed, 18 insertions(+), 112 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c0150a7dbd81..38dc00e440c3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11975,6 +11975,9 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) return link; } +typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, + void *private_data); + static enum bpf_perf_event_ret perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, @@ -12023,12 +12026,6 @@ perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, return libbpf_err(ret); } -__attribute__((alias("perf_event_read_simple"))) -enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data); - struct perf_buffer; struct perf_buffer_params { @@ -12162,12 +12159,11 @@ error: static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p); -DEFAULT_VERSION(perf_buffer__new_v0_6_0, perf_buffer__new, LIBBPF_0.6.0) -struct perf_buffer *perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, - perf_buffer_sample_fn sample_cb, - perf_buffer_lost_fn lost_cb, - void *ctx, - const struct perf_buffer_opts *opts) +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, + perf_buffer_lost_fn 
lost_cb, + void *ctx, + const struct perf_buffer_opts *opts) { struct perf_buffer_params p = {}; struct perf_event_attr attr = {}; @@ -12189,22 +12185,10 @@ struct perf_buffer *perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); } -COMPAT_VERSION(perf_buffer__new_deprecated, perf_buffer__new, LIBBPF_0.0.4) -struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_opts *opts) -{ - return perf_buffer__new_v0_6_0(map_fd, page_cnt, - opts ? opts->sample_cb : NULL, - opts ? opts->lost_cb : NULL, - opts ? opts->ctx : NULL, - NULL); -} - -DEFAULT_VERSION(perf_buffer__new_raw_v0_6_0, perf_buffer__new_raw, LIBBPF_0.6.0) -struct perf_buffer *perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, - struct perf_event_attr *attr, - perf_buffer_event_fn event_cb, void *ctx, - const struct perf_buffer_raw_opts *opts) +struct perf_buffer *perf_buffer__new_raw(int map_fd, size_t page_cnt, + struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts) { struct perf_buffer_params p = {}; @@ -12224,20 +12208,6 @@ struct perf_buffer *perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); } -COMPAT_VERSION(perf_buffer__new_raw_deprecated, perf_buffer__new_raw, LIBBPF_0.0.4) -struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_raw_opts *opts) -{ - LIBBPF_OPTS(perf_buffer_raw_opts, inner_opts, - .cpu_cnt = opts->cpu_cnt, - .cpus = opts->cpus, - .map_keys = opts->map_keys, - ); - - return perf_buffer__new_raw_v0_6_0(map_fd, page_cnt, opts->attr, - opts->event_cb, opts->ctx, &inner_opts); -} - static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 71b19ae1659c..5dc4271fcdb7 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1269,17 +1269,7 @@ typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); /* common use perf buffer options */ struct perf_buffer_opts { - union { - size_t sz; - struct { /* DEPRECATED: will be removed in v1.0 */ - /* if specified, sample_cb is called for each sample */ - perf_buffer_sample_fn sample_cb; - /* if specified, lost_cb is called for each batch of lost samples */ - perf_buffer_lost_fn lost_cb; - /* ctx is provided to sample_cb and lost_cb */ - void *ctx; - }; - }; + size_t sz; }; #define perf_buffer_opts__last_field sz @@ -1300,21 +1290,6 @@ perf_buffer__new(int map_fd, size_t page_cnt, perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, const struct perf_buffer_opts *opts); -LIBBPF_API struct perf_buffer * -perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, - perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, - const struct perf_buffer_opts *opts); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new() instead") -struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_opts *opts); - -#define perf_buffer__new(...) 
___libbpf_overload(___perf_buffer_new, __VA_ARGS__) -#define ___perf_buffer_new6(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts) \ - perf_buffer__new(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts) -#define ___perf_buffer_new3(map_fd, page_cnt, opts) \ - perf_buffer__new_deprecated(map_fd, page_cnt, opts) - enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, LIBBPF_PERF_EVENT_ERROR = -1, @@ -1328,21 +1303,9 @@ typedef enum bpf_perf_event_ret /* raw perf buffer options, giving most power and control */ struct perf_buffer_raw_opts { - union { - struct { - size_t sz; - long :0; - long :0; - }; - struct { /* DEPRECATED: will be removed in v1.0 */ - /* perf event attrs passed directly into perf_event_open() */ - struct perf_event_attr *attr; - /* raw event callback */ - perf_buffer_event_fn event_cb; - /* ctx is provided to event_cb */ - void *ctx; - }; - }; + size_t sz; + long :0; + long :0; /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of * max_entries of given PERF_EVENT_ARRAY map) */ @@ -1354,26 +1317,13 @@ struct perf_buffer_raw_opts { }; #define perf_buffer_raw_opts__last_field map_keys +struct perf_event_attr; + LIBBPF_API struct perf_buffer * perf_buffer__new_raw(int map_fd, size_t page_cnt, struct perf_event_attr *attr, perf_buffer_event_fn event_cb, void *ctx, const struct perf_buffer_raw_opts *opts); -LIBBPF_API struct perf_buffer * -perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, struct perf_event_attr *attr, - perf_buffer_event_fn event_cb, void *ctx, - const struct perf_buffer_raw_opts *opts); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new_raw() instead") -struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_raw_opts *opts); - -#define perf_buffer__new_raw(...) 
___libbpf_overload(___perf_buffer_new_raw, __VA_ARGS__) -#define ___perf_buffer_new_raw6(map_fd, page_cnt, attr, event_cb, ctx, opts) \ - perf_buffer__new_raw(map_fd, page_cnt, attr, event_cb, ctx, opts) -#define ___perf_buffer_new_raw3(map_fd, page_cnt, opts) \ - perf_buffer__new_raw_deprecated(map_fd, page_cnt, opts) - LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); LIBBPF_API int perf_buffer__epoll_fd(const struct perf_buffer *pb); LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); @@ -1382,15 +1332,6 @@ LIBBPF_API int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_id LIBBPF_API size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb); LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx); -typedef enum bpf_perf_event_ret - (*bpf_perf_event_print_t)(struct perf_event_header *hdr, - void *private_data); -LIBBPF_DEPRECATED_SINCE(0, 8, "use perf_buffer__poll() or perf_buffer__consume() instead") -LIBBPF_API enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data); - struct bpf_prog_linfo; struct bpf_prog_info; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index ff5fba17764b..f327adc66933 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -46,7 +46,6 @@ LIBBPF_0.0.1 { bpf_object__unload; bpf_object__unpin_maps; bpf_object__unpin_programs; - bpf_perf_event_read_simple; bpf_prog_attach; bpf_prog_detach; bpf_prog_detach2; @@ -144,8 +143,6 @@ LIBBPF_0.0.4 { btf__parse_elf; libbpf_num_possible_cpus; perf_buffer__free; - perf_buffer__new; - perf_buffer__new_raw; perf_buffer__poll; } LIBBPF_0.0.3; @@ -369,9 +366,7 @@ LIBBPF_0.6.0 { libbpf_minor_version; libbpf_version_string; perf_buffer__new; - perf_buffer__new_deprecated; perf_buffer__new_raw; - perf_buffer__new_raw_deprecated; } LIBBPF_0.5.0; LIBBPF_0.7.0 { -- cgit v1.2.3 From 9a590538ba4fbe6d0a4586cdab039df782153fcb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:19 -0700 Subject: libbpf: remove prog_info_linear APIs Remove prog_info_linear-related APIs previously used by perf. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 248 ----------------------------------------------- tools/lib/bpf/libbpf.h | 66 ------------- tools/lib/bpf/libbpf.map | 3 - 3 files changed, 317 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 38dc00e440c3..cd06989812c9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -12509,254 +12509,6 @@ int perf_buffer__consume(struct perf_buffer *pb) return 0; } -struct bpf_prog_info_array_desc { - int array_offset; /* e.g. offset of jited_prog_insns */ - int count_offset; /* e.g. 
offset of jited_prog_len */ - int size_offset; /* > 0: offset of rec size, - * < 0: fix size of -size_offset - */ -}; - -static struct bpf_prog_info_array_desc bpf_prog_info_array_desc[] = { - [BPF_PROG_INFO_JITED_INSNS] = { - offsetof(struct bpf_prog_info, jited_prog_insns), - offsetof(struct bpf_prog_info, jited_prog_len), - -1, - }, - [BPF_PROG_INFO_XLATED_INSNS] = { - offsetof(struct bpf_prog_info, xlated_prog_insns), - offsetof(struct bpf_prog_info, xlated_prog_len), - -1, - }, - [BPF_PROG_INFO_MAP_IDS] = { - offsetof(struct bpf_prog_info, map_ids), - offsetof(struct bpf_prog_info, nr_map_ids), - -(int)sizeof(__u32), - }, - [BPF_PROG_INFO_JITED_KSYMS] = { - offsetof(struct bpf_prog_info, jited_ksyms), - offsetof(struct bpf_prog_info, nr_jited_ksyms), - -(int)sizeof(__u64), - }, - [BPF_PROG_INFO_JITED_FUNC_LENS] = { - offsetof(struct bpf_prog_info, jited_func_lens), - offsetof(struct bpf_prog_info, nr_jited_func_lens), - -(int)sizeof(__u32), - }, - [BPF_PROG_INFO_FUNC_INFO] = { - offsetof(struct bpf_prog_info, func_info), - offsetof(struct bpf_prog_info, nr_func_info), - offsetof(struct bpf_prog_info, func_info_rec_size), - }, - [BPF_PROG_INFO_LINE_INFO] = { - offsetof(struct bpf_prog_info, line_info), - offsetof(struct bpf_prog_info, nr_line_info), - offsetof(struct bpf_prog_info, line_info_rec_size), - }, - [BPF_PROG_INFO_JITED_LINE_INFO] = { - offsetof(struct bpf_prog_info, jited_line_info), - offsetof(struct bpf_prog_info, nr_jited_line_info), - offsetof(struct bpf_prog_info, jited_line_info_rec_size), - }, - [BPF_PROG_INFO_PROG_TAGS] = { - offsetof(struct bpf_prog_info, prog_tags), - offsetof(struct bpf_prog_info, nr_prog_tags), - -(int)sizeof(__u8) * BPF_TAG_SIZE, - }, - -}; - -static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, - int offset) -{ - __u32 *array = (__u32 *)info; - - if (offset >= 0) - return array[offset / sizeof(__u32)]; - return -(int)offset; -} - -static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, - int offset) -{ - __u64 *array = (__u64 *)info; - - if (offset >= 0) - return array[offset / sizeof(__u64)]; - return -(int)offset; -} - -static void bpf_prog_info_set_offset_u32(struct bpf_prog_info *info, int offset, - __u32 val) -{ - __u32 *array = (__u32 *)info; - - if (offset >= 0) - array[offset / sizeof(__u32)] = val; -} - -static void bpf_prog_info_set_offset_u64(struct bpf_prog_info *info, int offset, - __u64 val) -{ - __u64 *array = (__u64 *)info; - - if (offset >= 0) - array[offset / sizeof(__u64)] = val; -} - -struct bpf_prog_info_linear * -bpf_program__get_prog_info_linear(int fd, __u64 arrays) -{ - struct bpf_prog_info_linear *info_linear; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - __u32 data_len = 0; - int i, err; - void *ptr; - - if (arrays >> BPF_PROG_INFO_LAST_ARRAY) - return libbpf_err_ptr(-EINVAL); - - /* step 1: get array dimensions */ - err = bpf_obj_get_info_by_fd(fd, &info, &info_len); - if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); - return libbpf_err_ptr(-EFAULT); - } - - /* step 2: calculate total size of all arrays */ - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - bool include_array = (arrays & (1UL << i)) > 0; - struct bpf_prog_info_array_desc *desc; - __u32 count, size; - - desc = bpf_prog_info_array_desc + i; - - /* kernel is too old to support this field */ - if (info_len < desc->array_offset + sizeof(__u32) || - info_len < desc->count_offset + sizeof(__u32) || - (desc->size_offset > 0 && info_len < desc->size_offset)) - 
include_array = false; - - if (!include_array) { - arrays &= ~(1UL << i); /* clear the bit */ - continue; - } - - count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - - data_len += count * size; - } - - /* step 3: allocate continuous memory */ - data_len = roundup(data_len, sizeof(__u64)); - info_linear = malloc(sizeof(struct bpf_prog_info_linear) + data_len); - if (!info_linear) - return libbpf_err_ptr(-ENOMEM); - - /* step 4: fill data to info_linear->info */ - info_linear->arrays = arrays; - memset(&info_linear->info, 0, sizeof(info)); - ptr = info_linear->data; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u32 count, size; - - if ((arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - bpf_prog_info_set_offset_u32(&info_linear->info, - desc->count_offset, count); - bpf_prog_info_set_offset_u32(&info_linear->info, - desc->size_offset, size); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, - ptr_to_u64(ptr)); - ptr += count * size; - } - - /* step 5: call syscall again to get required arrays */ - err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len); - if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); - free(info_linear); - return libbpf_err_ptr(-EFAULT); - } - - /* step 6: verify the data */ - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u32 v1, v2; - - if ((arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, - desc->count_offset); - if (v1 != v2) - pr_warn("%s: mismatch in element count\n", __func__); - - v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, - desc->size_offset); - if (v1 != v2) - pr_warn("%s: mismatch in rec size\n", __func__); - } - - /* step 7: update info_len and data_len */ - info_linear->info_len = sizeof(struct bpf_prog_info); - info_linear->data_len = data_len; - - return info_linear; -} - -void bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear) -{ - int i; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u64 addr, offs; - - if ((info_linear->arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - addr = bpf_prog_info_read_offset_u64(&info_linear->info, - desc->array_offset); - offs = addr - ptr_to_u64(info_linear->data); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, offs); - } -} - -void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) -{ - int i; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u64 addr, offs; - - if ((info_linear->arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - offs = bpf_prog_info_read_offset_u64(&info_linear->info, - desc->array_offset); - addr = offs + ptr_to_u64(info_linear->data); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, addr); - } -} - int bpf_program__set_attach_target(struct 
bpf_program *prog, int attach_prog_fd, const char *attach_func_name) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5dc4271fcdb7..e6357ea7bf41 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1396,72 +1396,6 @@ LIBBPF_API int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void LIBBPF_API int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, const void *opts); -/* - * Get bpf_prog_info in continuous memory - * - * struct bpf_prog_info has multiple arrays. The user has option to choose - * arrays to fetch from kernel. The following APIs provide an uniform way to - * fetch these data. All arrays in bpf_prog_info are stored in a single - * continuous memory region. This makes it easy to store the info in a - * file. - * - * Before writing bpf_prog_info_linear to files, it is necessary to - * translate pointers in bpf_prog_info to offsets. Helper functions - * bpf_program__bpil_addr_to_offs() and bpf_program__bpil_offs_to_addr() - * are introduced to switch between pointers and offsets. - * - * Examples: - * # To fetch map_ids and prog_tags: - * __u64 arrays = (1UL << BPF_PROG_INFO_MAP_IDS) | - * (1UL << BPF_PROG_INFO_PROG_TAGS); - * struct bpf_prog_info_linear *info_linear = - * bpf_program__get_prog_info_linear(fd, arrays); - * - * # To save data in file - * bpf_program__bpil_addr_to_offs(info_linear); - * write(f, info_linear, sizeof(*info_linear) + info_linear->data_len); - * - * # To read data from file - * read(f, info_linear, ); - * bpf_program__bpil_offs_to_addr(info_linear); - */ -enum bpf_prog_info_array { - BPF_PROG_INFO_FIRST_ARRAY = 0, - BPF_PROG_INFO_JITED_INSNS = 0, - BPF_PROG_INFO_XLATED_INSNS, - BPF_PROG_INFO_MAP_IDS, - BPF_PROG_INFO_JITED_KSYMS, - BPF_PROG_INFO_JITED_FUNC_LENS, - BPF_PROG_INFO_FUNC_INFO, - BPF_PROG_INFO_LINE_INFO, - BPF_PROG_INFO_JITED_LINE_INFO, - BPF_PROG_INFO_PROG_TAGS, - BPF_PROG_INFO_LAST_ARRAY, -}; - -struct bpf_prog_info_linear { - /* size of struct bpf_prog_info, when the tool is compiled */ - __u32 info_len; - /* total bytes allocated for data, round up to 8 bytes */ - __u32 data_len; - /* which arrays are included in data */ - __u64 arrays; - struct bpf_prog_info info; - __u8 data[]; -}; - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API struct bpf_prog_info_linear * -bpf_program__get_prog_info_linear(int fd, __u64 arrays); - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API void -bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear); - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API void -bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); - /** * @brief **libbpf_num_possible_cpus()** is a helper function to get the * number of possible CPUs that the host kernel supports and expects. 
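/* Editor's note, not part of the patch: with bpf_program__get_prog_info_linear()
 * and its bpf_prog_info_linear wrapper gone, callers (perf now carries its own
 * copy) query the bpf_prog_info arrays they need directly, using the same
 * two-call bpf_obj_get_info_by_fd() pattern the removed code used internally.
 * A minimal sketch for fetching map_ids, assuming a valid prog_fd; the helper
 * name is illustrative and error handling is trimmed for brevity.
 */
#include <stdlib.h>
#include <string.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static __u32 *prog_map_ids(int prog_fd, __u32 *nr_ids)
{
	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	__u32 *ids;

	/* call 1: learn how many map ids the program references */
	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
		return NULL;

	ids = calloc(info.nr_map_ids, sizeof(*ids));
	if (!ids)
		return NULL;
	*nr_ids = info.nr_map_ids;

	/* call 2: point the kernel at our buffer and query again */
	memset(&info, 0, sizeof(info));
	info.nr_map_ids = *nr_ids;
	info.map_ids = (__u64)(unsigned long)ids;
	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) {
		free(ids);
		return NULL;
	}

	return ids;
}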
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index f327adc66933..93c7bcc058c6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -118,9 +118,6 @@ LIBBPF_0.0.2 { btf_ext__free; btf_ext__get_raw_data; btf_ext__new; - bpf_program__get_prog_info_linear; - bpf_program__bpil_addr_to_offs; - bpf_program__bpil_offs_to_addr; } LIBBPF_0.0.1; LIBBPF_0.0.3 { -- cgit v1.2.3 From 146bf811f5ac133e619fcf4e77bc7a97a9e401e7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:20 -0700 Subject: libbpf: remove most other deprecated high-level APIs Remove a bunch of high-level bpf_object/bpf_map/bpf_program related APIs. All the APIs related to private per-object/map/prog state, program preprocessing callback, and generally everything multi-instance related is removed in a separate patch. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 241 +++++------------------------------------------ tools/lib/bpf/libbpf.h | 163 +------------------------------- tools/lib/bpf/libbpf.map | 43 --------- 3 files changed, 26 insertions(+), 421 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index cd06989812c9..149392604aa4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -484,6 +483,14 @@ enum libbpf_map_type { LIBBPF_MAP_KCONFIG, }; +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; +}; + struct bpf_map { struct bpf_object *obj; char *name; @@ -568,8 +575,6 @@ struct extern_desc { }; }; -static LIST_HEAD(bpf_objects_list); - struct module_btf { struct btf *btf; char *name; @@ -638,12 +643,6 @@ struct bpf_object { /* Information when doing ELF related work. Only valid if efile.elf is not NULL */ struct elf_state efile; - /* - * All loaded bpf_object are linked in a list, which is - * hidden to caller. bpf_objects__ handlers deal with - * all objects. - */ - struct list_head list; struct btf *btf; struct btf_ext *btf_ext; @@ -1313,7 +1312,6 @@ static struct bpf_object *bpf_object__new(const char *path, size_t obj_buf_sz, const char *obj_name) { - bool strict = (libbpf_mode & LIBBPF_STRICT_NO_OBJECT_LIST); struct bpf_object *obj; char *end; @@ -1351,9 +1349,6 @@ static struct bpf_object *bpf_object__new(const char *path, obj->kern_version = get_kernel_version(); obj->loaded = false; - INIT_LIST_HEAD(&obj->list); - if (!strict) - list_add(&obj->list, &bpf_objects_list); return obj; } @@ -1386,10 +1381,7 @@ static int bpf_object__elf_init(struct bpf_object *obj) } if (obj->efile.obj_buf_sz > 0) { - /* - * obj_buf should have been validated by - * bpf_object__open_buffer(). - */ + /* obj_buf should have been validated by bpf_object__open_mem(). 
*/ elf = elf_memory((char *)obj->efile.obj_buf, obj->efile.obj_buf_sz); } else { obj->efile.fd = open(obj->path, O_RDONLY | O_CLOEXEC); @@ -2306,6 +2298,13 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return bpf_map__set_pin_path(map, buf); } +/* should match definition in bpf_helpers.h */ +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + int parse_btf_map_def(const char *map_name, struct btf *btf, const struct btf_type *def_t, bool strict, struct btf_map_def *map_def, struct btf_map_def *inner_def) @@ -4017,19 +4016,6 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return 0; } -struct bpf_program * -bpf_object__find_program_by_title(const struct bpf_object *obj, - const char *title) -{ - struct bpf_program *pos; - - bpf_object__for_each_program(pos, obj) { - if (pos->sec_name && !strcmp(pos->sec_name, title)) - return pos; - } - return errno = ENOENT, NULL; -} - static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) { @@ -4548,14 +4534,6 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) return 0; } -int bpf_map__resize(struct bpf_map *map, __u32 max_entries) -{ - if (!map || !max_entries) - return libbpf_err(-EINVAL); - - return bpf_map__set_max_entries(map, max_entries); -} - static int bpf_object__probe_loading(struct bpf_object *obj) { @@ -7339,11 +7317,6 @@ out: return libbpf_err(err); } -int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_ver) -{ - return bpf_object_load_prog(prog->obj, prog, license, kern_ver); -} - static int bpf_object__load_progs(struct bpf_object *obj, int log_level) { @@ -7395,13 +7368,6 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object prog->type = prog->sec_def->prog_type; prog->expected_attach_type = prog->sec_def->expected_attach_type; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - if (prog->sec_def->prog_type == BPF_PROG_TYPE_TRACING || - prog->sec_def->prog_type == BPF_PROG_TYPE_EXT) - prog->attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); -#pragma GCC diagnostic pop - /* sec_def can have custom callback which should be called * after bpf_program is initialized to adjust its properties */ @@ -7507,36 +7473,6 @@ out: return ERR_PTR(err); } -static struct bpf_object * -__bpf_object__open_xattr(struct bpf_object_open_attr *attr, int flags) -{ - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .relaxed_maps = flags & MAPS_RELAX_COMPAT, - ); - - /* param validation */ - if (!attr->file) - return NULL; - - pr_debug("loading %s\n", attr->file); - return bpf_object_open(attr->file, NULL, 0, &opts); -} - -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) -{ - return libbpf_ptr(__bpf_object__open_xattr(attr, 0)); -} - -struct bpf_object *bpf_object__open(const char *path) -{ - struct bpf_object_open_attr attr = { - .file = path, - .prog_type = BPF_PROG_TYPE_UNSPEC, - }; - - return libbpf_ptr(__bpf_object__open_xattr(&attr, 0)); -} - struct bpf_object * bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) { @@ -7548,6 +7484,11 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); } +struct bpf_object *bpf_object__open(const char *path) +{ + return bpf_object__open_file(path, NULL); +} + struct bpf_object * bpf_object__open_mem(const 
void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) @@ -7558,23 +7499,6 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); } -struct bpf_object * -bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, - const char *name) -{ - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .object_name = name, - /* wrong default, but backwards-compatible */ - .relaxed_maps = true, - ); - - /* returning NULL is wrong, but backwards-compatible */ - if (!obj_buf || obj_buf_sz == 0) - return errno = EINVAL, NULL; - - return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, &opts)); -} - static int bpf_object_unload(struct bpf_object *obj) { size_t i; @@ -8007,11 +7931,6 @@ out: return libbpf_err(err); } -int bpf_object__load_xattr(struct bpf_object_load_attr *attr) -{ - return bpf_object_load(attr->obj, attr->log_level, attr->target_btf_path); -} - int bpf_object__load(struct bpf_object *obj) { return bpf_object_load(obj, 0, NULL); @@ -8642,33 +8561,9 @@ void bpf_object__close(struct bpf_object *obj) } zfree(&obj->programs); - list_del(&obj->list); free(obj); } -struct bpf_object * -bpf_object__next(struct bpf_object *prev) -{ - struct bpf_object *next; - bool strict = (libbpf_mode & LIBBPF_STRICT_NO_OBJECT_LIST); - - if (strict) - return NULL; - - if (!prev) - next = list_first_entry(&bpf_objects_list, - struct bpf_object, - list); - else - next = list_next_entry(prev, list); - - /* Empty list is noticed here so don't need checking on entry. */ - if (&next->list == &bpf_objects_list) - return NULL; - - return next; -} - const char *bpf_object__name(const struct bpf_object *obj) { return obj ? obj->name : libbpf_err_ptr(-EINVAL); @@ -8757,12 +8652,6 @@ __bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj, return &obj->programs[idx]; } -struct bpf_program * -bpf_program__next(struct bpf_program *prev, const struct bpf_object *obj) -{ - return bpf_object__next_program(obj, prev); -} - struct bpf_program * bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) { @@ -8775,12 +8664,6 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) return prog; } -struct bpf_program * -bpf_program__prev(struct bpf_program *next, const struct bpf_object *obj) -{ - return bpf_object__prev_program(obj, next); -} - struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) { @@ -8824,22 +8707,6 @@ const char *bpf_program__section_name(const struct bpf_program *prog) return prog->sec_name; } -const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) -{ - const char *title; - - title = prog->sec_name; - if (needs_copy) { - title = strdup(title); - if (!title) { - pr_warn("failed to strdup program title\n"); - return libbpf_err_ptr(-ENOMEM); - } - } - - return title; -} - bool bpf_program__autoload(const struct bpf_program *prog) { return prog->autoload; @@ -8861,11 +8728,6 @@ int bpf_program__fd(const struct bpf_program *prog) return bpf_program_nth_fd(prog, 0); } -size_t bpf_program__size(const struct bpf_program *prog) -{ - return prog->insns_cnt * BPF_INSN_SZ; -} - const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) { return prog->insns; @@ -8967,39 +8829,6 @@ int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) return 0; } -static bool bpf_program__is_type(const struct bpf_program *prog, - enum bpf_prog_type type) -{ - 
return prog ? (prog->type == type) : false; -} - -#define BPF_PROG_TYPE_FNS(NAME, TYPE) \ -int bpf_program__set_##NAME(struct bpf_program *prog) \ -{ \ - if (!prog) \ - return libbpf_err(-EINVAL); \ - return bpf_program__set_type(prog, TYPE); \ -} \ - \ -bool bpf_program__is_##NAME(const struct bpf_program *prog) \ -{ \ - return bpf_program__is_type(prog, TYPE); \ -} \ - -BPF_PROG_TYPE_FNS(socket_filter, BPF_PROG_TYPE_SOCKET_FILTER); -BPF_PROG_TYPE_FNS(lsm, BPF_PROG_TYPE_LSM); -BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE); -BPF_PROG_TYPE_FNS(sched_cls, BPF_PROG_TYPE_SCHED_CLS); -BPF_PROG_TYPE_FNS(sched_act, BPF_PROG_TYPE_SCHED_ACT); -BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT); -BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT); -BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); -BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); -BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING); -BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS); -BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); -BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); - __alias(bpf_program__expected_attach_type) enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); @@ -9773,11 +9602,6 @@ int bpf_map__fd(const struct bpf_map *map) return map ? map->fd : libbpf_err(-EINVAL); } -const struct bpf_map_def *bpf_map__def(const struct bpf_map *map) -{ - return map ? &map->def : libbpf_err_ptr(-EINVAL); -} - static bool map_uses_real_name(const struct bpf_map *map) { /* Since libbpf started to support custom .data.* and .rodata.* maps, @@ -9932,11 +9756,6 @@ const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) return map->mmaped; } -bool bpf_map__is_offload_neutral(const struct bpf_map *map) -{ - return map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; -} - bool bpf_map__is_internal(const struct bpf_map *map) { return map->libbpf_type != LIBBPF_MAP_UNSPEC; @@ -9997,12 +9816,6 @@ __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) return &obj->maps[idx]; } -struct bpf_map * -bpf_map__next(const struct bpf_map *prev, const struct bpf_object *obj) -{ - return bpf_object__next_map(obj, prev); -} - struct bpf_map * bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) { @@ -10012,12 +9825,6 @@ bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) return __bpf_map__iter(prev, obj, 1); } -struct bpf_map * -bpf_map__prev(const struct bpf_map *next, const struct bpf_object *obj) -{ - return bpf_object__prev_map(obj, next); -} - struct bpf_map * bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *next) { @@ -10063,12 +9870,6 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); } -struct bpf_map * -bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset) -{ - return libbpf_err_ptr(-ENOTSUP); -} - static int validate_map_op(const struct bpf_map *map, size_t key_sz, size_t value_sz, bool check_value_sz) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e6357ea7bf41..9c2279b7b6ed 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -101,11 +101,6 @@ LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn); /* Hide internal to user */ struct bpf_object; -struct bpf_object_open_attr { - const char *file; - enum bpf_prog_type prog_type; -}; - struct bpf_object_open_opts { /* size of this struct, for forward/backward 
compatibility */ size_t sz; @@ -118,21 +113,12 @@ struct bpf_object_open_opts { const char *object_name; /* parse map definitions non-strictly, allowing extra attributes/data */ bool relaxed_maps; - /* DEPRECATED: handle CO-RE relocations non-strictly, allowing failures. - * Value is ignored. Relocations always are processed non-strictly. - * Non-relocatable instructions are replaced with invalid ones to - * prevent accidental errors. - * */ - LIBBPF_DEPRECATED_SINCE(0, 6, "field has no effect") - bool relaxed_core_relocs; /* maps that set the 'pinning' attribute in their definition will have * their pin_path attribute set to a file in this directory, and be * auto-pinned to that path on load; defaults to "/sys/fs/bpf". */ const char *pin_root_path; - - LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__set_attach_target() on each individual bpf_program") - __u32 attach_prog_fd; + long :0; /* Additional kernel config content that augments and overrides * system Kconfig for CONFIG_xxx externs. */ @@ -215,20 +201,10 @@ LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts); -/* deprecated bpf_object__open variants */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open_mem() instead") -LIBBPF_API struct bpf_object * -bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, - const char *name); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open_file() instead") -LIBBPF_API struct bpf_object * -bpf_object__open_xattr(struct bpf_object_open_attr *attr); +/* Load/unload object into/from kernel */ +LIBBPF_API int bpf_object__load(struct bpf_object *obj); -enum libbpf_pin_type { - LIBBPF_PIN_NONE, - /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ - LIBBPF_PIN_BY_NAME, -}; +LIBBPF_API void bpf_object__close(struct bpf_object *object); /* pin_maps and unpin_maps can both be called with a NULL path, in which case * they will use the pin_path attribute of each map (and ignore all maps that @@ -242,20 +218,6 @@ LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, const char *path); LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); -LIBBPF_API void bpf_object__close(struct bpf_object *object); - -struct bpf_object_load_attr { - struct bpf_object *obj; - int log_level; - const char *target_btf_path; -}; - -/* Load/unload object into/from kernel */ -LIBBPF_API int bpf_object__load(struct bpf_object *obj); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__load() instead") -LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); -LIBBPF_DEPRECATED_SINCE(0, 6, "bpf_object__unload() is deprecated, use bpf_object__close() instead") -LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); @@ -265,22 +227,10 @@ struct btf; LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__find_program_by_name() instead") -LIBBPF_API struct bpf_program * -bpf_object__find_program_by_title(const struct bpf_object *obj, - const char *title); LIBBPF_API struct bpf_program * bpf_object__find_program_by_name(const struct bpf_object *obj, const char *name); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "track bpf_objects in application code 
instead") -struct bpf_object *bpf_object__next(struct bpf_object *prev); -#define bpf_object__for_each_safe(pos, tmp) \ - for ((pos) = bpf_object__next(NULL), \ - (tmp) = bpf_object__next(pos); \ - (pos) != NULL; \ - (pos) = (tmp), (tmp) = bpf_object__next(tmp)) - typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, @@ -298,9 +248,7 @@ LIBBPF_API int libbpf_find_vmlinux_btf_id(const char *name, /* Accessors of bpf_program */ struct bpf_program; -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_program() instead") -struct bpf_program *bpf_program__next(struct bpf_program *prog, - const struct bpf_object *obj); + LIBBPF_API struct bpf_program * bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog); @@ -309,9 +257,6 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog) (pos) != NULL; \ (pos) = bpf_object__next_program((obj), (pos))) -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__prev_program() instead") -struct bpf_program *bpf_program__prev(struct bpf_program *prog, - const struct bpf_object *obj); LIBBPF_API struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); @@ -327,15 +272,9 @@ LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); -LIBBPF_API LIBBPF_DEPRECATED("BPF program title is confusing term; please use bpf_program__section_name() instead") -const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy); LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); -/* returns program size in bytes */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__insn_cnt() instead") -LIBBPF_API size_t bpf_program__size(const struct bpf_program *prog); - struct bpf_insn; /** @@ -388,8 +327,6 @@ LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, */ LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 6, "use bpf_object__load() instead") -LIBBPF_API int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_version); LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, @@ -761,36 +698,6 @@ LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") LIBBPF_API int bpf_program__nth_fd(const struct bpf_program *prog, int n); -/* - * Adjust type of BPF program. Default is kprobe. 
- */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_lsm(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); - LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); /** @@ -853,47 +760,6 @@ LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_socket_filter(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_tracepoint(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_raw_tracepoint(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_kprobe(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_lsm(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sched_cls(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool 
bpf_program__is_struct_ops(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog); - -/* - * No need for __attribute__((packed)), all members of 'bpf_map_def' - * are all aligned. In addition, using __attribute__((packed)) - * would trigger a -Wpacked warning message, and lead to an error - * if -Werror is set. - */ -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -}; - /** * @brief **bpf_object__find_map_by_name()** returns BPF map of * the given name, if it exists within the passed BPF object @@ -908,16 +774,6 @@ bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); LIBBPF_API int bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); -/* - * Get bpf_map through the offset of corresponding struct bpf_map_def - * in the BPF object file. - */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__find_map_by_name() instead") -struct bpf_map * -bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_map() instead") -struct bpf_map *bpf_map__next(const struct bpf_map *map, const struct bpf_object *obj); LIBBPF_API struct bpf_map * bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); @@ -927,8 +783,6 @@ bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); (pos) = bpf_object__next_map((obj), (pos))) #define bpf_map__for_each bpf_object__for_each_map -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__prev_map() instead") -struct bpf_map *bpf_map__prev(const struct bpf_map *map, const struct bpf_object *obj); LIBBPF_API struct bpf_map * bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map); @@ -962,9 +816,6 @@ LIBBPF_API bool bpf_map__autocreate(const struct bpf_map *map); */ LIBBPF_API int bpf_map__fd(const struct bpf_map *map); LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); -/* get map definition */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use appropriate getters or setters instead") -const struct bpf_map_def *bpf_map__def(const struct bpf_map *map); /* get map name */ LIBBPF_API const char *bpf_map__name(const struct bpf_map *map); /* get/set map type */ @@ -973,8 +824,6 @@ LIBBPF_API int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type); /* get/set map size (max_entries) */ LIBBPF_API __u32 bpf_map__max_entries(const struct bpf_map *map); LIBBPF_API int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__set_max_entries() instead") -LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries); /* get/set map flags */ LIBBPF_API __u32 bpf_map__map_flags(const struct bpf_map *map); LIBBPF_API int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags); @@ -1006,8 +855,6 @@ LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__type() instead") -LIBBPF_API bool bpf_map__is_offload_neutral(const struct 
bpf_map *map); /** * @brief **bpf_map__is_internal()** tells the caller whether or not the diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 93c7bcc058c6..7f19c43028f1 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -3,13 +3,9 @@ LIBBPF_0.0.1 { bpf_btf_get_fd_by_id; bpf_map__btf_key_type_id; bpf_map__btf_value_type_id; - bpf_map__def; bpf_map__fd; - bpf_map__is_offload_neutral; bpf_map__name; - bpf_map__next; bpf_map__pin; - bpf_map__prev; bpf_map__priv; bpf_map__reuse_fd; bpf_map__set_ifindex; @@ -29,21 +25,15 @@ LIBBPF_0.0.1 { bpf_object__btf_fd; bpf_object__close; bpf_object__find_map_by_name; - bpf_object__find_map_by_offset; - bpf_object__find_program_by_title; bpf_object__kversion; bpf_object__load; bpf_object__name; - bpf_object__next; bpf_object__open; - bpf_object__open_buffer; - bpf_object__open_xattr; bpf_object__pin; bpf_object__pin_maps; bpf_object__pin_programs; bpf_object__priv; bpf_object__set_priv; - bpf_object__unload; bpf_object__unpin_maps; bpf_object__unpin_programs; bpf_prog_attach; @@ -53,35 +43,15 @@ LIBBPF_0.0.1 { bpf_prog_get_next_id; bpf_prog_query; bpf_program__fd; - bpf_program__is_kprobe; - bpf_program__is_perf_event; - bpf_program__is_raw_tracepoint; - bpf_program__is_sched_act; - bpf_program__is_sched_cls; - bpf_program__is_socket_filter; - bpf_program__is_tracepoint; - bpf_program__is_xdp; - bpf_program__load; - bpf_program__next; bpf_program__nth_fd; bpf_program__pin; bpf_program__pin_instance; - bpf_program__prev; bpf_program__priv; bpf_program__set_expected_attach_type; bpf_program__set_ifindex; - bpf_program__set_kprobe; - bpf_program__set_perf_event; bpf_program__set_prep; bpf_program__set_priv; - bpf_program__set_raw_tracepoint; - bpf_program__set_sched_act; - bpf_program__set_sched_cls; - bpf_program__set_socket_filter; - bpf_program__set_tracepoint; bpf_program__set_type; - bpf_program__set_xdp; - bpf_program__title; bpf_program__unload; bpf_program__unpin; bpf_program__unpin_instance; @@ -110,7 +80,6 @@ LIBBPF_0.0.1 { LIBBPF_0.0.2 { global: - bpf_map__resize; bpf_map_lookup_elem_flags; bpf_object__btf; bpf_object__find_map_fd_by_name; @@ -129,7 +98,6 @@ LIBBPF_0.0.3 { LIBBPF_0.0.4 { global: bpf_link__destroy; - bpf_object__load_xattr; bpf_program__attach_kprobe; bpf_program__attach_perf_event; bpf_program__attach_raw_tracepoint; @@ -158,9 +126,6 @@ LIBBPF_0.0.6 { bpf_program__attach_trace; bpf_program__get_expected_attach_type; bpf_program__get_type; - bpf_program__is_tracing; - bpf_program__set_tracing; - bpf_program__size; btf__find_by_name_kind; libbpf_find_vmlinux_btf_id; } LIBBPF_0.0.5; @@ -182,10 +147,6 @@ LIBBPF_0.0.7 { bpf_object__open_skeleton; bpf_program__attach; bpf_program__name; - bpf_program__is_extension; - bpf_program__is_struct_ops; - bpf_program__set_extension; - bpf_program__set_struct_ops; btf__align_of; libbpf_find_kernel_btf; } LIBBPF_0.0.6; @@ -204,9 +165,7 @@ LIBBPF_0.0.8 { bpf_prog_attach_opts; bpf_program__attach_cgroup; bpf_program__attach_lsm; - bpf_program__is_lsm; bpf_program__set_attach_target; - bpf_program__set_lsm; } LIBBPF_0.0.7; LIBBPF_0.0.9 { @@ -244,9 +203,7 @@ LIBBPF_0.1.0 { bpf_map__value_size; bpf_program__attach_xdp; bpf_program__autoload; - bpf_program__is_sk_lookup; bpf_program__set_autoload; - bpf_program__set_sk_lookup; btf__parse; btf__parse_raw; btf__pointer_size; -- cgit v1.2.3 From b4bda502dfa28c7cb78e25f24593fc1d4628b21c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:21 -0700 Subject: libbpf: remove multi-instance and custom 
private data APIs Remove all the public APIs that are related to creating multi-instance bpf_programs through custom preprocessing callback and generally working with them. Also remove all the bpf_{object,map,program}__[set_]priv() APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 121 ++++------------------------------------------- tools/lib/bpf/libbpf.h | 91 ----------------------------------- tools/lib/bpf/libbpf.map | 10 ---- 3 files changed, 10 insertions(+), 212 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 149392604aa4..fe98789d1776 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -366,6 +366,16 @@ struct bpf_sec_def { libbpf_prog_attach_fn_t prog_attach_fn; }; +struct bpf_prog_prep_result { + struct bpf_insn *new_insn_ptr; + int new_insn_cnt; + int *pfd; +}; + +typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, + struct bpf_insn *insns, int insns_cnt, + struct bpf_prog_prep_result *res); + /* * bpf_prog should be a better name but it has been used in * linux/filter.h. @@ -426,8 +436,6 @@ struct bpf_program { bpf_program_prep_t preprocessor; struct bpf_object *obj; - void *priv; - bpf_program_clear_priv_t clear_priv; bool autoload; bool mark_btf_static; @@ -511,8 +519,6 @@ struct bpf_map { __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 btf_vmlinux_value_type_id; - void *priv; - bpf_map_clear_priv_t clear_priv; enum libbpf_map_type libbpf_type; void *mmaped; struct bpf_struct_ops *st_ops; @@ -668,9 +674,6 @@ struct bpf_object { size_t log_size; __u32 log_level; - void *priv; - bpf_object_clear_priv_t clear_priv; - int *fd_array; size_t fd_array_cap; size_t fd_array_cnt; @@ -721,12 +724,6 @@ static void bpf_program__exit(struct bpf_program *prog) if (!prog) return; - if (prog->clear_priv) - prog->clear_priv(prog, prog->priv); - - prog->priv = NULL; - prog->clear_priv = NULL; - bpf_program__unload(prog); zfree(&prog->name); zfree(&prog->sec_name); @@ -8051,12 +8048,6 @@ static int bpf_program_unpin_instance(struct bpf_program *prog, const char *path return 0; } -__attribute__((alias("bpf_program_pin_instance"))) -int bpf_object__pin_instance(struct bpf_program *prog, const char *path, int instance); - -__attribute__((alias("bpf_program_unpin_instance"))) -int bpf_program__unpin_instance(struct bpf_program *prog, const char *path, int instance); - int bpf_program__pin(struct bpf_program *prog, const char *path) { int i, err; @@ -8492,11 +8483,6 @@ int bpf_object__pin(struct bpf_object *obj, const char *path) static void bpf_map__destroy(struct bpf_map *map) { - if (map->clear_priv) - map->clear_priv(map, map->priv); - map->priv = NULL; - map->clear_priv = NULL; - if (map->inner_map) { bpf_map__destroy(map->inner_map); zfree(&map->inner_map); @@ -8532,9 +8518,6 @@ void bpf_object__close(struct bpf_object *obj) if (IS_ERR_OR_NULL(obj)) return; - if (obj->clear_priv) - obj->clear_priv(obj, obj->priv); - usdt_manager_free(obj->usdt_man); obj->usdt_man = NULL; @@ -8594,22 +8577,6 @@ int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) return 0; } -int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv) -{ - if (obj->priv && obj->clear_priv) - obj->clear_priv(obj, obj->priv); - - obj->priv = priv; - obj->clear_priv = clear_priv; - return 0; -} - -void *bpf_object__priv(const struct bpf_object *obj) -{ - return 
obj ? obj->priv : libbpf_err_ptr(-EINVAL); -} - int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts) { struct bpf_gen *gen; @@ -8676,22 +8643,6 @@ bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) return prog; } -int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv) -{ - if (prog->priv && prog->clear_priv) - prog->clear_priv(prog, prog->priv); - - prog->priv = priv; - prog->clear_priv = clear_priv; - return 0; -} - -void *bpf_program__priv(const struct bpf_program *prog) -{ - return prog ? prog->priv : libbpf_err_ptr(-EINVAL); -} - void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) { prog->prog_ifindex = ifindex; @@ -8758,37 +8709,6 @@ int bpf_program__set_insns(struct bpf_program *prog, return 0; } -int bpf_program__set_prep(struct bpf_program *prog, int nr_instances, - bpf_program_prep_t prep) -{ - int *instances_fds; - - if (nr_instances <= 0 || !prep) - return libbpf_err(-EINVAL); - - if (prog->instances.nr > 0 || prog->instances.fds) { - pr_warn("Can't set pre-processor after loading\n"); - return libbpf_err(-EINVAL); - } - - instances_fds = malloc(sizeof(int) * nr_instances); - if (!instances_fds) { - pr_warn("alloc memory failed for fds\n"); - return libbpf_err(-ENOMEM); - } - - /* fill all fd with -1 */ - memset(instances_fds, -1, sizeof(int) * nr_instances); - - prog->instances.nr = nr_instances; - prog->instances.fds = instances_fds; - prog->preprocessor = prep; - return 0; -} - -__attribute__((alias("bpf_program_nth_fd"))) -int bpf_program__nth_fd(const struct bpf_program *prog, int n); - static int bpf_program_nth_fd(const struct bpf_program *prog, int n) { int fd; @@ -9716,27 +9636,6 @@ __u32 bpf_map__btf_value_type_id(const struct bpf_map *map) return map ? map->btf_value_type_id : 0; } -int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv) -{ - if (!map) - return libbpf_err(-EINVAL); - - if (map->priv) { - if (map->clear_priv) - map->clear_priv(map, map->priv); - } - - map->priv = priv; - map->clear_priv = clear_priv; - return 0; -} - -void *bpf_map__priv(const struct bpf_map *map) -{ - return map ? 
map->priv : libbpf_err_ptr(-EINVAL); -} - int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 9c2279b7b6ed..5eb3145b8945 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -231,13 +231,6 @@ LIBBPF_API struct bpf_program * bpf_object__find_program_by_name(const struct bpf_object *obj, const char *name); -typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_object__priv(const struct bpf_object *prog); - LIBBPF_API int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, enum bpf_attach_type *expected_attach_type); @@ -260,13 +253,6 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog) LIBBPF_API struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); -typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, void *); - -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_program__priv(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); @@ -328,14 +314,6 @@ LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, - const char *path, - int instance); -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__unpin_instance(struct bpf_program *prog, - const char *path, - int instance); /** * @brief **bpf_program__pin()** pins the BPF program to a file @@ -635,69 +613,6 @@ LIBBPF_API struct bpf_link * bpf_program__attach_iter(const struct bpf_program *prog, const struct bpf_iter_attach_opts *opts); -/* - * Libbpf allows callers to adjust BPF programs before being loaded - * into kernel. One program in an object file can be transformed into - * multiple variants to be attached to different hooks. - * - * bpf_program_prep_t, bpf_program__set_prep and bpf_program__nth_fd - * form an API for this purpose. - * - * - bpf_program_prep_t: - * Defines a 'preprocessor', which is a caller defined function - * passed to libbpf through bpf_program__set_prep(), and will be - * called before program is loaded. The processor should adjust - * the program one time for each instance according to the instance id - * passed to it. - * - * - bpf_program__set_prep: - * Attaches a preprocessor to a BPF program. The number of instances - * that should be created is also passed through this function. - * - * - bpf_program__nth_fd: - * After the program is loaded, get resulting FD of a given instance - * of the BPF program. - * - * If bpf_program__set_prep() is not used, the program would be loaded - * without adjustment during bpf_object__load(). 
The program has only - * one instance. In this case bpf_program__fd(prog) is equal to - * bpf_program__nth_fd(prog, 0). - */ -struct bpf_prog_prep_result { - /* - * If not NULL, load new instruction array. - * If set to NULL, don't load this instance. - */ - struct bpf_insn *new_insn_ptr; - int new_insn_cnt; - - /* If not NULL, result FD is written to it. */ - int *pfd; -}; - -/* - * Parameters of bpf_program_prep_t: - * - prog: The bpf_program being loaded. - * - n: Index of instance being generated. - * - insns: BPF instructions array. - * - insns_cnt:Number of instructions in insns. - * - res: Output parameter, result of transformation. - * - * Return value: - * - Zero: pre-processing success. - * - Non-zero: pre-processing error, stop loading. - */ -typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, - struct bpf_insn *insns, int insns_cnt, - struct bpf_prog_prep_result *res); - -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__insns() for getting bpf_program instructions") -LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, - bpf_program_prep_t prep); - -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__nth_fd(const struct bpf_program *prog, int n); - LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); /** @@ -846,12 +761,6 @@ LIBBPF_API int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map); LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); -typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 7f19c43028f1..236efc7338f1 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -6,11 +6,9 @@ LIBBPF_0.0.1 { bpf_map__fd; bpf_map__name; bpf_map__pin; - bpf_map__priv; bpf_map__reuse_fd; bpf_map__set_ifindex; bpf_map__set_inner_map_fd; - bpf_map__set_priv; bpf_map__unpin; bpf_map_delete_elem; bpf_map_get_fd_by_id; @@ -32,8 +30,6 @@ LIBBPF_0.0.1 { bpf_object__pin; bpf_object__pin_maps; bpf_object__pin_programs; - bpf_object__priv; - bpf_object__set_priv; bpf_object__unpin_maps; bpf_object__unpin_programs; bpf_prog_attach; @@ -43,18 +39,12 @@ LIBBPF_0.0.1 { bpf_prog_get_next_id; bpf_prog_query; bpf_program__fd; - bpf_program__nth_fd; bpf_program__pin; - bpf_program__pin_instance; - bpf_program__priv; bpf_program__set_expected_attach_type; bpf_program__set_ifindex; - bpf_program__set_prep; - bpf_program__set_priv; bpf_program__set_type; bpf_program__unload; bpf_program__unpin; - bpf_program__unpin_instance; bpf_prog_linfo__free; bpf_prog_linfo__new; bpf_prog_linfo__lfind_addr_func; -- cgit v1.2.3 From a11113a2dcbedd488ca82f46853cf3e1ee486a6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:22 -0700 Subject: libbpf: cleanup LIBBPF_DEPRECATED_SINCE supporting macros for v0.x Keep the LIBBPF_DEPRECATED_SINCE macro "framework" for future deprecations, but clean up 0.x 
related helper macros. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf_common.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h index 000e37798ff2..9a7937f339df 100644 --- a/tools/lib/bpf/libbpf_common.h +++ b/tools/lib/bpf/libbpf_common.h @@ -30,20 +30,10 @@ /* Add checks for other versions below when planning deprecation of API symbols * with the LIBBPF_DEPRECATED_SINCE macro. */ -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 6) -#define __LIBBPF_MARK_DEPRECATED_0_6(X) X +#if __LIBBPF_CURRENT_VERSION_GEQ(1, 0) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) X #else -#define __LIBBPF_MARK_DEPRECATED_0_6(X) -#endif -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 7) -#define __LIBBPF_MARK_DEPRECATED_0_7(X) X -#else -#define __LIBBPF_MARK_DEPRECATED_0_7(X) -#endif -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 8) -#define __LIBBPF_MARK_DEPRECATED_0_8(X) X -#else -#define __LIBBPF_MARK_DEPRECATED_0_8(X) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) #endif /* This set of internal macros allows to do "function overloading" based on -- cgit v1.2.3 From cf90a20db878fddedefbdfeaff6695a0ee20f690 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:23 -0700 Subject: libbpf: remove internal multi-instance prog support Clean up internals that had to deal with the possibility of multi-instance bpf_programs. Libbpf 1.0 doesn't support this, so all this is not necessary now and can be simplified. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 317 ++++++------------------------------------------- 1 file changed, 34 insertions(+), 283 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fe98789d1776..65f2a57bc78d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -366,16 +366,6 @@ struct bpf_sec_def { libbpf_prog_attach_fn_t prog_attach_fn; }; -struct bpf_prog_prep_result { - struct bpf_insn *new_insn_ptr; - int new_insn_cnt; - int *pfd; -}; - -typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, - struct bpf_insn *insns, int insns_cnt, - struct bpf_prog_prep_result *res); - /* * bpf_prog should be a better name but it has been used in * linux/filter.h. @@ -429,22 +419,19 @@ struct bpf_program { size_t log_size; __u32 log_level; - struct { - int nr; - int *fds; - } instances; - bpf_program_prep_t preprocessor; - struct bpf_object *obj; + int fd; bool autoload; bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; + int prog_ifindex; __u32 attach_btf_obj_fd; __u32 attach_btf_id; __u32 attach_prog_fd; + void *func_info; __u32 func_info_rec_size; __u32 func_info_cnt; @@ -695,25 +682,10 @@ static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx); void bpf_program__unload(struct bpf_program *prog) { - int i; - if (!prog) return; - /* - * If the object is opened but the program was never loaded, - * it is possible that prog->instances.nr == -1. 
- */ - if (prog->instances.nr > 0) { - for (i = 0; i < prog->instances.nr; i++) - zclose(prog->instances.fds[i]); - } else if (prog->instances.nr != -1) { - pr_warn("Internal error: instances.nr is %d\n", - prog->instances.nr); - } - - prog->instances.nr = -1; - zfree(&prog->instances.fds); + zclose(prog->fd); zfree(&prog->func_info); zfree(&prog->line_info); @@ -797,6 +769,7 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->insns_cnt = prog->sec_insn_cnt; prog->type = BPF_PROG_TYPE_UNSPEC; + prog->fd = -1; /* libbpf's convention for SEC("?abc...") is that it's just like * SEC("abc...") but the corresponding bpf_program starts out with @@ -810,9 +783,6 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->autoload = true; } - prog->instances.fds = NULL; - prog->instances.nr = -1; - /* inherit object's log_level */ prog->log_level = obj->log_level; @@ -6862,10 +6832,9 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz); -static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_program *prog, - struct bpf_insn *insns, int insns_cnt, - const char *license, __u32 kern_version, - int *prog_fd) +static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, + struct bpf_insn *insns, int insns_cnt, + const char *license, __u32 kern_version, int *prog_fd) { LIBBPF_OPTS(bpf_prog_load_opts, load_attr); const char *prog_name = NULL; @@ -7232,88 +7201,6 @@ static int bpf_program_record_relos(struct bpf_program *prog) return 0; } -static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, - const char *license, __u32 kern_ver) -{ - int err = 0, fd, i; - - if (obj->loaded) { - pr_warn("prog '%s': can't load after object was loaded\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr < 0 || !prog->instances.fds) { - if (prog->preprocessor) { - pr_warn("Internal error: can't load program '%s'\n", - prog->name); - return libbpf_err(-LIBBPF_ERRNO__INTERNAL); - } - - prog->instances.fds = malloc(sizeof(int)); - if (!prog->instances.fds) { - pr_warn("Not enough memory for BPF fds\n"); - return libbpf_err(-ENOMEM); - } - prog->instances.nr = 1; - prog->instances.fds[0] = -1; - } - - if (!prog->preprocessor) { - if (prog->instances.nr != 1) { - pr_warn("prog '%s': inconsistent nr(%d) != 1\n", - prog->name, prog->instances.nr); - } - if (obj->gen_loader) - bpf_program_record_relos(prog); - err = bpf_object_load_prog_instance(obj, prog, - prog->insns, prog->insns_cnt, - license, kern_ver, &fd); - if (!err) - prog->instances.fds[0] = fd; - goto out; - } - - for (i = 0; i < prog->instances.nr; i++) { - struct bpf_prog_prep_result result; - bpf_program_prep_t preprocessor = prog->preprocessor; - - memset(&result, 0, sizeof(result)); - err = preprocessor(prog, i, prog->insns, - prog->insns_cnt, &result); - if (err) { - pr_warn("Preprocessing the %dth instance of program '%s' failed\n", - i, prog->name); - goto out; - } - - if (!result.new_insn_ptr || !result.new_insn_cnt) { - pr_debug("Skip loading the %dth instance of program '%s'\n", - i, prog->name); - prog->instances.fds[i] = -1; - if (result.pfd) - *result.pfd = -1; - continue; - } - - err = bpf_object_load_prog_instance(obj, prog, - result.new_insn_ptr, result.new_insn_cnt, - license, kern_ver, &fd); - if (err) { - pr_warn("Loading the %dth instance of program '%s' failed\n", - i, prog->name); - goto out; - } - - if 
(result.pfd) - *result.pfd = fd; - prog->instances.fds[i] = fd; - } -out: - if (err) - pr_warn("failed to load program '%s'\n", prog->name); - return libbpf_err(err); -} - static int bpf_object__load_progs(struct bpf_object *obj, int log_level) { @@ -7337,9 +7224,16 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) continue; } prog->log_level |= log_level; - err = bpf_object_load_prog(obj, prog, obj->license, obj->kern_version); - if (err) + + if (obj->gen_loader) + bpf_program_record_relos(prog); + + err = bpf_object_load_prog(obj, prog, prog->insns, prog->insns_cnt, + obj->license, obj->kern_version, &prog->fd); + if (err) { + pr_warn("prog '%s': failed to load: %d\n", prog->name, err); return err; + } } bpf_object__free_relocs(obj); @@ -7985,11 +7879,16 @@ static int check_path(const char *path) return err; } -static int bpf_program_pin_instance(struct bpf_program *prog, const char *path, int instance) +int bpf_program__pin(struct bpf_program *prog, const char *path) { char *cp, errmsg[STRERR_BUFSIZE]; int err; + if (prog->fd < 0) { + pr_warn("prog '%s': can't pin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + err = make_parent_dir(path); if (err) return libbpf_err(err); @@ -7998,164 +7897,35 @@ static int bpf_program_pin_instance(struct bpf_program *prog, const char *path, if (err) return libbpf_err(err); - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (instance < 0 || instance >= prog->instances.nr) { - pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - if (bpf_obj_pin(prog->instances.fds[instance], path)) { + if (bpf_obj_pin(prog->fd, path)) { err = -errno; cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("failed to pin program: %s\n", cp); + pr_warn("prog '%s': failed to pin at '%s': %s\n", prog->name, path, cp); return libbpf_err(err); } - pr_debug("pinned program '%s'\n", path); + pr_debug("prog '%s': pinned at '%s'\n", prog->name, path); return 0; } -static int bpf_program_unpin_instance(struct bpf_program *prog, const char *path, int instance) +int bpf_program__unpin(struct bpf_program *prog, const char *path) { int err; - err = check_path(path); - if (err) - return libbpf_err(err); - - if (prog == NULL) { - pr_warn("invalid program pointer\n"); + if (prog->fd < 0) { + pr_warn("prog '%s': can't unpin program that wasn't loaded\n", prog->name); return libbpf_err(-EINVAL); } - if (instance < 0 || instance >= prog->instances.nr) { - pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - err = unlink(path); - if (err != 0) - return libbpf_err(-errno); - - pr_debug("unpinned program '%s'\n", path); - - return 0; -} - -int bpf_program__pin(struct bpf_program *prog, const char *path) -{ - int i, err; - - err = make_parent_dir(path); - if (err) - return libbpf_err(err); - - err = check_path(path); - if (err) - return libbpf_err(err); - - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr == 1) { - /* don't create subdirs when pinning single instance */ - return bpf_program_pin_instance(prog, path, 0); - } - - for (i = 0; i < prog->instances.nr; i++) { - char buf[PATH_MAX]; - int len; - - len = 
snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) { - err = -EINVAL; - goto err_unpin; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; - goto err_unpin; - } - - err = bpf_program_pin_instance(prog, buf, i); - if (err) - goto err_unpin; - } - - return 0; - -err_unpin: - for (i = i - 1; i >= 0; i--) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) - continue; - else if (len >= PATH_MAX) - continue; - - bpf_program_unpin_instance(prog, buf, i); - } - - rmdir(path); - - return libbpf_err(err); -} - -int bpf_program__unpin(struct bpf_program *prog, const char *path) -{ - int i, err; - err = check_path(path); if (err) return libbpf_err(err); - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr == 1) { - /* don't create subdirs when pinning single instance */ - return bpf_program_unpin_instance(prog, path, 0); - } - - for (i = 0; i < prog->instances.nr; i++) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); - - err = bpf_program_unpin_instance(prog, buf, i); - if (err) - return err; - } - - err = rmdir(path); + err = unlink(path); if (err) return libbpf_err(-errno); + pr_debug("prog '%s': unpinned from '%s'\n", prog->name, path); return 0; } @@ -8672,13 +8442,6 @@ int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) return 0; } -static int bpf_program_nth_fd(const struct bpf_program *prog, int n); - -int bpf_program__fd(const struct bpf_program *prog) -{ - return bpf_program_nth_fd(prog, 0); -} - const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) { return prog->insns; @@ -8709,27 +8472,15 @@ int bpf_program__set_insns(struct bpf_program *prog, return 0; } -static int bpf_program_nth_fd(const struct bpf_program *prog, int n) +int bpf_program__fd(const struct bpf_program *prog) { - int fd; - if (!prog) return libbpf_err(-EINVAL); - if (n >= prog->instances.nr || n < 0) { - pr_warn("Can't get the %dth fd from program %s: only %d instances\n", - n, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - fd = prog->instances.fds[n]; - if (fd < 0) { - pr_warn("%dth instance of program '%s' is invalid\n", - n, prog->name); + if (prog->fd < 0) return libbpf_err(-ENOENT); - } - return fd; + return prog->fd; } __alias(bpf_program__type) -- cgit v1.2.3 From 450b167fb9be91a8164d3f3d734674f5fe95b22d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:24 -0700 Subject: libbpf: clean up SEC() handling Get rid of sloppy prefix logic and remove deprecated xdp_{devmap,cpumap} sections. 
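For illustration, a minimal sketch of how SEC() spellings are affected
(the program name here is made up for the example; the valid names are
those in this patch's section_defs table):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* No longer recognized after this change:
   *   SEC("xdp_devmap/fwd")  - deprecated prefix; use SEC("xdp/devmap")
   *   SEC("xdp_monitor")     - used to prefix-match "xdp"; the section
   *                            name must now be exactly "xdp"
   */
  SEC("xdp/devmap")
  int xdp_devmap_prog(struct xdp_md *ctx)
  {
          return XDP_PASS;
  }

  char _license[] SEC("license") = "GPL";
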
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-13-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 119 +++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 72 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 65f2a57bc78d..ae58cf2898ad 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -346,12 +346,8 @@ enum sec_def_flags { SEC_ATTACH_BTF = 4, /* BPF program type allows sleeping/blocking in kernel */ SEC_SLEEPABLE = 8, - /* allow non-strict prefix matching */ - SEC_SLOPPY_PFX = 16, /* BPF program support non-linear XDP buffer */ - SEC_XDP_FRAGS = 32, - /* deprecated sec definitions not supposed to be used */ - SEC_DEPRECATED = 64, + SEC_XDP_FRAGS = 16, }; struct bpf_sec_def { @@ -6785,11 +6781,6 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS)) opts->prog_flags |= BPF_F_XDP_HAS_FRAGS; - if (def & SEC_DEPRECATED) { - pr_warn("SEC(\"%s\") is deprecated, please see https://github.com/libbpf/libbpf/wiki/Libbpf-1.0-migration-guide#bpf-program-sec-annotation-deprecations for details\n", - prog->sec_name); - } - if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) { int btf_obj_fd = 0, btf_type_id = 0, err; const char *attach_name; @@ -8586,9 +8577,9 @@ static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_li static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); static const struct bpf_sec_def section_defs[] = { - SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), + SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE), + SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE), SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), @@ -8599,8 +8590,8 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), - SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX | SEC_DEPRECATED), - SEC_DEF("action", SCHED_ACT, 0, SEC_NONE | SEC_SLOPPY_PFX), + SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), @@ -8622,50 +8613,48 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), SEC_DEF("xdp.frags/devmap", XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS), SEC_DEF("xdp/devmap", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE), - SEC_DEF("xdp_devmap/", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE | SEC_DEPRECATED), SEC_DEF("xdp.frags/cpumap", XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS), SEC_DEF("xdp/cpumap", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE), - SEC_DEF("xdp_cpumap/", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE | SEC_DEPRECATED), 
SEC_DEF("xdp.frags", XDP, BPF_XDP, SEC_XDP_FRAGS), - SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, 
BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT), + SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE), + SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE), + SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE), + SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE), + SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE), + SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), + SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), + SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE), + SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT), SEC_DEF("struct_ops+", STRUCT_OPS, 0, SEC_NONE), - SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), }; static size_t custom_sec_def_cnt; @@ -8760,8 +8749,7 @@ int libbpf_unregister_prog_handler(int handler_id) return 0; } -static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name, - bool allow_sloppy) +static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name) { size_t len = strlen(sec_def->sec); @@ -8786,17 +8774,6 @@ static 
bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_n return false; } - /* SEC_SLOPPY_PFX definitions are allowed to be just prefix - * matches, unless strict section name mode - * (LIBBPF_STRICT_SEC_NAME) is enabled, in which case the - * match has to be exact. - */ - if (allow_sloppy && str_has_pfx(sec_name, sec_def->sec)) - return true; - - /* Definitions not marked SEC_SLOPPY_PFX (e.g., - * SEC("syscall")) are exact matches in both modes. - */ return strcmp(sec_name, sec_def->sec) == 0; } @@ -8804,20 +8781,18 @@ static const struct bpf_sec_def *find_sec_def(const char *sec_name) { const struct bpf_sec_def *sec_def; int i, n; - bool strict = libbpf_mode & LIBBPF_STRICT_SEC_NAME, allow_sloppy; n = custom_sec_def_cnt; for (i = 0; i < n; i++) { sec_def = &custom_sec_defs[i]; - if (sec_def_matches(sec_def, sec_name, false)) + if (sec_def_matches(sec_def, sec_name)) return sec_def; } n = ARRAY_SIZE(section_defs); for (i = 0; i < n; i++) { sec_def = §ion_defs[i]; - allow_sloppy = (sec_def->cookie & SEC_SLOPPY_PFX) && !strict; - if (sec_def_matches(sec_def, sec_name, allow_sloppy)) + if (sec_def_matches(sec_def, sec_name)) return sec_def; } -- cgit v1.2.3 From 31e42721976b9c445477038f8a4006150cd27a60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:25 -0700 Subject: selftests/bpf: remove last tests with legacy BPF map definitions Libbpf 1.0 stops support legacy-style BPF map definitions. Selftests has been migrated away from using legacy BPF map definitions except for two selftests, to make sure that legacy functionality still worked in pre-1.0 libbpf. Now it's time to let those tests go as libbpf 1.0 is imminent. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-14-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_legacy.h | 9 ---- tools/testing/selftests/bpf/prog_tests/btf.c | 1 - tools/testing/selftests/bpf/progs/test_btf_haskv.c | 51 ---------------------- tools/testing/selftests/bpf/progs/test_btf_newkv.c | 18 -------- 4 files changed, 79 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/test_btf_haskv.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_legacy.h b/tools/testing/selftests/bpf/bpf_legacy.h index 719ab56cdb5d..845209581440 100644 --- a/tools/testing/selftests/bpf/bpf_legacy.h +++ b/tools/testing/selftests/bpf/bpf_legacy.h @@ -2,15 +2,6 @@ #ifndef __BPF_LEGACY__ #define __BPF_LEGACY__ -#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ - struct ____btf_map_##name { \ - type_key key; \ - type_val value; \ - }; \ - struct ____btf_map_##name \ - __attribute__ ((section(".maps." 
#name), used)) \ - ____btf_map_##name = { } - /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions */ diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 1fd792a92a1c..941b0100bafa 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4651,7 +4651,6 @@ struct btf_file_test { }; static struct btf_file_test file_tests[] = { - { .file = "test_btf_haskv.o", }, { .file = "test_btf_newkv.o", }, { .file = "test_btf_nokv.o", .btf_kv_notfound = true, }, }; diff --git a/tools/testing/selftests/bpf/progs/test_btf_haskv.c b/tools/testing/selftests/bpf/progs/test_btf_haskv.c deleted file mode 100644 index 07c94df13660..000000000000 --- a/tools/testing/selftests/bpf/progs/test_btf_haskv.c +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018 Facebook */ -#include -#include -#include "bpf_legacy.h" - -struct ipv_counts { - unsigned int v4; - unsigned int v6; -}; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -struct bpf_map_def SEC("maps") btf_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(struct ipv_counts), - .max_entries = 4, -}; -#pragma GCC diagnostic pop - -BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts); - -__attribute__((noinline)) -int test_long_fname_2(void) -{ - struct ipv_counts *counts; - int key = 0; - - counts = bpf_map_lookup_elem(&btf_map, &key); - if (!counts) - return 0; - - counts->v6++; - - return 0; -} - -__attribute__((noinline)) -int test_long_fname_1(void) -{ - return test_long_fname_2(); -} - -SEC("dummy_tracepoint") -int _dummy_tracepoint(void *arg) -{ - return test_long_fname_1(); -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_btf_newkv.c b/tools/testing/selftests/bpf/progs/test_btf_newkv.c index 762671a2e90c..251854a041b5 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_newkv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_newkv.c @@ -9,19 +9,6 @@ struct ipv_counts { unsigned int v6; }; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -/* just to validate we can handle maps in multiple sections */ -struct bpf_map_def SEC("maps") btf_map_legacy = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(long long), - .max_entries = 4, -}; -#pragma GCC diagnostic pop - -BPF_ANNOTATE_KV_PAIR(btf_map_legacy, int, struct ipv_counts); - struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); @@ -41,11 +28,6 @@ int test_long_fname_2(void) counts->v6++; - /* just verify we can reference both maps */ - counts = bpf_map_lookup_elem(&btf_map_legacy, &key); - if (!counts) - return 0; - return 0; } -- cgit v1.2.3 From bd054102a8c7f36ff03446cc41822601180241f3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:26 -0700 Subject: libbpf: enforce strict libbpf 1.0 behaviors Remove support for legacy features and behaviors that previously had to be disabled by calling libbpf_set_strict_mode(): - legacy BPF map definitions are not supported now; - RLIMIT_MEMLOCK auto-setting, if necessary, is always on (but see libbpf_set_memlock_rlim()); - program name is used for program pinning (instead of section name); - cleaned up error returning logic; - entry BPF programs should have SEC() always. 
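
As a sketch of what the stricter error-returning convention means for
callers (the object file name below is just a placeholder), code that
previously had to opt in via libbpf_set_strict_mode() now gets this
behavior unconditionally:

  #include <errno.h>
  #include <stdio.h>
  #include <bpf/libbpf.h>
  #include <bpf/libbpf_legacy.h>

  int main(void)
  {
          /* No-op as of v1.0, kept so pre-1.0 callers still build */
          libbpf_set_strict_mode(LIBBPF_STRICT_ALL);

          /* Pointer-returning APIs now return NULL on failure and set
           * errno instead of encoding the error code in the pointer.
           */
          struct bpf_object *obj = bpf_object__open("prog.bpf.o");
          if (!obj) {
                  fprintf(stderr, "failed to open object: %d\n", -errno);
                  return 1;
          }

          bpf_object__close(obj);
          return 0;
  }

Keeping or dropping the libbpf_set_strict_mode() call is equally fine;
it remains only for source compatibility with code written against 0.x.
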
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-15-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 4 - tools/lib/bpf/libbpf.c | 212 ++-------------------------------------- tools/lib/bpf/libbpf.h | 34 ------- tools/lib/bpf/libbpf_internal.h | 24 +---- tools/lib/bpf/libbpf_legacy.h | 24 +++++ 5 files changed, 37 insertions(+), 261 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 2f3495d80d69..6d848b02c1e0 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -147,10 +147,6 @@ int bump_rlimit_memlock(void) { struct rlimit rlim; - /* this the default in libbpf 1.0, but for now user has to opt-in explicitly */ - if (!(libbpf_mode & LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK)) - return 0; - /* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */ if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT)) return 0; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ae58cf2898ad..e994797bcd48 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -278,12 +278,9 @@ static inline __u64 ptr_to_u64(const void *ptr) return (__u64) (unsigned long) ptr; } -/* this goes away in libbpf 1.0 */ -enum libbpf_strict_mode libbpf_mode = LIBBPF_STRICT_NONE; - int libbpf_set_strict_mode(enum libbpf_strict_mode mode) { - libbpf_mode = mode; + /* as of v1.0 libbpf_set_strict_mode() is a no-op */ return 0; } @@ -367,9 +364,10 @@ struct bpf_sec_def { * linux/filter.h. */ struct bpf_program { - const struct bpf_sec_def *sec_def; + char *name; char *sec_name; size_t sec_idx; + const struct bpf_sec_def *sec_def; /* this program's instruction offset (in number of instructions) * within its containing ELF section */ @@ -389,12 +387,6 @@ struct bpf_program { */ size_t sub_insn_off; - char *name; - /* name with / replaced by _; makes recursive pinning - * in bpf_object__pin_programs easier - */ - char *pin_name; - /* instructions that belong to BPF program; insns[0] is located at * sec_insn_off instruction within its ELF section in ELF file, so * when mapping ELF file instruction index to the local instruction, @@ -695,7 +687,6 @@ static void bpf_program__exit(struct bpf_program *prog) bpf_program__unload(prog); zfree(&prog->name); zfree(&prog->sec_name); - zfree(&prog->pin_name); zfree(&prog->insns); zfree(&prog->reloc_desc); @@ -704,26 +695,6 @@ static void bpf_program__exit(struct bpf_program *prog) prog->sec_idx = -1; } -static char *__bpf_program__pin_name(struct bpf_program *prog) -{ - char *name, *p; - - if (libbpf_mode & LIBBPF_STRICT_SEC_NAME) - name = strdup(prog->name); - else - name = strdup(prog->sec_name); - - if (!name) - return NULL; - - p = name; - - while ((p = strchr(p, '/'))) - *p = '_'; - - return name; -} - static bool insn_is_subprog_call(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && @@ -790,10 +761,6 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, if (!prog->name) goto errout; - prog->pin_name = __bpf_program__pin_name(prog); - if (!prog->pin_name) - goto errout; - prog->insns = malloc(insn_data_sz); if (!prog->insns) goto errout; @@ -2007,143 +1974,6 @@ static int bpf_object__init_kconfig_map(struct bpf_object *obj) return 0; } -static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) -{ - Elf_Data *symbols = obj->efile.symbols; - int i, map_def_sz = 0, nr_maps = 0, nr_syms; - Elf_Data *data = NULL; - Elf_Scn *scn; - - if (obj->efile.maps_shndx < 0) - return 0; - 
- if (libbpf_mode & LIBBPF_STRICT_MAP_DEFINITIONS) { - pr_warn("legacy map definitions in SEC(\"maps\") are not supported\n"); - return -EOPNOTSUPP; - } - - if (!symbols) - return -EINVAL; - - scn = elf_sec_by_idx(obj, obj->efile.maps_shndx); - data = elf_sec_data(obj, scn); - if (!scn || !data) { - pr_warn("elf: failed to get legacy map definitions for %s\n", - obj->path); - return -EINVAL; - } - - /* - * Count number of maps. Each map has a name. - * Array of maps is not supported: only the first element is - * considered. - * - * TODO: Detect array of map and report error. - */ - nr_syms = symbols->d_size / sizeof(Elf64_Sym); - for (i = 0; i < nr_syms; i++) { - Elf64_Sym *sym = elf_sym_by_idx(obj, i); - - if (sym->st_shndx != obj->efile.maps_shndx) - continue; - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) - continue; - nr_maps++; - } - /* Assume equally sized map definitions */ - pr_debug("elf: found %d legacy map definitions (%zd bytes) in %s\n", - nr_maps, data->d_size, obj->path); - - if (!data->d_size || nr_maps == 0 || (data->d_size % nr_maps) != 0) { - pr_warn("elf: unable to determine legacy map definition size in %s\n", - obj->path); - return -EINVAL; - } - map_def_sz = data->d_size / nr_maps; - - /* Fill obj->maps using data in "maps" section. */ - for (i = 0; i < nr_syms; i++) { - Elf64_Sym *sym = elf_sym_by_idx(obj, i); - const char *map_name; - struct bpf_map_def *def; - struct bpf_map *map; - - if (sym->st_shndx != obj->efile.maps_shndx) - continue; - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) - continue; - - map = bpf_object__add_map(obj); - if (IS_ERR(map)) - return PTR_ERR(map); - - map_name = elf_sym_str(obj, sym->st_name); - if (!map_name) { - pr_warn("failed to get map #%d name sym string for obj %s\n", - i, obj->path); - return -LIBBPF_ERRNO__FORMAT; - } - - pr_warn("map '%s' (legacy): legacy map definitions are deprecated, use BTF-defined maps instead\n", map_name); - - if (ELF64_ST_BIND(sym->st_info) == STB_LOCAL) { - pr_warn("map '%s' (legacy): static maps are not supported\n", map_name); - return -ENOTSUP; - } - - map->libbpf_type = LIBBPF_MAP_UNSPEC; - map->sec_idx = sym->st_shndx; - map->sec_offset = sym->st_value; - pr_debug("map '%s' (legacy): at sec_idx %d, offset %zu.\n", - map_name, map->sec_idx, map->sec_offset); - if (sym->st_value + map_def_sz > data->d_size) { - pr_warn("corrupted maps section in %s: last map \"%s\" too small\n", - obj->path, map_name); - return -EINVAL; - } - - map->name = strdup(map_name); - if (!map->name) { - pr_warn("map '%s': failed to alloc map name\n", map_name); - return -ENOMEM; - } - pr_debug("map %d is \"%s\"\n", i, map->name); - def = (struct bpf_map_def *)(data->d_buf + sym->st_value); - /* - * If the definition of the map in the object file fits in - * bpf_map_def, copy it. Any extra fields in our version - * of bpf_map_def will default to zero as a result of the - * calloc above. - */ - if (map_def_sz <= sizeof(struct bpf_map_def)) { - memcpy(&map->def, def, map_def_sz); - } else { - /* - * Here the map structure being read is bigger than what - * we expect, truncate if the excess bits are all zero. - * If they are not zero, reject this map as - * incompatible. 
- */ - char *b; - - for (b = ((char *)def) + sizeof(struct bpf_map_def); - b < ((char *)def) + map_def_sz; b++) { - if (*b != 0) { - pr_warn("maps section in %s: \"%s\" has unrecognized, non-zero options\n", - obj->path, map_name); - if (strict) - return -EINVAL; - } - } - memcpy(&map->def, def, sizeof(struct bpf_map_def)); - } - - /* btf info may not exist but fill it in if it does exist */ - (void) bpf_map_find_btf_info(obj, map); - } - return 0; -} - const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { @@ -2700,12 +2530,11 @@ static int bpf_object__init_maps(struct bpf_object *obj, { const char *pin_root_path; bool strict; - int err; + int err = 0; strict = !OPTS_GET(opts, relaxed_maps, false); pin_root_path = OPTS_GET(opts, pin_root_path, NULL); - err = bpf_object__init_user_maps(obj, strict); err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); err = err ?: bpf_object__init_global_data_maps(obj); err = err ?: bpf_object__init_kconfig_map(obj); @@ -3979,28 +3808,8 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return 0; } -static bool prog_is_subprog(const struct bpf_object *obj, - const struct bpf_program *prog) -{ - /* For legacy reasons, libbpf supports an entry-point BPF programs - * without SEC() attribute, i.e., those in the .text section. But if - * there are 2 or more such programs in the .text section, they all - * must be subprograms called from entry-point BPF programs in - * designated SEC()'tions, otherwise there is no way to distinguish - * which of those programs should be loaded vs which are a subprogram. - * Similarly, if there is a function/program in .text and at least one - * other BPF program with custom SEC() attribute, then we just assume - * .text programs are subprograms (even if they are not called from - * other programs), because libbpf never explicitly supported mixing - * SEC()-designated BPF programs and .text entry-point BPF programs. - * - * In libbpf 1.0 strict mode, we always consider .text - * programs to be subprograms. 
- */ - - if (libbpf_mode & LIBBPF_STRICT_SEC_NAME) - return prog->sec_idx == obj->efile.text_shndx; - +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) +{ return prog->sec_idx == obj->efile.text_shndx && obj->nr_programs > 1; } @@ -8163,8 +7972,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) char buf[PATH_MAX]; int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); + len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); if (len < 0) { err = -EINVAL; goto err_unpin_programs; @@ -8185,8 +7993,7 @@ err_unpin_programs: char buf[PATH_MAX]; int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); + len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); if (len < 0) continue; else if (len >= PATH_MAX) @@ -8210,8 +8017,7 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) char buf[PATH_MAX]; int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); + len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); if (len < 0) return libbpf_err(-EINVAL); else if (len >= PATH_MAX) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5eb3145b8945..e4d5353f757b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -886,40 +886,6 @@ LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, const void *cur_key, void *next_key, size_t key_sz); -/** - * @brief **libbpf_get_error()** extracts the error code from the passed - * pointer - * @param ptr pointer returned from libbpf API function - * @return error code; or 0 if no error occured - * - * Many libbpf API functions which return pointers have logic to encode error - * codes as pointers, and do not return NULL. Meaning **libbpf_get_error()** - * should be used on the return value from these functions immediately after - * calling the API function, with no intervening calls that could clobber the - * `errno` variable. Consult the individual functions documentation to verify - * if this logic applies should be used. - * - * For these API functions, if `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` - * is enabled, NULL is returned on error instead. - * - * If ptr is NULL, then errno should be already set by the failing - * API, because libbpf never returns NULL on success and it now always - * sets errno on error. 
- *
- * Example usage:
- *
- *   struct perf_buffer *pb;
- *
- *   pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, &opts);
- *   err = libbpf_get_error(pb);
- *   if (err) {
- *	  pb = NULL;
- *	  fprintf(stderr, "failed to open perf buffer: %d\n", err);
- *	  goto cleanup;
- *   }
- */
-LIBBPF_API long libbpf_get_error(const void *ptr);
-
 struct bpf_xdp_set_link_opts {
 	size_t sz;
 	int old_fd;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index a1ad145ffa74..9cd7829cbe41 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -15,7 +15,6 @@
 #include
 #include
 #include
-#include "libbpf_legacy.h"
 #include "relo_core.h"
 
 /* make sure libbpf doesn't use kernel-only integer typedefs */
@@ -478,8 +477,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void
 __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name,
 				 __u32 kind);
 
-extern enum libbpf_strict_mode libbpf_mode;
-
 typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type,
 			     const char *sym_name, void *ctx);
 
@@ -498,12 +495,8 @@ static inline int libbpf_err(int ret)
  */
 static inline int libbpf_err_errno(int ret)
 {
-	if (libbpf_mode & LIBBPF_STRICT_DIRECT_ERRS)
-		/* errno is already assumed to be set on error */
-		return ret < 0 ? -errno : ret;
-
-	/* legacy: on error return -1 directly and don't touch errno */
-	return ret;
+	/* errno is already assumed to be set on error */
+	return ret < 0 ? -errno : ret;
 }
 
 /* handle error for pointer-returning APIs, err is assumed to be < 0 always */
@@ -511,12 +504,7 @@ static inline void *libbpf_err_ptr(int err)
 {
 	/* set errno on error, this doesn't break anything */
 	errno = -err;
-
-	if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS)
-		return NULL;
-
-	/* legacy: encode err as ptr */
-	return ERR_PTR(err);
+	return NULL;
 }
 
 /* handle pointer-returning APIs' error handling */
@@ -526,11 +514,7 @@ static inline void *libbpf_ptr(void *ret)
 {
 	if (IS_ERR(ret))
 		errno = -PTR_ERR(ret);
 
-	if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS)
-		return IS_ERR(ret) ? NULL : ret;
-
-	/* legacy: pass-through original pointer */
-	return ret;
+	return IS_ERR(ret) ? NULL : ret;
 }
 
 static inline bool str_is_empty(const char *s)
diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h
index d7bcbd01f66f..863f49df8bf4 100644
--- a/tools/lib/bpf/libbpf_legacy.h
+++ b/tools/lib/bpf/libbpf_legacy.h
@@ -20,6 +20,11 @@
 extern "C" {
 #endif
 
+/* As of libbpf 1.0 libbpf_set_strict_mode() and enum libbpf_strict_mode have
+ * no effect. But they are left in libbpf_legacy.h so that applications that
+ * prepared for libbpf 1.0 before the final release by using
+ * libbpf_set_strict_mode() still work with libbpf 1.0+ without any changes.
+ */
 enum libbpf_strict_mode {
 	/* Turn on all supported strict features of libbpf to simulate libbpf
 	 * v1.0 behavior.
 	 */
@@ -88,6 +93,25 @@ enum libbpf_strict_mode {
 
 LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode);
 
+/**
+ * @brief **libbpf_get_error()** extracts the error code from the passed
+ * pointer
+ * @param ptr pointer returned from libbpf API function
+ * @return error code; or 0 if no error occurred
+ *
+ * Note that as of libbpf 1.0 this function is not necessary and its use is
+ * not recommended. libbpf no longer returns error codes embedded in the
+ * pointer itself. Instead, NULL is returned on error and the error code is
+ * passed through the thread-local errno variable.
+ * **libbpf_get_error()** just returns the -errno
+ * value if it receives NULL, which is correct only if errno hasn't been
+ * modified between the libbpf API call and the corresponding
+ * **libbpf_get_error()** call. Prefer to check the return value for NULL
+ * and use errno directly.
+ *
+ * This API is left in place in libbpf 1.0 so that applications that were
+ * 1.0-ready before the final libbpf 1.0 release keep working without changes.
+ */
+LIBBPF_API long libbpf_get_error(const void *ptr);
+
 #define DECLARE_LIBBPF_OPTS LIBBPF_OPTS
 
 /* "Discouraged" APIs which don't follow consistent libbpf naming patterns.
-- cgit v1.2.3


From ab9a5a05dc480f8994eddd31093a8920b08ee71d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 27 Jun 2022 14:15:27 -0700
Subject: libbpf: fix up few libbpf.map problems

Seems we missed adding 2 APIs to libbpf.map, and another API name was
misspelled. Fix both in libbpf.map.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20220627211527.2245459-16-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 tools/lib/bpf/libbpf.map      | 3 ++-
 tools/lib/bpf/libbpf_legacy.h | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 236efc7338f1..43c6697a6d27 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -326,10 +326,11 @@ LIBBPF_0.7.0 {
 	bpf_xdp_detach;
 	bpf_xdp_query;
 	bpf_xdp_query_id;
+	btf_ext__raw_data;
 	libbpf_probe_bpf_helper;
 	libbpf_probe_bpf_map_type;
 	libbpf_probe_bpf_prog_type;
-	libbpf_set_memlock_rlim_max;
+	libbpf_set_memlock_rlim;
 } LIBBPF_0.6.0;
 
 LIBBPF_0.8.0 {
diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h
index 863f49df8bf4..5b7e0155db6a 100644
--- a/tools/lib/bpf/libbpf_legacy.h
+++ b/tools/lib/bpf/libbpf_legacy.h
@@ -76,8 +76,8 @@ enum libbpf_strict_mode {
 	 * first BPF program or map creation operation. This is done only if
 	 * kernel is too old to support memcg-based memory accounting for BPF
 	 * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY,
-	 * but it can be overriden with libbpf_set_memlock_rlim_max() API.
-	 * Note that libbpf_set_memlock_rlim_max() needs to be called before
+	 * but it can be overridden with libbpf_set_memlock_rlim() API.
+	 * Note that libbpf_set_memlock_rlim() needs to be called before
 	 * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load()
 	 * operation.
 	 */
-- cgit v1.2.3


From 9cfd110a36649f9452120a648f15f32d1c82b99d Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:10:47 +0530
Subject: selftests/powerpc/pmu: Refactor the platform check and add macros to
 find array size/PVR

The platform check for selftest support, "check_pvr_for_sampling_tests",
is specific to sampling tests and includes a PVR check, presence of a
PMU and extended regs support. Extended regs support is needed for
sampling tests which verify that PMU registers are programmed
correctly. There could be other sampling tests which may not need
extended regs; for example, the bhrb filter tests only need a validity
check via event open.

Hence refactor the platform check into a common function
"platform_check_for_tests" that checks only the PVR and the presence of
a PMU. The existing function "check_pvr_for_sampling_tests" will invoke
the common function and additionally include the extended regs checks
specific to sampling. The common function can also be used by tests
other than sampling, like the event code tests.

Add a macro to find array size ("ARRAY_SIZE") to the sampling tests'
"misc.h" file.
This can be used in next tests to find event array size. Also update "include/reg.h" to add macros to find minor and major version from PVR which will be used in testcases. Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-10-atrajeev@linux.vnet.ibm.com --- tools/testing/selftests/powerpc/include/reg.h | 4 ++++ .../selftests/powerpc/pmu/sampling_tests/misc.c | 20 ++++++++++++++++---- .../selftests/powerpc/pmu/sampling_tests/misc.h | 3 +++ 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/include/reg.h b/tools/testing/selftests/powerpc/include/reg.h index c422be8a42b2..2ac7a4c7749c 100644 --- a/tools/testing/selftests/powerpc/include/reg.h +++ b/tools/testing/selftests/powerpc/include/reg.h @@ -55,6 +55,10 @@ #define PVR_VER(pvr) (((pvr) >> 16) & 0xFFFF) #define SPRN_PVR 0x11F +#define PVR_CFG(pvr) (((pvr) >> 8) & 0xF) /* Configuration field */ +#define PVR_MAJ(pvr) (((pvr) >> 4) & 0xF) /* Major revision field */ +#define PVR_MIN(pvr) (((pvr) >> 0) & 0xF) /* Minor revision field */ + #define SPRN_DSCR_PRIV 0x11 /* Privilege State DSCR */ #define SPRN_DSCR 0x03 /* Data Stream Control Register */ #define SPRN_PPR 896 /* Program Priority Register */ diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c index 5a26fc3a9706..eac6420abdf1 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.c @@ -121,12 +121,10 @@ int check_extended_regs_support(void) return -1; } -int check_pvr_for_sampling_tests(void) +int platform_check_for_tests(void) { pvr = PVR_VER(mfspr(SPRN_PVR)); - platform_extended_mask = perf_get_platform_reg_mask(); - /* * Check for supported platforms * for sampling test @@ -138,19 +136,33 @@ int check_pvr_for_sampling_tests(void) * Check PMU driver registered by looking for * PPC_FEATURE2_EBB bit in AT_HWCAP2 */ - if (!have_hwcap2(PPC_FEATURE2_EBB)) + if (!have_hwcap2(PPC_FEATURE2_EBB) || !have_hwcap2(PPC_FEATURE2_ARCH_3_00)) goto out; + return 0; + +out: + printf("%s: Tests unsupported for this platform\n", __func__); + return -1; +} + +int check_pvr_for_sampling_tests(void) +{ + SKIP_IF(platform_check_for_tests()); + + platform_extended_mask = perf_get_platform_reg_mask(); /* check if platform supports extended regs */ if (check_extended_regs_support()) goto out; init_ev_encodes(); return 0; + out: printf("%s: Sampling tests un-supported\n", __func__); return -1; } + /* * Allocate mmap buffer of "mmap_pages" number of * pages. diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h index 874a1596add8..4181755cf5a0 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h @@ -18,6 +18,8 @@ #define MMCR1_RSQ 0x200000000000ULL /* radix scope qual field */ #define BHRB_DISABLE 0x2000000000ULL /* MMCRA BHRB DISABLE bit */ +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + extern int ev_mask_pmcxsel, ev_shift_pmcxsel; extern int ev_mask_marked, ev_shift_marked; extern int ev_mask_comb, ev_shift_comb; @@ -36,6 +38,7 @@ extern int ev_mask_mmcr3_src, ev_shift_mmcr3_src; extern int pvr; extern u64 platform_extended_mask; extern int check_pvr_for_sampling_tests(void); +extern int platform_check_for_tests(void); /* * Event code field extraction macro. 
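As a usage illustration of the refactor above, a non-sampling testcase is
expected to call the lighter common check before anything else. A minimal
sketch (the test body and event code here are hypothetical; only
platform_check_for_tests() is introduced by this patch, the other helpers are
the existing selftest harness ones):

	static int sample_event_code_test(void)
	{
		struct event event;

		/* PVR and PMU presence check only; no extended regs needed */
		SKIP_IF(platform_check_for_tests());

		/* open and close a base event to validate the event code */
		event_init(&event, 0x500fa);
		FAIL_IF(event_open(&event));
		event_close(&event);
		return 0;
	}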
-- cgit v1.2.3 From 2ac05f8f2e4b9068e5bbc0836b35abafd70f02c1 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 10 Jun 2022 19:10:48 +0530 Subject: selftests/powerpc/pmu: Add selftest to check branch stack enablement will not crash on any platforms While enabling branch stack for an event, BHRB (Branch History Rolling Buffer) filter is set using bhrb_filter_map() callback. This callback is not defined for cases like generic_compat_pmu or in case where there is no PMU registered. A fix was added in kernel to address a crash issue observed while enabling branch stack for environments which doesn't have this callback. commit b460b512417a ("powerpc/perf: Fix crashes with generic_compat_pmu & BHRB"). Add perf sampling test to exercise this code path and make sure enabling branch stack shouldn't crash in any platform. Testcase uses software event cycles since software event is available and can be used even in cases without PMU. Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-11-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 2 +- .../pmu/sampling_tests/bhrb_no_crash_wo_pmu_test.c | 59 ++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_no_crash_wo_pmu_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index f4da49d55d57..8d4839cde013 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -6,7 +6,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test \ - mmcra_bhrb_disable_test + mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_no_crash_wo_pmu_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_no_crash_wo_pmu_test.c new file mode 100644 index 000000000000..4644c6782974 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_no_crash_wo_pmu_test.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Athira Rajeev, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +/* + * A perf sampling test for making sure + * enabling branch stack doesn't crash in any + * environment, say: + * - With generic compat PMU + * - without any PMU registered + * - With platform specific PMU + * A fix for bhrb sampling crash was added in kernel + * via commit: b460b512417a ("powerpc/perf: Fix crashes + * with generic_compat_pmu & BHRB") + * + * This testcase exercises this code by doing branch + * stack enable for software event. s/w event is used + * since software event will work even in platform + * without PMU. + */ +static int bhrb_no_crash_wo_pmu_test(void) +{ + struct event event; + + /* + * Init the event for the sampling test. + * This uses software event which works on + * any platform. 
+ */ + event_init_opts(&event, 0, PERF_TYPE_SOFTWARE, "cycles"); + + event.attr.sample_period = 1000; + event.attr.sample_type = PERF_SAMPLE_BRANCH_STACK; + event.attr.disabled = 1; + + /* + * Return code of event_open is not + * considered since test just expects no crash from + * using PERF_SAMPLE_BRANCH_STACK. Also for environment + * like generic compat PMU, branch stack is unsupported. + */ + event_open(&event); + + event_close(&event); + return 0; +} + +int main(void) +{ + return test_harness(bhrb_no_crash_wo_pmu_test, "bhrb_no_crash_wo_pmu_test"); +} -- cgit v1.2.3 From 11bbc524390572dfe1bd0375c7e7ab8f9ddf4b34 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 10 Jun 2022 19:10:49 +0530 Subject: selftests/powerpc/pmu: Add selftest to check PERF_SAMPLE_REGS_INTR option will not crash on any platforms With sampling, --intr-regs option is used for capturing interrupt regs. When --intr-regs option is used, PMU code uses is_sier_available() function which uses PMU flags in the code. In environment where platform specific PMU is not registered, PMU flags is not defined. A fix was added in kernel to address crash while accessing is_sier_available() function when pmu is not set. commit f75e7d73bdf7 ("powerpc/perf: Fix crash with is_sier_available when pmu is not set"). Add perf sampling test to exercise this code and make sure enabling intr_regs shouldn't crash in any platform. Testcase uses software event cycles since software event will work even in cases without PMU. Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-12-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 2 +- .../intr_regs_no_crash_wo_pmu_test.c | 57 ++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/intr_regs_no_crash_wo_pmu_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 8d4839cde013..8d4566dac440 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -6,7 +6,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test \ - mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test + mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test intr_regs_no_crash_wo_pmu_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/intr_regs_no_crash_wo_pmu_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/intr_regs_no_crash_wo_pmu_test.c new file mode 100644 index 000000000000..839d2d225da0 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/intr_regs_no_crash_wo_pmu_test.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Athira Rajeev, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +/* + * A perf sampling test for making sure + * sampling with -intr-regs doesn't crash + * in any environment, say: + * - With generic compat PMU + * - without any PMU registered + * - With platform specific PMU. 
+ * A fix for crash with intr_regs was + * addressed in commit: f75e7d73bdf7 in kernel. + * + * This testcase exercises this code path by doing + * intr_regs using software event. Software event is + * used since s/w event will work even in platform + * without PMU. + */ +static int intr_regs_no_crash_wo_pmu_test(void) +{ + struct event event; + + /* + * Init the event for the sampling test. + * This uses software event which works on + * any platform. + */ + event_init_opts(&event, 0, PERF_TYPE_SOFTWARE, "cycles"); + + event.attr.sample_period = 1000; + event.attr.sample_type = PERF_SAMPLE_REGS_INTR; + event.attr.disabled = 1; + + /* + * Return code of event_open is not considered + * since test just expects no crash from using + * PERF_SAMPLE_REGS_INTR. + */ + event_open(&event); + + event_close(&event); + return 0; +} + +int main(void) +{ + return test_harness(intr_regs_no_crash_wo_pmu_test, "intr_regs_no_crash_wo_pmu_test"); +} -- cgit v1.2.3 From f6380e05aa92b005ac6f38be92afbdd2a0706cff Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 10 Jun 2022 19:10:50 +0530 Subject: selftests/powerpc/pmu: Add selftest for checking valid and invalid bhrb filter maps For PERF_SAMPLE_BRANCH_STACK sample type, different branch_sample_type, ie branch filters are supported. All the branch filters are not supported in powerpc. Example, power10 platform supports any, ind_call and cond branch filters. Whereas, it is different in power9. Testcase checks event open for invalid and valid branch sample types. The branch types for testcase are picked from "perf_branch_sample_type" in perf_event.h Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-13-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 3 +- .../pmu/sampling_tests/bhrb_filter_map_test.c | 105 +++++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 8d4566dac440..ed9befc2f836 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -6,7 +6,8 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test \ - mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test intr_regs_no_crash_wo_pmu_test + mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test intr_regs_no_crash_wo_pmu_test \ + bhrb_filter_map_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c new file mode 100644 index 000000000000..8182647c63c8 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Athira Rajeev, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +/* + * A perf sampling test to check bhrb filter + * map. All the branch filters are not supported + * in powerpc. 
Supported filters in: + * power10: any, any_call, ind_call, cond + * power9: any, any_call + * + * Testcase checks event open for invalid bhrb filter + * types should fail and valid filter types should pass. + * Testcase does validity check for these branch + * sample types. + */ + +/* Invalid types for powerpc */ +/* Valid bhrb filters in power9/power10 */ +int bhrb_filter_map_valid_common[] = { + PERF_SAMPLE_BRANCH_ANY, + PERF_SAMPLE_BRANCH_ANY_CALL, +}; + +/* Valid bhrb filters in power10 */ +int bhrb_filter_map_valid_p10[] = { + PERF_SAMPLE_BRANCH_IND_CALL, + PERF_SAMPLE_BRANCH_COND, +}; + +#define EventCode 0x1001e + +static int bhrb_filter_map_test(void) +{ + struct event event; + int i; + + /* Check for platform support for the test */ + SKIP_IF(platform_check_for_tests()); + + /* + * Skip for Generic compat PMU since + * bhrb filters is not supported + */ + SKIP_IF(check_for_generic_compat_pmu()); + + /* Init the event for the sampling test */ + event_init(&event, EventCode); + + event.attr.sample_period = 1000; + event.attr.sample_type = PERF_SAMPLE_BRANCH_STACK; + event.attr.disabled = 1; + + /* Invalid filter maps which are expected to fail in event_open */ + for (i = PERF_SAMPLE_BRANCH_USER_SHIFT; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + /* Skip the valid branch sample type */ + if (i == PERF_SAMPLE_BRANCH_ANY_SHIFT || i == PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT \ + || i == PERF_SAMPLE_BRANCH_IND_CALL_SHIFT || i == PERF_SAMPLE_BRANCH_COND_SHIFT) + continue; + event.attr.branch_sample_type = 1U << i; + FAIL_IF(!event_open(&event)); + } + + /* valid filter maps for power9/power10 which are expected to pass in event_open */ + for (i = 0; i < ARRAY_SIZE(bhrb_filter_map_valid_common); i++) { + event.attr.branch_sample_type = bhrb_filter_map_valid_common[i]; + FAIL_IF(event_open(&event)); + event_close(&event); + } + + /* + * filter maps which are valid in power10 and invalid in power9. + * PVR check is used here since PMU specific data like bhrb filter + * alternative tests is handled by respective PMU driver code and + * using PVR will work correctly for all cases including generic + * compat mode. + */ + if (PVR_VER(mfspr(SPRN_PVR)) == POWER10) { + for (i = 0; i < ARRAY_SIZE(bhrb_filter_map_valid_p10); i++) { + event.attr.branch_sample_type = bhrb_filter_map_valid_p10[i]; + FAIL_IF(event_open(&event)); + event_close(&event); + } + } else { + for (i = 0; i < ARRAY_SIZE(bhrb_filter_map_valid_p10); i++) { + event.attr.branch_sample_type = bhrb_filter_map_valid_p10[i]; + FAIL_IF(!event_open(&event)); + } + } + + return 0; +} + +int main(void) +{ + return test_harness(bhrb_filter_map_test, "bhrb_filter_map_test"); +} -- cgit v1.2.3 From 0321f2d0ae6959f79f5b8a21b31694b54dbaa35d Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 10 Jun 2022 19:10:51 +0530 Subject: selftests/powerpc/pmu: Add selftest for mmcr1 pmcxsel/unit/cache fields The testcase uses event code "0x21c040" to verify the settings for different fields in Monitor Mode Control Register 1 (MMCR1). The fields include PMCxSEL, PMCXCOMB PMCxUNIT, cache. 
Checks if these fields are translated correctly via perf interface to MMCR1 Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-14-atrajeev@linux.vnet.ibm.com --- .../selftests/powerpc/pmu/sampling_tests/Makefile | 2 +- .../pmu/sampling_tests/mmcr1_sel_unit_cache_test.c | 77 ++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/mmcr1_sel_unit_cache_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index ed9befc2f836..f966d3359c6b 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -7,7 +7,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \ mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test \ mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test intr_regs_no_crash_wo_pmu_test \ - bhrb_filter_map_test + bhrb_filter_map_test mmcr1_sel_unit_cache_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcr1_sel_unit_cache_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcr1_sel_unit_cache_test.c new file mode 100644 index 000000000000..f0c003282630 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcr1_sel_unit_cache_test.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Athira Rajeev, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +#define MALLOC_SIZE (0x10000 * 10) /* Ought to be enough .. */ + +/* The data cache was reloaded from local core's L3 due to a demand load */ +#define EventCode 0x21c040 + +/* + * A perf sampling test for mmcr1 + * fields : pmcxsel, unit, cache. 
+ */
+static int mmcr1_sel_unit_cache(void)
+{
+	struct event event;
+	u64 *intr_regs;
+	char *p;
+	int i;
+
+	/* Check for platform support for the test */
+	SKIP_IF(check_pvr_for_sampling_tests());
+
+	p = malloc(MALLOC_SIZE);
+	FAIL_IF(!p);
+
+	/* Init the event for the sampling test */
+	event_init_sampling(&event, EventCode);
+	event.attr.sample_regs_intr = platform_extended_mask;
+	event.attr.sample_period = 1;
+	FAIL_IF(event_open(&event));
+	event.mmap_buffer = event_sample_buf_mmap(event.fd, 1);
+
+	event_enable(&event);
+
+	/* workload to make the event overflow */
+	for (i = 0; i < MALLOC_SIZE; i += 0x10000)
+		p[i] = i;
+
+	event_disable(&event);
+
+	/* Check for sample count */
+	FAIL_IF(!collect_samples(event.mmap_buffer));
+
+	intr_regs = get_intr_regs(&event, event.mmap_buffer);
+
+	/* Check for intr_regs */
+	FAIL_IF(!intr_regs);
+
+	/*
+	 * Verify that the pmcxsel, unit and cache fields of MMCR1
+	 * match the corresponding event code fields
+	 */
+	FAIL_IF(EV_CODE_EXTRACT(event.attr.config, pmcxsel) !=
+		get_mmcr1_pmcxsel(get_reg_value(intr_regs, "MMCR1"), 1));
+	FAIL_IF(EV_CODE_EXTRACT(event.attr.config, unit) !=
+		get_mmcr1_unit(get_reg_value(intr_regs, "MMCR1"), 1));
+	FAIL_IF(EV_CODE_EXTRACT(event.attr.config, cache) !=
+		get_mmcr1_cache(get_reg_value(intr_regs, "MMCR1"), 1));
+
+	free(p);
+	event_close(&event);
+	return 0;
+}
+
+int main(void)
+{
+	FAIL_IF(test_harness(mmcr1_sel_unit_cache, "mmcr1_sel_unit_cache"));
+}
-- cgit v1.2.3


From 78cd598af648131d2e9a32825c59b8d1e9ec9357 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 10 Jun 2022 19:10:52 +0530
Subject: selftests/powerpc/pmu: Add interface test for bhrb disable field for
 non-branch samples

The testcase uses the "instructions" event to generate samples and
fetches Monitor Mode Control Register A (MMCRA) on overflow. The Branch
History Rolling Buffer (bhrb) disable bit is part of MMCRA and needs to
be verified via the perf interface. In case the sample is not of branch
type, the bhrb disable bit is explicitly set to 1. Testcase checks if
the bhrb disable bit is set in the MMCRA register via the perf
interface on ISA v3.1 platforms.

Signed-off-by: Kajol Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-15-atrajeev@linux.vnet.ibm.com
---
 .../selftests/powerpc/pmu/sampling_tests/Makefile  |  2 +-
 .../pmu/sampling_tests/mmcra_bhrb_disable_no_branch_test.c | 64 ++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_no_branch_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile
index f966d3359c6b..9e67351fb252 100644
--- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile
@@ -7,7 +7,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test
 	mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \
 	mmcr3_src_test mmcra_thresh_marked_sample_test mmcra_thresh_cmp_test \
 	mmcra_bhrb_ind_call_test mmcra_bhrb_any_test mmcra_bhrb_cond_test \
-	mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test intr_regs_no_crash_wo_pmu_test \
-	bhrb_filter_map_test mmcr1_sel_unit_cache_test
+	mmcra_bhrb_disable_test bhrb_no_crash_wo_pmu_test intr_regs_no_crash_wo_pmu_test \
+	bhrb_filter_map_test mmcr1_sel_unit_cache_test mmcra_bhrb_disable_no_branch_test
 
 top_srcdir = ../../../../../..
include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_no_branch_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_no_branch_test.c new file mode 100644 index 000000000000..488c865387e4 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/mmcra_bhrb_disable_no_branch_test.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Kajol Jain, IBM Corp. + */ + +#include +#include + +#include "../event.h" +#include "misc.h" +#include "utils.h" + +extern void thirty_two_instruction_loop(int loops); + +/* Instructions */ +#define EventCode 0x500fa + +/* + * A perf sampling test for mmcra + * field: bhrb_disable. + */ +static int mmcra_bhrb_disable_no_branch_test(void) +{ + struct event event; + u64 *intr_regs; + + /* + * Check for platform support for the test. + * This test is only aplicable on power10 + */ + SKIP_IF(check_pvr_for_sampling_tests()); + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1)); + + /* Init the event for the sampling test */ + event_init_sampling(&event, EventCode); + event.attr.sample_regs_intr = platform_extended_mask; + event.attr.exclude_kernel = 1; + + FAIL_IF(event_open(&event)); + event.mmap_buffer = event_sample_buf_mmap(event.fd, 1); + + FAIL_IF(event_enable(&event)); + + /* workload to make the event overflow */ + thirty_two_instruction_loop(10000); + + FAIL_IF(event_disable(&event)); + + intr_regs = get_intr_regs(&event, event.mmap_buffer); + + /* Check for intr_regs */ + FAIL_IF(!intr_regs); + + /* Verify that bhrb_disable bit is set in MMCRA for non-branch samples */ + FAIL_IF(!get_mmcra_bhrb_disable(get_reg_value(intr_regs, "MMCRA"), 5)); + + event_close(&event); + return 0; +} + +int main(void) +{ + return test_harness(mmcra_bhrb_disable_no_branch_test, "mmcra_bhrb_disable_no_branch_test"); +} -- cgit v1.2.3 From 0a110a4b69dacc30ce4f6c10c0396bd2fd097831 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 10 Jun 2022 19:10:53 +0530 Subject: selftests/powerpc/pmu: Add support for perf event code tests Add new folder for enabling perf event code tests which includes checking for group constraints, valid/invalid events, also checks for event excludes, alternatives so on. A new folder "event_code_tests", is created under "selftests/powerpc/pmu". Also updates the corresponding Makefiles in "selftests/powerpc" and "event_code_tests" folder. Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-16-atrajeev@linux.vnet.ibm.com --- tools/testing/selftests/powerpc/pmu/Makefile | 11 +++++++++-- tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile | 9 +++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index edbd96d3b2ab..30803353bd7c 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -8,7 +8,7 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. 
include ../../lib.mk -all: $(TEST_GEN_PROGS) ebb sampling_tests +all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests $(TEST_GEN_PROGS): $(EXTRA_SOURCES) @@ -27,6 +27,7 @@ override define RUN_TESTS $(DEFAULT_RUN_TESTS) TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests + TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests endef DEFAULT_EMIT_TESTS := $(EMIT_TESTS) @@ -34,6 +35,7 @@ override define EMIT_TESTS $(DEFAULT_EMIT_TESTS) TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests + TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests endef DEFAULT_INSTALL_RULE := $(INSTALL_RULE) @@ -41,12 +43,14 @@ override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install + TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install endef clean: $(RM) $(TEST_GEN_PROGS) $(OUTPUT)/loop.o TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean + TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean ebb: TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all @@ -54,4 +58,7 @@ ebb: sampling_tests: TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all -.PHONY: all run_tests clean ebb sampling_tests +event_code_tests: + TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all + +.PHONY: all run_tests clean ebb sampling_tests event_code_tests diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile new file mode 100644 index 000000000000..6377ae205064 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -m64 + +TEST_GEN_PROGS := + +top_srcdir = ../../../../../.. +include ../../../lib.mk + +$(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ../sampling_tests/misc.h ../sampling_tests/misc.c -- cgit v1.2.3 From 9258c0aa755fac469869dd647a6c3d5299ff7725 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 10 Jun 2022 19:10:54 +0530 Subject: selftests/powerpc/pmu: Add selftest for group constraint check for PMC5 and PMC6 Events using Performance Monitor Counter 5 (PMC5) and Performance Monitor Counter 6 (PMC6) can't have other fields in event code like cache bits, thresholding or marked bit. PMC5 and PMC6 only supports base events: ie 500fa and 600f4. Other combinations should fail. Testcase tries setting other bits in event code for 500fa and 600f4 to check this scenario. 
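For illustration, the pass/fail pattern the testcase asserts is roughly the
following (a sketch using the harness helpers seen throughout these
selftests; the event codes come from the test below, and the passing open of
the base event is an assumption based on the other tests in this series):

	/* PMC5 base event alone is valid and should open */
	event_init(&event, 0x500fa);
	FAIL_IF(event_open(&event));
	event_close(&event);

	/* PMC5 with cache bits set must be rejected */
	event_init(&event, 0x2500fa);
	FAIL_IF(!event_open(&event));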
Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-17-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../event_code_tests/group_constraint_pmc56_test.c | 63 ++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc56_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 6377ae205064..eb0017233b0b 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -m64
 
-TEST_GEN_PROGS :=
+TEST_GEN_PROGS := group_constraint_pmc56_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc56_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc56_test.c
new file mode 100644
index 000000000000..f5ee4796d46c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc56_test.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+/*
+ * Testcase for checking constraint checks for
+ * Performance Monitor Counter 5 (PMC5) and also
+ * Performance Monitor Counter 6 (PMC6). Events using
+ * PMC5/PMC6 shouldn't have other fields in event
+ * code like cache bits, thresholding or marked bit.
+ */
+
+static int group_constraint_pmc56(void)
+{
+	struct event event;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * Events using PMC5 and PMC6 with cache bits
+	 * set in event code are expected to fail.
+	 */
+	event_init(&event, 0x2500fa);
+	FAIL_IF(!event_open(&event));
+
+	event_init(&event, 0x2600f4);
+	FAIL_IF(!event_open(&event));
+
+	/*
+	 * PMC5 and PMC6 only support the base events:
+	 * ie 500fa and 600f4. Other combinations
+	 * should fail.
+	 */
+	event_init(&event, 0x501e0);
+	FAIL_IF(!event_open(&event));
+
+	event_init(&event, 0x6001e);
+	FAIL_IF(!event_open(&event));
+
+	event_init(&event, 0x501fa);
+	FAIL_IF(!event_open(&event));
+
+	/*
+	 * Events using PMC5 and PMC6 with random
+	 * sampling bits set in event code should fail
+	 * to schedule.
+	 */
+	event_init(&event, 0x35340500fa);
+	FAIL_IF(!event_open(&event));
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_pmc56, "group_constraint_pmc56");
+}
-- cgit v1.2.3


From 4000c2e5d40a3ee340c3940949d658fc52a56603 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:10:55 +0530
Subject: selftests/powerpc/pmu: Add selftest to check PMC5/6 is excluded from
 some constraint checks

Events using Performance Monitor Counter 5 (PMC5) and Performance
Monitor Counter 6 (PMC6) should be excluded from the constraint check
when scheduled along with a group of events. For example, a combination
of PMC5, PMC6 and an event with a cache bit will succeed to schedule
even though the first two events don't have the cache bit set. Testcase
uses three events, i.e. 600f4 (cycles), 500fa (instructions) and 22C040
with the cache bit (dc_ic) set, to test this constraint check.
Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-18-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../group_pmc56_exclude_constraints_test.c         | 64 ++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_pmc56_exclude_constraints_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index eb0017233b0b..c0eb28935e6e 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -m64
 
-TEST_GEN_PROGS := group_constraint_pmc56_test
+TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_pmc56_exclude_constraints_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_pmc56_exclude_constraints_test.c
new file mode 100644
index 000000000000..cff9ac170df6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_pmc56_exclude_constraints_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include
+#include
+#include "../sampling_tests/misc.h"
+
+/*
+ * Testcase for group constraint check for
+ * Performance Monitor Counter 5 (PMC5) and also
+ * Performance Monitor Counter 6 (PMC6).
+ * Test that pmc5/6 is excluded from constraint
+ * check when scheduled along with group of events.
+ */
+
+static int group_pmc56_exclude_constraints(void)
+{
+	struct event *e, events[3];
+	int i;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * PMC5/6 is excluded from constraint bit
+	 * check along with group of events. Use
+	 * group of events with PMC5, PMC6 and also
+	 * event with cache bit (dc_ic) set. Test expects
+	 * this set of events to go in as a group.
+	 */
+	e = &events[0];
+	event_init(e, 0x500fa);
+
+	e = &events[1];
+	event_init(e, 0x600f4);
+
+	e = &events[2];
+	event_init(e, 0x22C040);
+
+	FAIL_IF(event_open(&events[0]));
+
+	/*
+	 * The event_open will fail if the constraint check fails.
+	 * Since we are asking for events in a group and since
+	 * PMC5/PMC6 is excluded from group constraints, event_open
+	 * should pass.
+	 */
+	for (i = 1; i < 3; i++)
+		FAIL_IF(event_open_with_group(&events[i], events[0].fd));
+
+	for (i = 0; i < 3; i++)
+		event_close(&events[i]);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_pmc56_exclude_constraints, "group_pmc56_exclude_constraints");
+}
-- cgit v1.2.3


From 827765a449dbc41ad19ab3e31757c93cac47b728 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:10:56 +0530
Subject: selftests/powerpc/pmu: Add selftest to check constraint for number of
 counters in use.

Testcase for group constraint check for the number of counters in use.
The number of programmable counters is from PMC1 to PMC4. Testcase uses
four events with PMC1 to PMC4 and a 5th event without any PMC, which is
expected to fail since it exceeds the number of counters in use.
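For reference, the event codes used below and the counter each one requests
(mapping read off the pmc field of the raw codes in this patch, and confirmed
by the comments in the test; shown only as an illustration):

	0x1001a -> PMC1,  0x200fc -> PMC2,  0x30080 -> PMC3,  0x40054 -> PMC4
	0x0002c -> no fixed PMC; opened as the fifth group member, it must fail to schedule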
Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-19-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../group_constraint_pmc_count_test.c              | 70 ++++++++++++++++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc_count_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index c0eb28935e6e..6310634c5beb 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -m64
 
-TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test
+TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc_count_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc_count_test.c
new file mode 100644
index 000000000000..af7c5c75101c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_pmc_count_test.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+/*
+ * Testcase for number of counters in use.
+ * The number of programmable counters is from
+ * performance monitor counter 1 to performance
+ * monitor counter 4 (PMC1-PMC4). If the number of
+ * counters in use exceeds the limit, the next event
+ * should fail to schedule.
+ */
+
+static int group_constraint_pmc_count(void)
+{
+	struct event *e, events[5];
+	int i;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * Test for number of counters in use.
+	 * Use PMC1 to PMC4 for the leader and the first
+	 * 3 sibling events. Trying to open the fourth
+	 * sibling event should fail here.
+	 */
+	e = &events[0];
+	event_init(e, 0x1001a);
+
+	e = &events[1];
+	event_init(e, 0x200fc);
+
+	e = &events[2];
+	event_init(e, 0x30080);
+
+	e = &events[3];
+	event_init(e, 0x40054);
+
+	e = &events[4];
+	event_init(e, 0x0002c);
+
+	FAIL_IF(event_open(&events[0]));
+
+	/*
+	 * The event_open will fail on event 4 if the
+	 * constraint check fails
+	 */
+	for (i = 1; i < 5; i++) {
+		if (i == 4)
+			FAIL_IF(!event_open_with_group(&events[i], events[0].fd));
+		else
+			FAIL_IF(event_open_with_group(&events[i], events[0].fd));
+	}
+
+	for (i = 1; i < 4; i++)
+		event_close(&events[i]);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_pmc_count, "group_constraint_pmc_count");
+}
-- cgit v1.2.3


From 38b6da45304e55de11b8b79a0f31a4d61818e63e Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:10:57 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check when
 using same PMC

Testcase for group constraint check when using events with the same
PMC. Multiple events in a group asking for the same PMC should fail.
Testcase uses "0x22C040" on PMC2 as leader and also sibling, which is
expected to fail. Using PMC1 for the sibling event should pass the test.
Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-20-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  3 +-
 .../group_constraint_repeat_test.c                 | 56 ++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_repeat_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 6310634c5beb..ace100e3226e 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -m64
 
-TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test
+TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
+	group_constraint_repeat_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_repeat_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_repeat_test.c
new file mode 100644
index 000000000000..371cd05bb3ed
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_repeat_test.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+/* The processor's L1 data cache was reloaded */
+#define EventCode1 0x21C040
+#define EventCode2 0x22C040
+
+/*
+ * Testcase for group constraint check
+ * when using events with same PMC.
+ * Multiple events in a group shouldn't
+ * ask for same PMC. If so it should fail.
+ */
+
+static int group_constraint_repeat(void)
+{
+	struct event event, leader;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * Two events in a group using same PMC
+	 * should fail to get scheduled. Use same PMC2
+	 * for leader and sibling event which is expected
+	 * to fail.
+	 */
+	event_init(&leader, EventCode1);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode1);
+
+	/* Expected to fail since sibling event is requesting same PMC as leader */
+	FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+	event_init(&event, EventCode2);
+
+	/* Expected to pass since sibling event is requesting different PMC */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_repeat, "group_constraint_repeat");
+}
-- cgit v1.2.3


From dc431be3b54901744e84f5f94f0f0a2b5d36bb7f Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:10:58 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check for
 radix_scope_qual field

Testcase for group constraint check for the radix_scope_qual field,
which is used to program Monitor Mode Control Register (MMCR1) bit 18.
All events in the group should match the radix_scope_qual bit, otherwise
event_open for the group should fail. Testcase uses "0x14242"
(PM_DATA_RADIX_PROCESS_L2_PTE_FROM_L2) with the radix_scope_qual bit set
for power10.
Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-21-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../group_constraint_radix_scope_qual_test.c       | 56 ++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_radix_scope_qual_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index ace100e3226e..5b61fb0b9fd6 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -2,7 +2,7 @@
 CFLAGS += -m64
 
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
-	group_constraint_repeat_test
+	group_constraint_repeat_test group_constraint_radix_scope_qual_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_radix_scope_qual_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_radix_scope_qual_test.c
new file mode 100644
index 000000000000..9225618b846a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_radix_scope_qual_test.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+/* PM_DATA_RADIX_PROCESS_L2_PTE_FROM_L2 */
+#define EventCode_1 0x14242
+/* PM_DATA_RADIX_PROCESS_L2_PTE_FROM_L3 */
+#define EventCode_2 0x24242
+
+/*
+ * Testcase for group constraint check for radix_scope_qual
+ * field which is used to program Monitor Mode Control
+ * Register (MMCR1) bit 18.
+ * All events in the group should match the radix_scope_qual
+ * bits, otherwise event_open for the group should fail.
+ */
+
+static int group_constraint_radix_scope_qual(void)
+{
+	struct event event, leader;
+
+	/*
+	 * Check for platform support for the test.
+	 * This test is applicable on power10 only.
+	 */
+	SKIP_IF(platform_check_for_tests());
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1));
+
+	/* Init the events for the group constraint check for radix_scope_qual bits */
+	event_init(&leader, EventCode_1);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, 0x200fc);
+
+	/* Expected to fail as sibling event doesn't request same radix_scope_qual bits as leader */
+	FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+	event_init(&event, EventCode_2);
+	/* Expected to pass as sibling event requests same radix_scope_qual bits as leader */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_radix_scope_qual,
+			    "group_constraint_radix_scope_qual");
+}
-- cgit v1.2.3


From beebeecb47d3b93198fe46922fd4ba2af2090cdd Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:10:59 +0530
Subject: selftests/powerpc/pmu: Add selftest for reserved bits in MMCRA
 Sampling Mode field

Testcase for reserved bits in Monitor Mode Control Register A (MMCRA)
Random Sampling Mode (SM) value. As per the Instruction Set Architecture
(ISA), the values 0x5, 0x9, 0xD, 0x19, 0x1D, 0x1A, 0x1E are reserved for
the sampling mode field.
Test that having these reserved bit values should cause event_open to fail. Input event code in testcases uses these sampling bits along with 401e0 (PM_MRK_INST_CMPL). Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-22-atrajeev@linux.vnet.ibm.com --- .../powerpc/pmu/event_code_tests/Makefile | 2 +- .../reserved_bits_mmcra_sample_elig_mode_test.c | 77 ++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_sample_elig_mode_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 5b61fb0b9fd6..5dd482843572 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -2,7 +2,7 @@ CFLAGS += -m64 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \ - group_constraint_repeat_test group_constraint_radix_scope_qual_test + group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_sample_elig_mode_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_sample_elig_mode_test.c new file mode 100644 index 000000000000..4c119c821b99 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_sample_elig_mode_test.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Athira Rajeev, IBM Corp. + */ + +#include +#include "../event.h" +#include "../sampling_tests/misc.h" + +/* + * Testcase for reserved bits in Monitor Mode Control + * Register A (MMCRA) Random Sampling Mode (SM) value. + * As per Instruction Set Architecture (ISA), the values + * 0x5, 0x9, 0xD, 0x19, 0x1D, 0x1A, 0x1E are reserved + * for sampling mode field. Test that having these reserved + * bit values should cause event_open to fail. + * Input event code uses these sampling bits along with + * 401e0 (PM_MRK_INST_CMPL). + */ + +static int reserved_bits_mmcra_sample_elig_mode(void) +{ + struct event event; + + /* Check for platform support for the test */ + SKIP_IF(platform_check_for_tests()); + + /* Skip for Generic compat PMU */ + SKIP_IF(check_for_generic_compat_pmu()); + + /* + * MMCRA Random Sampling Mode (SM) values: 0x5 + * 0x9, 0xD, 0x19, 0x1D, 0x1A, 0x1E is reserved. + * Expected to fail when using these reserved values. + */ + event_init(&event, 0x50401e0); + FAIL_IF(!event_open(&event)); + + event_init(&event, 0x90401e0); + FAIL_IF(!event_open(&event)); + + event_init(&event, 0xD0401e0); + FAIL_IF(!event_open(&event)); + + event_init(&event, 0x190401e0); + FAIL_IF(!event_open(&event)); + + event_init(&event, 0x1D0401e0); + FAIL_IF(!event_open(&event)); + + event_init(&event, 0x1A0401e0); + FAIL_IF(!event_open(&event)); + + event_init(&event, 0x1E0401e0); + FAIL_IF(!event_open(&event)); + + /* + * MMCRA Random Sampling Mode (SM) value 0x10 + * is reserved in power10 and 0xC is reserved in + * power9. 
	 */
	if (PVR_VER(mfspr(SPRN_PVR)) == POWER10) {
		event_init(&event, 0x100401e0);
		FAIL_IF(!event_open(&event));
	} else if (PVR_VER(mfspr(SPRN_PVR)) == POWER9) {
		event_init(&event, 0xC0401e0);
		FAIL_IF(!event_open(&event));
	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(reserved_bits_mmcra_sample_elig_mode,
+			    "reserved_bits_mmcra_sample_elig_mode");
+}
-- cgit v1.2.3

From 122b6b9e57006520addd9f3a44f6b7e3ce503044 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:11:00 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check MMCRA sample bits

Events with different "sample" field values, which are used to
program Monitor Mode Control Register A (MMCRA), will fail to
schedule in a group. The testcase uses an event with load-only
sampling mode as group leader and an event with store-only sampling
as sibling event, so that it can check that using different sample
bits in the event code makes event_open fail for the group of events.

Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-23-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  3 +-
 .../group_constraint_mmcra_sample_test.c           | 54 ++++++++++++++++++++++
 2 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_mmcra_sample_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 5dd482843572..590b642ef900 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -2,7 +2,8 @@ CFLAGS += -m64
 
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
-	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test
+	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
+	group_constraint_mmcra_sample_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_mmcra_sample_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_mmcra_sample_test.c
new file mode 100644
index 000000000000..ff625b5d80eb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_mmcra_sample_test.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+#define EventCode_1 0x35340401e0
+#define EventCode_2 0x353c0101ec
+#define EventCode_3 0x35340101ec
+/*
+ * Test that using different sample bits in
+ * the event code causes failure in scheduling a
+ * group of events.
+ */
+
+static int group_constraint_mmcra_sample(void)
+{
+	struct event event, leader;
+
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * Events with different "sample" field values
+	 * in a group will fail to schedule.
+	 * Use event with load only sampling mode as
+	 * group leader. Use event with store only sampling
+	 * as sibling event.
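	 * EventCode_2 and EventCode_3 differ only in the nibble
	 * carrying the sample bits, which is why only EventCode_3
	 * can be grouped with the leader below.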
	 */
	event_init(&leader, EventCode_1);
	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_2);
+
+	/* Expected to fail as sibling event doesn't use same sampling bits as leader */
+	FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+	event_init(&event, EventCode_3);
+
+	/* Expected to pass as sibling event uses same sampling bits as leader */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_mmcra_sample, "group_constraint_mmcra_sample");
+}
-- cgit v1.2.3

From 5196a27978dcc74251eab14cffa8fa96813e0365 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:11:01 +0530
Subject: selftests/powerpc/pmu: Add selftest for checking invalid bits in event code

Some of the bits in the event code are reserved for specific
platforms. Event code bits 52-59 are reserved in power9, whereas in
power10 these are used for programming Monitor Mode Control
Register 3 (MMCR3). Bit 9 in the event code is reserved in power9,
whereas it is used for programming the "radix_scope_qual" bit 18 in
Monitor Mode Control Register 1 (MMCR1). The testcase ensures that
using reserved bits in the event code causes event_open to fail.

Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-24-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../pmu/event_code_tests/invalid_event_code_test.c | 67 ++++++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/invalid_event_code_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 590b642ef900..1ce1ef4586fd 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -3,7 +3,7 @@ CFLAGS += -m64
 
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
-	group_constraint_mmcra_sample_test
+	group_constraint_mmcra_sample_test invalid_event_code_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/invalid_event_code_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/invalid_event_code_test.c
new file mode 100644
index 000000000000..f51fcab837fc
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/invalid_event_code_test.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+/* The data cache was reloaded from local core's L3 due to a demand load */
+#define EventCode_1 0x1340000001c040
+/* PM_DATA_RADIX_PROCESS_L2_PTE_FROM_L2 */
+#define EventCode_2 0x14242
+/* Event code with IFM, EBB, BHRB bits set in event code */
+#define EventCode_3 0xf00000000000001e
+
+/*
+ * Some of the bits in the event code are
+ * reserved for specific platforms.
+ * Event code bits 52-59 are reserved in power9,
+ * whereas in power10, these are used for programming
+ * Monitor Mode Control Register 3 (MMCR3).
 * Bit 9 in the event code is reserved in power9,
+ * whereas it is used for programming the "radix_scope_qual"
+ * bit 18 in Monitor Mode Control Register 1 (MMCR1).
+ *
+ * Testcase to ensure that using reserved bits in
+ * the event code causes event_open to fail.
+ */
+
+static int invalid_event_code(void)
+{
+	struct event event;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * Events using MMCR3 bits and radix scope qual bits
+	 * should fail in power9 and should succeed in power10.
+	 * Init the events and check for pass/fail in event open.
+	 */
+	if (have_hwcap2(PPC_FEATURE2_ARCH_3_1)) {
+		event_init(&event, EventCode_1);
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init(&event, EventCode_2);
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+	} else {
+		event_init(&event, EventCode_1);
+		FAIL_IF(!event_open(&event));
+
+		event_init(&event, EventCode_2);
+		FAIL_IF(!event_open(&event));
+	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(invalid_event_code, "invalid_event_code");
+}
-- cgit v1.2.3

From 0c90263339da3e4cdcbf57cfa43d6d866c3ac95e Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:11:02 +0530
Subject: selftests/powerpc/pmu: Add selftest for reserved bit check for MMCRA thresh_ctl field

Testcase for reserved bits in Monitor Mode Control Register A
(MMCRA) thresh_ctl bits. For MMCRA[48:51]/[52:55] Threshold
Start/Stop, 0b11110000/0b00001111 are reserved.

Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-25-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../reserved_bits_mmcra_thresh_ctl_test.c          | 44 ++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_thresh_ctl_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 1ce1ef4586fd..e50570794337 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -3,7 +3,7 @@ CFLAGS += -m64
 
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
-	group_constraint_mmcra_sample_test invalid_event_code_test
+	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_thresh_ctl_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_thresh_ctl_test.c
new file mode 100644
index 000000000000..4ea1c2f8913f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/reserved_bits_mmcra_thresh_ctl_test.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+/*
+ * Testcase for reserved bits in Monitor Mode
+ * Control Register A (MMCRA) thresh_ctl bits.
+ * For MMCRA[48:51]/[52:55] Threshold Start/Stop,
+ * 0b11110000/0b00001111 are reserved.
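+ * As the event codes below show, these reserved patterns
+ * occupy event-code bits 32-39 (0xf0340401e0 and 0x0f340401e0).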
+ */
+
+static int reserved_bits_mmcra_thresh_ctl(void)
+{
+	struct event event;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/* Skip for Generic compat PMU */
+	SKIP_IF(check_for_generic_compat_pmu());
+
+	/*
+	 * MMCRA[48:51]/[52:55] Threshold Start/Stop
+	 * event selection. 0b11110000/0b00001111 are reserved.
+	 * Expected to fail when using these reserved values.
+	 */
+	event_init(&event, 0xf0340401e0);
+	FAIL_IF(!event_open(&event));
+
+	event_init(&event, 0x0f340401e0);
+	FAIL_IF(!event_open(&event));
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(reserved_bits_mmcra_thresh_ctl, "reserved_bits_mmcra_thresh_ctl");
+}
-- cgit v1.2.3

From a77c69766c7d4a213e65a4ecdedda7c22f2deb01 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:11:03 +0530
Subject: selftests/powerpc/pmu: Add selftest for blacklist events check in power9

Some of the events are blacklisted in power9. The list of
blacklisted events is noted in power9-events-list.h. Trying to do
event_open for any of these blacklisted events will cause a failure.
The testcase ensures that using blacklisted events causes
event_open to fail in power9. This test is only applicable on power9
DD2.1 and DD2.2, and hence the test adds checks to skip on other
platforms.

Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-26-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |   3 +-
 .../pmu/event_code_tests/blacklisted_events_test.c | 132 +++++++++++++++++++++
 2 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/blacklisted_events_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index e50570794337..a5916a938154 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -3,7 +3,8 @@ CFLAGS += -m64
 
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
-	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test
+	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
+	blacklisted_events_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/blacklisted_events_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/blacklisted_events_test.c
new file mode 100644
index 000000000000..fafeff19cb34
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/blacklisted_events_test.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+#define PM_DTLB_MISS_16G 0x1c058
+#define PM_DERAT_MISS_2M 0x1c05a
+#define PM_DTLB_MISS_2M 0x1c05c
+#define PM_MRK_DTLB_MISS_1G 0x1d15c
+#define PM_DTLB_MISS_4K 0x2c056
+#define PM_DERAT_MISS_1G 0x2c05a
+#define PM_MRK_DERAT_MISS_2M 0x2d152
+#define PM_MRK_DTLB_MISS_4K 0x2d156
+#define PM_MRK_DTLB_MISS_16G 0x2d15e
+#define PM_DTLB_MISS_64K 0x3c056
+#define PM_MRK_DERAT_MISS_1G 0x3d152
+#define PM_MRK_DTLB_MISS_64K 0x3d156
+#define PM_DISP_HELD_SYNC_HOLD 0x4003c
+#define PM_DTLB_MISS_16M 0x4c056
+#define PM_DTLB_MISS_1G 0x4c05a
+#define PM_MRK_DTLB_MISS_16M 0x4c15e
+#define PM_MRK_ST_DONE_L2 0x10134
+#define PM_RADIX_PWC_L1_HIT 0x1f056
+#define PM_FLOP_CMPL 0x100f4
+#define PM_MRK_NTF_FIN 0x20112
+#define PM_RADIX_PWC_L2_HIT 0x2d024
+#define PM_IFETCH_THROTTLE 0x3405e
+#define PM_MRK_L2_TM_ST_ABORT_SISTER 0x3e15c
+#define PM_RADIX_PWC_L3_HIT 0x3f056
+#define PM_RUN_CYC_SMT2_MODE 0x3006c
+#define PM_TM_TX_PASS_RUN_INST 0x4e014
+
+#define PVR_POWER9_CUMULUS 0x00002000
+
+int blacklist_events_dd21[] = {
+	PM_MRK_ST_DONE_L2,
+	PM_RADIX_PWC_L1_HIT,
+	PM_FLOP_CMPL,
+	PM_MRK_NTF_FIN,
+	PM_RADIX_PWC_L2_HIT,
+	PM_IFETCH_THROTTLE,
+	PM_MRK_L2_TM_ST_ABORT_SISTER,
+	PM_RADIX_PWC_L3_HIT,
+	PM_RUN_CYC_SMT2_MODE,
+	PM_TM_TX_PASS_RUN_INST,
+	PM_DISP_HELD_SYNC_HOLD,
+};
+
+int blacklist_events_dd22[] = {
+	PM_DTLB_MISS_16G,
+	PM_DERAT_MISS_2M,
+	PM_DTLB_MISS_2M,
+	PM_MRK_DTLB_MISS_1G,
+	PM_DTLB_MISS_4K,
+	PM_DERAT_MISS_1G,
+	PM_MRK_DERAT_MISS_2M,
+	PM_MRK_DTLB_MISS_4K,
+	PM_MRK_DTLB_MISS_16G,
+	PM_DTLB_MISS_64K,
+	PM_MRK_DERAT_MISS_1G,
+	PM_MRK_DTLB_MISS_64K,
+	PM_DISP_HELD_SYNC_HOLD,
+	PM_DTLB_MISS_16M,
+	PM_DTLB_MISS_1G,
+	PM_MRK_DTLB_MISS_16M,
+};
+
+int pvr_min;
+
+/*
+ * check for power9 support for 2.1 and
+ * 2.2 model where blacklist is applicable.
+ */
+int check_for_power9_version(void)
+{
+	unsigned long pvr = mfspr(SPRN_PVR);
+
+	pvr_min = PVR_MIN(pvr);
+
+	SKIP_IF(PVR_VER(pvr) != POWER9);
+	SKIP_IF(!(pvr & PVR_POWER9_CUMULUS));
+
+	/* Blacklist applies only to DD2.1 and DD2.2; skip DD2.3 */
+	SKIP_IF(!(3 - pvr_min));
+
+	return 0;
+}
+
+/*
+ * Testcase to ensure that using blacklisted events
+ * causes event_open to fail in power9.
+ */
+
+static int blacklisted_events(void)
+{
+	struct event event;
+	int i = 0;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * check for power9 support for 2.1 and
+	 * 2.2 model where blacklist is applicable.
+	 */
+	SKIP_IF(check_for_power9_version());
+
+	/* Skip for Generic compat mode */
+	SKIP_IF(check_for_generic_compat_pmu());
+
+	if (pvr_min == 1) {
+		for (i = 0; i < ARRAY_SIZE(blacklist_events_dd21); i++) {
+			event_init(&event, blacklist_events_dd21[i]);
+			FAIL_IF(!event_open(&event));
+		}
+	} else if (pvr_min == 2) {
+		for (i = 0; i < ARRAY_SIZE(blacklist_events_dd22); i++) {
+			event_init(&event, blacklist_events_dd22[i]);
+			FAIL_IF(!event_open(&event));
+		}
+	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(blacklisted_events, "blacklisted_events");
+}
-- cgit v1.2.3

From 5958ad4392b0f437605ade8bab42447b0d97ad8c Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:11:04 +0530
Subject: selftests/powerpc/pmu: Add selftest for event alternatives for power9

The platform-specific PMU supports alternative events for some of the
event codes. During perf_event_open, if an event group doesn't match
the constraint check criteria, a further lookup is done to find an
alternative event.
The code checks whether it is possible to schedule the events as a
group using alternative events. The testcase exercises the alternative
event lookup code for power9. For example, since events on the same
PMC can't go in as a group, using PM_RUN_CYC_ALT (0x200f4) and
PM_BR_TAKEN_CMPL (0x200fa) together would normally fail. But since
RUN_CYC (0x600f4) is an alternative event for 0x200f4, it is possible
to use 0x600f4 and 0x200fa as a group. The testcase uses such
combinations for all events in power9 which have an alternative event.

Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-27-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |   2 +-
 .../event_code_tests/event_alternatives_tests_p9.c | 116 +++++++++++++++++++++
 2 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p9.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index a5916a938154..cf27e612290e 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -4,7 +4,7 @@ CFLAGS += -m64
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
-	blacklisted_events_test
+	blacklisted_events_test event_alternatives_tests_p9
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p9.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p9.c
new file mode 100644
index 000000000000..f7dcf0e0447c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p9.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+#define PM_RUN_CYC_ALT 0x200f4
+#define PM_INST_DISP 0x200f2
+#define PM_BR_2PATH 0x20036
+#define PM_LD_MISS_L1 0x3e054
+#define PM_RUN_INST_CMPL_ALT 0x400fa
+
+#define EventCode_1 0x200fa
+#define EventCode_2 0x200fc
+#define EventCode_3 0x300fc
+#define EventCode_4 0x400fc
+
+/*
+ * Check for event alternatives.
+ */
+
+static int event_alternatives_tests_p9(void)
+{
+	struct event event, leader;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * PVR check is used here since PMU specific data like
+	 * alternative events is handled by respective PMU driver
+	 * code and using PVR will work correctly for all cases
+	 * including generic compat mode.
+	 */
+	SKIP_IF(PVR_VER(mfspr(SPRN_PVR)) != POWER9);
+
+	/* Skip for generic compat PMU */
+	SKIP_IF(check_for_generic_compat_pmu());
+
+	/* Init the event for PM_RUN_CYC_ALT */
+	event_init(&leader, PM_RUN_CYC_ALT);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_1);
+
+	/*
+	 * Expected to pass since PM_RUN_CYC_ALT in PMC2 has alternative event
+	 * 0x600f4. So it can go in with EventCode_1, which is using PMC2.
+	 */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	event_init(&leader, PM_INST_DISP);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_2);
+	/*
+	 * Expected to pass since PM_INST_DISP in PMC2 has alternative event
+	 * 0x300f2 in PMC3. So it can go in with EventCode_2, which is using PMC2.
+	 */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	event_init(&leader, PM_BR_2PATH);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_2);
+	/*
+	 * Expected to pass since PM_BR_2PATH in PMC2 has alternative event
+	 * 0x40036 in PMC4. So it can go in with EventCode_2, which is using PMC2.
+	 */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	event_init(&leader, PM_LD_MISS_L1);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_3);
+	/*
+	 * Expected to pass since PM_LD_MISS_L1 in PMC3 has alternative event
+	 * 0x400f0 in PMC4. So it can go in with EventCode_3, which is using PMC3.
+	 */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	event_init(&leader, PM_RUN_INST_CMPL_ALT);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_4);
+	/*
+	 * Expected to pass since PM_RUN_INST_CMPL_ALT in PMC4 has alternative event
+	 * 0x500fa in PMC5. So it can go in with EventCode_4, which is using PMC4.
+	 */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(event_alternatives_tests_p9, "event_alternatives_tests_p9");
+}
-- cgit v1.2.3

From 3f1a87425f8c2f9af745923865a4765e36a2ed3c Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:11:05 +0530
Subject: selftests/powerpc/pmu: Add selftest for event alternatives for power10

The platform-specific PMU supports alternative events for some of the
event codes. During perf_event_open, if an event group doesn't match
the constraint check criteria, a further lookup is done to find an
alternative event. The code checks whether it is possible to schedule
the events as a group using alternative events. The testcase exercises
the alternative event lookup code for power10. For example, using PMC1
to PMC4 in a group and then also trying to schedule PM_CYC_ALT
(0x0001e) will fail, since this exceeds the number of programmable
events in the group. But since 0x600f4 is an alternative event for
0x0001e, it is still possible to use 0x0001e in the group. The testcase
uses such combinations for all events in power10 which have an
alternative event.
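For readers who want to see the scheduling behaviour outside of the
selftest framework, here is a minimal sketch using the raw
perf_event_open() syscall directly. It is illustrative only:
event_open_with_group() in these tests is conceptually a
perf_event_open() call that passes the group leader's file descriptor,
and the event codes below are the RUN_CYC alternative (0x600f4) and
0x200fa mentioned above.

/*
 * Illustrative sketch, not part of the patch: open a raw-event group
 * leader and try to attach a sibling event to it.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int leader_fd, sibling_fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = 0x600f4;	/* RUN_CYC, the alternative event */

	/* The group leader is opened with group_fd == -1 */
	leader_fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (leader_fd < 0) {
		perror("leader");
		return 1;
	}

	/*
	 * A sibling joins the group by passing the leader's fd. If the
	 * PMU cannot schedule the two events together and no alternative
	 * event helps, this open fails.
	 */
	attr.config = 0x200fa;
	sibling_fd = perf_event_open(&attr, 0, -1, leader_fd, 0);
	if (sibling_fd < 0)
		perror("sibling");
	else
		close(sibling_fd);

	close(leader_fd);
	return 0;
}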
Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-28-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |   2 +-
 .../event_alternatives_tests_p10.c                 | 109 +++++++++++++++++++++
 2 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p10.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index cf27e612290e..50bcc036dddf 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -4,7 +4,7 @@ CFLAGS += -m64
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
-	blacklisted_events_test event_alternatives_tests_p9
+	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p10.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p10.c
new file mode 100644
index 000000000000..8be7aada6523
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/event_alternatives_tests_p10.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+#define PM_RUN_CYC_ALT 0x200f4
+#define PM_INST_DISP 0x200f2
+#define PM_BR_2PATH 0x20036
+#define PM_LD_MISS_L1 0x3e054
+#define PM_RUN_INST_CMPL_ALT 0x400fa
+
+#define EventCode_1 0x100fc
+#define EventCode_2 0x200fa
+#define EventCode_3 0x300fc
+#define EventCode_4 0x400fc
+
+/*
+ * Check for event alternatives.
+ */
+
+static int event_alternatives_tests_p10(void)
+{
+	struct event *e, events[5];
+	int i;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/*
+	 * PVR check is used here since PMU specific data like
+	 * alternative events is handled by respective PMU driver
+	 * code and using PVR will work correctly for all cases
+	 * including generic compat mode.
+	 */
+	SKIP_IF(PVR_VER(mfspr(SPRN_PVR)) != POWER10);
+
+	SKIP_IF(check_for_generic_compat_pmu());
+
+	/*
+	 * Test for event alternative for 0x0001e
+	 * and 0x00002.
+	 */
+	e = &events[0];
+	event_init(e, 0x0001e);
+
+	e = &events[1];
+	event_init(e, EventCode_1);
+
+	e = &events[2];
+	event_init(e, EventCode_2);
+
+	e = &events[3];
+	event_init(e, EventCode_3);
+
+	e = &events[4];
+	event_init(e, EventCode_4);
+
+	FAIL_IF(event_open(&events[0]));
+
+	/*
+	 * Expected to pass since 0x0001e has alternative event
+	 * 0x600f4 in PMC6. So it can go in with other events
+	 * in PMC1 to PMC4.
+	 */
+	for (i = 1; i < 5; i++)
+		FAIL_IF(event_open_with_group(&events[i], events[0].fd));
+
+	for (i = 0; i < 5; i++)
+		event_close(&events[i]);
+
+	e = &events[0];
+	event_init(e, 0x00002);
+
+	e = &events[1];
+	event_init(e, EventCode_1);
+
+	e = &events[2];
+	event_init(e, EventCode_2);
+
+	e = &events[3];
+	event_init(e, EventCode_3);
+
+	e = &events[4];
+	event_init(e, EventCode_4);
+
+	FAIL_IF(event_open(&events[0]));
+
+	/*
+	 * Expected to pass since 0x00002 has alternative event
+	 * 0x500fa in PMC5. So it can go in with other events
+	 * in PMC1 to PMC4.
+	 */
+	for (i = 1; i < 5; i++)
+		FAIL_IF(event_open_with_group(&events[i], events[0].fd));
+
+	for (i = 0; i < 5; i++)
+		event_close(&events[i]);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(event_alternatives_tests_p10, "event_alternatives_tests_p10");
+}
-- cgit v1.2.3

From 8efeedf5aac77b58f68e6eb9df62758ba1882bb3 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Fri, 10 Jun 2022 19:11:06 +0530
Subject: selftests/powerpc/pmu: Add selftest for PERF_TYPE_HARDWARE events valid check

Testcase to ensure that using an invalid generic event with
PERF_TYPE_HARDWARE will fail.

Invalid generic events in power10 are:
- PERF_COUNT_HW_BUS_CYCLES
- PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
- PERF_COUNT_HW_STALLED_CYCLES_BACKEND
- PERF_COUNT_HW_REF_CPU_CYCLES

Invalid generic events in power9 are:
- PERF_COUNT_HW_BUS_CYCLES
- PERF_COUNT_HW_REF_CPU_CYCLES

The testcase does event_open for valid and invalid generic events to
ensure event_open works for all valid events and fails for the
invalid ones.

Signed-off-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-29-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |   2 +-
 .../event_code_tests/generic_events_valid_test.c   | 130 +++++++++++++++++++++
 2 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/generic_events_valid_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 50bcc036dddf..0d56f1ef530f 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -4,7 +4,7 @@ CFLAGS += -m64
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
-	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10
+	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/generic_events_valid_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/generic_events_valid_test.c
new file mode 100644
index 000000000000..0d237c15d3f2
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/generic_events_valid_test.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Athira Rajeev, IBM Corp.
+ */
+
+#include
+#include
+#include
+#include "../event.h"
+#include "../sampling_tests/misc.h"
+
+/*
+ * Testcase to ensure that using an invalid generic
+ * event with PERF_TYPE_HARDWARE should fail
+ */
+
+static int generic_events_valid_test(void)
+{
+	struct event event;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/* generic events are different in compat_mode */
+	SKIP_IF(check_for_generic_compat_pmu());
+
+	/*
+	 * Invalid generic events in power10:
+	 * - PERF_COUNT_HW_BUS_CYCLES
+	 * - PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
+	 * - PERF_COUNT_HW_STALLED_CYCLES_BACKEND
+	 * - PERF_COUNT_HW_REF_CPU_CYCLES
+	 */
+	if (PVR_VER(mfspr(SPRN_PVR)) == POWER10) {
+		event_init_opts(&event, PERF_COUNT_HW_CPU_CYCLES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_INSTRUCTIONS,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_CACHE_REFERENCES,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_CACHE_MISSES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_BRANCH_MISSES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_BUS_CYCLES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(!event_open(&event));
+
+		event_init_opts(&event, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(!event_open(&event));
+
+		event_init_opts(&event, PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(!event_open(&event));
+
+		event_init_opts(&event, PERF_COUNT_HW_REF_CPU_CYCLES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(!event_open(&event));
+	} else if (PVR_VER(mfspr(SPRN_PVR)) == POWER9) {
+		/*
+		 * Invalid generic events in power9:
+		 * - PERF_COUNT_HW_BUS_CYCLES
+		 * - PERF_COUNT_HW_REF_CPU_CYCLES
+		 */
+		event_init_opts(&event, PERF_COUNT_HW_CPU_CYCLES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_INSTRUCTIONS, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_CACHE_REFERENCES,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_CACHE_MISSES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_BRANCH_MISSES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_BUS_CYCLES, PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(!event_open(&event));
+
+		event_init_opts(&event, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(event_open(&event));
+		event_close(&event);
+
+		event_init_opts(&event, PERF_COUNT_HW_REF_CPU_CYCLES,
+				PERF_TYPE_HARDWARE, "event");
+		FAIL_IF(!event_open(&event));
+	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(generic_events_valid_test, "generic_events_valid_test");
+}
-- cgit v1.2.3

From 20b3073f8727e20332379f145b6eecf580291b2c Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 10 Jun 2022 19:11:07 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check for MMCR0 l2l3_sel bits

In power10, the L2L3 select bits in the event code are used to
program the l2l3_sel field in Monitor Mode Control Register 0
(MMCR0: 56-60). When scheduling events as a group, all events in that
group should match the value in these bits. Otherwise event_open for
the sibling events will fail.

The testcase uses event code "0x010000046080" as leader and the
events "0x26880" and "0x010000026880" as sibling events, and checks
the l2l3_sel constraints via the perf interface for the ISA v3.1
platform.

Signed-off-by: Kajol Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-30-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  3 +-
 .../group_constraint_l2l3_sel_test.c               | 64 ++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_l2l3_sel_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 0d56f1ef530f..58e1a7a2ed4e 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -4,7 +4,8 @@ CFLAGS += -m64
 TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
-	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test
+	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test \
+	group_constraint_l2l3_sel_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_l2l3_sel_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_l2l3_sel_test.c
new file mode 100644
index 000000000000..85a636886069
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_l2l3_sel_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Kajol Jain, IBM Corp.
+ */
+
+#include
+#include
+
+#include "../event.h"
+#include "utils.h"
+#include "../sampling_tests/misc.h"
+
+/* All successful D-side store dispatches for this thread */
+#define EventCode_1 0x010000046080
+/* All successful D-side store dispatches for this thread that were L2 Miss */
+#define EventCode_2 0x26880
+/* All successful D-side store dispatches for this thread that were L2 Miss */
+#define EventCode_3 0x010000026880
+
+/*
+ * Testcase for group constraint check of l2l3_sel bits, which are
+ * used to program the l2l3 select field in Monitor Mode Control Register 0
+ * (MMCR0: 56-60).
+ * All events in the group should match the l2l3_sel bits, otherwise
+ * event_open for the group should fail.
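+ * Note how the qualifier sits in the upper bits of the event code:
+ * EventCode_3 (0x010000026880) is EventCode_2 (0x26880) with the same
+ * upper qualifier bits as the leader (0x010000046080), which is why
+ * only EventCode_3 can group with the leader.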
+ */
+static int group_constraint_l2l3_sel(void)
+{
+	struct event event, leader;
+
+	/*
+	 * Check for platform support for the test.
+	 * This test is only applicable on power10.
+	 */
+	SKIP_IF(platform_check_for_tests());
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1));
+
+	/* Init the events for the group constraint check for l2l3_sel bits */
+	event_init(&leader, EventCode_1);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_2);
+
+	/* Expected to fail as sibling event doesn't request same l2l3_sel bits as leader */
+	FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+	event_close(&event);
+
+	/* Init the event for the group constraint l2l3_sel test */
+	event_init(&event, EventCode_3);
+
+	/* Expected to succeed as sibling event requests same l2l3_sel bits as leader */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_l2l3_sel, "group_constraint_l2l3_sel");
+}
-- cgit v1.2.3

From 291c01ed207d83c8910e0fb21944e6ef84021956 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 10 Jun 2022 19:11:08 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check for MMCR1 cache bits

The data and instruction cache qualifier bits in the event code are
used to program the cache select field in Monitor Mode Control
Register 1 (MMCR1: 16-17). When scheduling events as a group, all
events in that group should match the value in these bits. Otherwise
event_open for the sibling events will fail.

The testcase uses event code "0x1100fc" as leader and other events
like "0x23e054" and "0x13e054" as sibling events to check the l1
cache select field constraints via the perf interface.

Signed-off-by: Kajol Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-31-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../event_code_tests/group_constraint_cache_test.c | 60 ++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_cache_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 58e1a7a2ed4e..dc27ca2ffcad 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -5,7 +5,7 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
 	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test \
-	group_constraint_l2l3_sel_test
+	group_constraint_l2l3_sel_test group_constraint_cache_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_cache_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_cache_test.c
new file mode 100644
index 000000000000..f4be05aa3a3d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_cache_test.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Kajol Jain, IBM Corp.
+ */
+
+#include
+#include
+
+#include "../event.h"
+#include "utils.h"
+#include "../sampling_tests/misc.h"
+
+/* All L1 D cache load references counted at finish, gated by reject */
+#define EventCode_1 0x1100fc
+/* Load Missed L1 */
+#define EventCode_2 0x23e054
+/* Load Missed L1 */
+#define EventCode_3 0x13e054
+
+/*
+ * Testcase for group constraint check of the data and instruction
+ * cache qualifier bits, which are used to program the cache select field in
+ * Monitor Mode Control Register 1 (MMCR1: 16-17) for the l1 cache.
+ * All events in the group should match the cache select bits, otherwise
+ * event_open for the group will fail.
+ */
+static int group_constraint_cache(void)
+{
+	struct event event, leader;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/* Init the events for the group constraint check for l1 cache select bits */
+	event_init(&leader, EventCode_1);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_2);
+
+	/* Expected to fail as sibling event doesn't request same l1 cache select bits as leader */
+	FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+	event_close(&event);
+
+	/* Init the event for the group constraint l1 cache select test */
+	event_init(&event, EventCode_3);
+
+	/* Expected to succeed as sibling event requests same l1 cache select bits as leader */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_cache, "group_constraint_cache");
+}
-- cgit v1.2.3

From 8eaca8c4b4ed9a2058e4f232d56b5973191fec37 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 10 Jun 2022 19:11:09 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check for MMCRA thresh_cmp field

The thresh compare bits for an event are used to program the thresh
compare field in Monitor Mode Control Register A (MMCRA: bits 9-18
for power9 and bits 8-18 for power10). When scheduling events as a
group, all events in that group should match the value in the thresh
compare bits. Otherwise event_open for the sibling events will fail.

The testcase uses event code "0x401e0" as leader and another event
"0x101ec" as sibling event, and checks the thresh compare constraint
via the perf interface.
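One detail worth calling out: on ISA v3.1 (power10) the thresh_cmp
value is not encoded in the event code but is passed separately in
perf_event_attr.config1, which is what the testcase below does with
event.attr.config1. A minimal sketch, reusing the perf_event_open()
wrapper from the earlier grouping example (illustrative only, with the
p10 event code taken from this patch):

/* Open a thresh_cmp group leader on power10 (illustrative sketch). */
static int open_thresh_cmp_leader(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = 0x35340401e0ULL;	/* p10 event code from the patch */
	attr.config1 = 1000;		/* thresh_cmp; siblings must match */

	return perf_event_open(&attr, 0, -1, -1, 0);
}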
Signed-off-by: Kajol Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-32-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../group_constraint_thresh_cmp_test.c             | 96 ++++++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_cmp_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index dc27ca2ffcad..374044062561 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -5,7 +5,7 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
 	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test \
-	group_constraint_l2l3_sel_test group_constraint_cache_test
+	group_constraint_l2l3_sel_test group_constraint_cache_test group_constraint_thresh_cmp_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_cmp_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_cmp_test.c
new file mode 100644
index 000000000000..9f1197104e8c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_cmp_test.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Kajol Jain, IBM Corp.
+ */
+
+#include
+#include
+
+#include "../event.h"
+#include "utils.h"
+#include "../sampling_tests/misc.h"
+
+/*
+ * Primary PMU events used here are PM_MRK_INST_CMPL (0x401e0) and
+ * PM_THRESH_MET (0x101ec).
+ * Threshold event selection used is issue to complete for cycles
+ * Sampling criteria is Load or Store only sampling
+ */
+#define p9_EventCode_1 0x13e35340401e0
+#define p9_EventCode_2 0x17d34340101ec
+#define p9_EventCode_3 0x13e35340101ec
+#define p10_EventCode_1 0x35340401e0
+#define p10_EventCode_2 0x35340101ec
+
+/*
+ * Testcase for group constraint check of thresh_cmp bits, which are
+ * used to program the thresh compare field in Monitor Mode Control Register A
+ * (MMCRA: bits 9-18 for power9 and bits 8-18 for power10).
+ * All events in the group should match the thresh compare bits, otherwise
+ * event_open for the group will fail.
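+ * On power10 the compare value travels in attr.config1, while the
+ * power9 event codes carry it in the event code itself; both branches
+ * below exercise the same constraint.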
+ */
+static int group_constraint_thresh_cmp(void)
+{
+	struct event event, leader;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	if (have_hwcap2(PPC_FEATURE2_ARCH_3_1)) {
+		/* Init the events for the group constraint check for thresh_cmp bits */
+		event_init(&leader, p10_EventCode_1);
+
+		/* Add the thresh_cmp value for leader in config1 */
+		leader.attr.config1 = 1000;
+		FAIL_IF(event_open(&leader));
+
+		event_init(&event, p10_EventCode_2);
+
+		/* Add a different thresh_cmp value from the leader event in config1 */
+		event.attr.config1 = 2000;
+
+		/* Expected to fail as sibling and leader event request different thresh_cmp bits */
+		FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+		event_close(&event);
+
+		/* Init the event for the group constraint thresh compare test */
+		event_init(&event, p10_EventCode_2);
+
+		/* Add the same thresh_cmp value for leader and sibling event in config1 */
+		event.attr.config1 = 1000;
+
+		/* Expected to succeed as sibling and leader event request same thresh_cmp bits */
+		FAIL_IF(event_open_with_group(&event, leader.fd));
+
+		event_close(&leader);
+		event_close(&event);
+	} else {
+		/* Init the events for the group constraint check for thresh_cmp bits */
+		event_init(&leader, p9_EventCode_1);
+		FAIL_IF(event_open(&leader));
+
+		event_init(&event, p9_EventCode_2);
+
+		/* Expected to fail as sibling and leader event request different thresh_cmp bits */
+		FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+		event_close(&event);
+
+		/* Init the event for the group constraint thresh compare test */
+		event_init(&event, p9_EventCode_3);
+
+		/* Expected to succeed as sibling and leader event request same thresh_cmp bits */
+		FAIL_IF(event_open_with_group(&event, leader.fd));
+
+		event_close(&leader);
+		event_close(&event);
+	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_thresh_cmp, "group_constraint_thresh_cmp");
+}
-- cgit v1.2.3

From 142c9bd1ff215f364a5d683a9dd0b7c413397185 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 10 Jun 2022 19:11:10 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint for unit and pmc field in p9

The unit and pmc bits in the event code are used to program the unit
and pmc fields in Monitor Mode Control Register 1 (MMCR1). For the
power9 platform, in case the unit field value is within 6 to 9, one
of the events in the group should use PMC4. Otherwise event_open
should fail for that group.
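The unit and pmc values are visible directly in the event codes the
test uses. The sketch below decodes them; the shifts follow the raw
event format commonly advertised for these fields (pmc in config bits
16-19, unit in bits 12-15) and should be treated as an illustration
under that assumption, not as a definition from this patch:

/* Illustrative decode of the pmc and unit nibbles (assumed layout). */
#include <stdio.h>

#define EVENT_PMC(code)		(((code) >> 16) & 0xf)
#define EVENT_UNIT(code)	(((code) >> 12) & 0xf)

int main(void)
{
	unsigned long codes[] = { 0x26080, 0x46080, 0x36880 };
	unsigned long i;

	/* 0x26080 -> pmc 2, 0x46080 -> pmc 4, 0x36880 -> pmc 3; unit 6 each */
	for (i = 0; i < 3; i++)
		printf("event 0x%lx: pmc=%lu unit=%lu\n", codes[i],
		       EVENT_PMC(codes[i]), EVENT_UNIT(codes[i]));

	return 0;
}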
Signed-off-by: Kajol Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-33-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  3 +-
 .../event_code_tests/group_constraint_unit_test.c  | 74 ++++++++++++++++++++++
 2 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_unit_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 374044062561..f72c73b5b79a 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -5,7 +5,8 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te
 	group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
 	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test \
-	group_constraint_l2l3_sel_test group_constraint_cache_test group_constraint_thresh_cmp_test
+	group_constraint_l2l3_sel_test group_constraint_cache_test group_constraint_thresh_cmp_test \
+	group_constraint_unit_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_unit_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_unit_test.c
new file mode 100644
index 000000000000..a2c18923dcec
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_unit_test.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Kajol Jain, IBM Corp.
+ */
+
+#include
+#include
+
+#include "../event.h"
+#include "utils.h"
+#include "../sampling_tests/misc.h"
+
+/* All successful D-side store dispatches for this thread with PMC 2 */
+#define EventCode_1 0x26080
+/* All successful D-side store dispatches for this thread with PMC 4 */
+#define EventCode_2 0x46080
+/* All successful D-side store dispatches for this thread that were L2 Miss with PMC 3 */
+#define EventCode_3 0x36880
+
+/*
+ * Testcase for group constraint check of the unit and pmc bits, which are
+ * used to program the corresponding unit and pmc fields in Monitor Mode
+ * Control Register 1 (MMCR1).
+ * One of the events in the group should use PMC4 in case the unit field
+ * value is within 6 to 9, otherwise event_open for the group will fail.
+ */
+static int group_constraint_unit(void)
+{
+	struct event *e, events[3];
+
+	/*
+	 * Check for platform support for the test.
+	 * The constraint to use PMC4 with one of the events in the group,
+	 * when the unit is within 6 to 9, is only applicable on
+	 * power9.
+	 */
+	SKIP_IF(platform_check_for_tests());
+	SKIP_IF(have_hwcap2(PPC_FEATURE2_ARCH_3_1));
+
+	/* Init the events for the group constraint check for unit bits */
+	e = &events[0];
+	event_init(e, EventCode_1);
+
+	/* Expected to fail as PMC 4 is not used with unit field value 6 to 9 */
+	FAIL_IF(!event_open(&events[0]));
+
+	/* Init the events for the group constraint check for unit bits */
+	e = &events[1];
+	event_init(e, EventCode_2);
+
+	/* Expected to pass as PMC 4 is used with unit field value 6 to 9 */
+	FAIL_IF(event_open(&events[1]));
+
+	/* Init the event for the group constraint unit test */
+	e = &events[2];
+	event_init(e, EventCode_3);
+
+	/* Expected to fail as PMC4 is not being used */
+	FAIL_IF(!event_open_with_group(&events[2], events[0].fd));
+
+	/* Expected to succeed as event using PMC4 */
+	FAIL_IF(event_open_with_group(&events[2], events[1].fd));
+
+	event_close(&events[0]);
+	event_close(&events[1]);
+	event_close(&events[2]);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_unit, "group_constraint_unit");
+}
-- cgit v1.2.3

From c178606ab51076d464fe537cd7a6bcbc615939e5 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 10 Jun 2022 19:11:11 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check for MMCRA thresh_ctl field

The thresh control bits in the event code are used to program the
thresh_ctl field in Monitor Mode Control Register A (MMCRA: 48-55).
When scheduling events as a group, all events in that group should
match the value in these bits. Otherwise event_open for the sibling
events will fail.

The testcase uses event code PM_MRK_INST_CMPL (0x401e0) as leader and
another event PM_THRESH_MET (0x101ec) as sibling event, and checks
whether the group constraint check for the thresh_ctl field is added
correctly via the perf interface.

Signed-off-by: Kajol Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-34-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../group_constraint_thresh_ctl_test.c             | 64 ++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_ctl_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index f72c73b5b79a..16cbb2e52865 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -6,7 +6,7 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
 	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test \
 	group_constraint_l2l3_sel_test group_constraint_cache_test group_constraint_thresh_cmp_test \
-	group_constraint_unit_test
+	group_constraint_unit_test group_constraint_thresh_ctl_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_ctl_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_ctl_test.c
new file mode 100644
index 000000000000..e0852ebc1671
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_ctl_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Kajol Jain, IBM Corp.
+ */
+
+#include
+#include
+
+#include "../event.h"
+#include "utils.h"
+#include "../sampling_tests/misc.h"
+
+/*
+ * Primary PMU events used here are PM_MRK_INST_CMPL (0x401e0) and
+ * PM_THRESH_MET (0x101ec).
+ * Threshold event selection used is issue to complete and issue to
+ * finish for cycles.
+ * Sampling criteria is Load or Store only sampling
+ */
+#define EventCode_1 0x35340401e0
+#define EventCode_2 0x34340101ec
+#define EventCode_3 0x35340101ec
+
+/*
+ * Testcase for group constraint check of thresh_ctl bits, which are
+ * used to program the thresh control field in Monitor Mode Control Register A
+ * (MMCRA: 48-55).
+ * All events in the group should match the thresh ctl bits, otherwise
+ * event_open for the group will fail.
+ */
+static int group_constraint_thresh_ctl(void)
+{
+	struct event event, leader;
+
+	/* Check for platform support for the test */
+	SKIP_IF(platform_check_for_tests());
+
+	/* Init the events for the group constraint thresh control test */
+	event_init(&leader, EventCode_1);
+	FAIL_IF(event_open(&leader));
+
+	event_init(&event, EventCode_2);
+
+	/* Expected to fail as sibling and leader event request different thresh_ctl bits */
+	FAIL_IF(!event_open_with_group(&event, leader.fd));
+
+	event_close(&event);
+
+	/* Init the event for the group constraint thresh control test */
+	event_init(&event, EventCode_3);
+
+	/* Expected to succeed as sibling and leader event request same thresh_ctl bits */
+	FAIL_IF(event_open_with_group(&event, leader.fd));
+
+	event_close(&leader);
+	event_close(&event);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(group_constraint_thresh_ctl, "group_constraint_thresh_ctl");
+}
-- cgit v1.2.3

From 9ac92fecd1dbfcabd64925571b94151d7a814878 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 10 Jun 2022 19:11:12 +0530
Subject: selftests/powerpc/pmu: Add selftest for group constraint check for MMCRA thresh_sel field

The thresh select bits in the event code are used to program the
thresh_sel field in Monitor Mode Control Register A (MMCRA: 45-47).
When scheduling events as a group, all events in that group should
match the value in these bits. Otherwise event_open for the sibling
events will fail.

The testcase uses event code PM_MRK_INST_CMPL (0x401e0) as leader and
another event PM_THRESH_MET (0x101ec) as sibling event, and checks
whether the group constraint check for the thresh_sel field is added
correctly via the perf interface.
Signed-off-by: Kajol Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20220610134113.62991-35-atrajeev@linux.vnet.ibm.com
---
 .../powerpc/pmu/event_code_tests/Makefile          |  2 +-
 .../group_constraint_thresh_sel_test.c             | 63 ++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_sel_test.c
(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
index 16cbb2e52865..755993d210f2 100644
--- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile
@@ -6,7 +6,7 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te
 	group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \
 	blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test \
 	group_constraint_l2l3_sel_test group_constraint_cache_test group_constraint_thresh_cmp_test \
-	group_constraint_unit_test group_constraint_thresh_ctl_test
+	group_constraint_unit_test group_constraint_thresh_ctl_test group_constraint_thresh_sel_test
 
 top_srcdir = ../../../../../..
 include ../../../lib.mk
diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_sel_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_sel_test.c
new file mode 100644
index 000000000000..50a8cd843ce7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/group_constraint_thresh_sel_test.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022, Kajol Jain, IBM Corp.
+ */
+
+#include
+#include
+
+#include "../event.h"
+#include "utils.h"
+#include "../sampling_tests/misc.h"
+
+/*
+ * Primary PMU events used here are PM_MRK_INST_CMPL (0x401e0) and
+ * PM_THRESH_MET (0x101ec).
+ * Threshold event selection used is issue to complete
+ * Sampling criteria is Load or Store only sampling
+ */
+#define EventCode_1 0x35340401e0
+#define EventCode_2 0x35540101ec
+#define EventCode_3 0x35340101ec
+
+/*
+ * Testcase for group constraint check of thresh_sel bits, which are
+ * used to program the thresh select field in Monitor Mode Control Register A
+ * (MMCRA: 45-47).
+ * All events in the group should match the thresh sel bits, otherwise
+ * event_open for the group will fail.
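+ * EventCode_2 (0x35540101ec) and EventCode_3 (0x35340101ec) differ
+ * only in the thresh_sel bits, which is what makes one sibling fail
+ * and the other succeed below.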
+ */ +static int group_constraint_thresh_sel(void) +{ + struct event event, leader; + + /* Check for platform support for the test */ + SKIP_IF(platform_check_for_tests()); + + /* Init the events for the group constraint thresh select test */ + event_init(&leader, EventCode_1); + FAIL_IF(event_open(&leader)); + + event_init(&event, EventCode_2); + + /* Expected to fail as sibling and leader event request different thresh_sel bits */ + FAIL_IF(!event_open_with_group(&event, leader.fd)); + + event_close(&event); + + /* Init the event for the group constraint thresh select test */ + event_init(&event, EventCode_3); + + /* Expected to succeed as sibling and leader event request same thresh_sel bits */ + FAIL_IF(event_open_with_group(&event, leader.fd)); + + event_close(&leader); + event_close(&event); + + return 0; +} + +int main(void) +{ + return test_harness(group_constraint_thresh_sel, "group_constraint_thresh_sel"); +} -- cgit v1.2.3 From ab8bca92aebcb59d81dc95ddebe241052f2bb411 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 10 Jun 2022 19:11:13 +0530 Subject: selftests/powerpc/pmu: Add test for hardware cache events The testcase checks whether the translation of a generic hardware cache event is done properly via the perf interface. Hardware cache events have the type PERF_TYPE_HW_CACHE and each event points to a raw event code id. The testcase checks different combinations of cache level, cache event operation type and cache event result type, and verifies for a given event code whether the translation matches the current cache event mappings via the perf interface. Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220610134113.62991-36-atrajeev@linux.vnet.ibm.com --- .../powerpc/pmu/event_code_tests/Makefile | 3 +- .../event_code_tests/hw_cache_event_type_test.c | 88 ++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/pmu/event_code_tests/hw_cache_event_type_test.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 755993d210f2..4e07d7046457 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -6,7 +6,8 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \ blacklisted_events_test event_alternatives_tests_p9 event_alternatives_tests_p10 generic_events_valid_test \ group_constraint_l2l3_sel_test group_constraint_cache_test group_constraint_thresh_cmp_test \ - group_constraint_unit_test group_constraint_thresh_ctl_test group_constraint_thresh_sel_test + group_constraint_unit_test group_constraint_thresh_ctl_test group_constraint_thresh_sel_test \ + hw_cache_event_type_test top_srcdir = ../../../../../.. include ../../../lib.mk diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/hw_cache_event_type_test.c b/tools/testing/selftests/powerpc/pmu/event_code_tests/hw_cache_event_type_test.c new file mode 100644 index 000000000000..a45b1da5b568 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/hw_cache_event_type_test.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2022, Kajol Jain, IBM Corp.
+ */ + +#include <stdio.h> +#include <stdlib.h> + +#include "../event.h" +#include "utils.h" +#include "../sampling_tests/misc.h" + +/* + * Load Missed L1: for power9 it points to PM_LD_MISS_L1_FIN (0x2c04e) and + * for power10 it points to PM_LD_MISS_L1 (0x3e054) + * + * Hardware cache level : PERF_COUNT_HW_CACHE_L1D + * Hardware cache event operation type : PERF_COUNT_HW_CACHE_OP_READ + * Hardware cache event result type : PERF_COUNT_HW_CACHE_RESULT_MISS + */ +#define EventCode_1 0x10000 +/* + * Hardware cache level : PERF_COUNT_HW_CACHE_L1D + * Hardware cache event operation type : PERF_COUNT_HW_CACHE_OP_WRITE + * Hardware cache event result type : PERF_COUNT_HW_CACHE_RESULT_ACCESS + */ +#define EventCode_2 0x0100 +/* + * Hardware cache level : PERF_COUNT_HW_CACHE_DTLB + * Hardware cache event operation type : PERF_COUNT_HW_CACHE_OP_WRITE + * Hardware cache event result type : PERF_COUNT_HW_CACHE_RESULT_ACCESS + */ +#define EventCode_3 0x0103 +/* + * Hardware cache level : PERF_COUNT_HW_CACHE_L1D + * Hardware cache event operation type : PERF_COUNT_HW_CACHE_OP_READ + * Hardware cache event result type : Invalid ( > PERF_COUNT_HW_CACHE_RESULT_MAX) + */ +#define EventCode_4 0x030000 + +/* + * A perf test to check valid hardware cache events. + */ +static int hw_cache_event_type_test(void) +{ + struct event event; + + /* Check for platform support for the test */ + SKIP_IF(platform_check_for_tests()); + + /* Skip for Generic compat PMU */ + SKIP_IF(check_for_generic_compat_pmu()); + + /* Init the event to test hardware cache event */ + event_init_opts(&event, EventCode_1, PERF_TYPE_HW_CACHE, "event"); + + /* Expected to succeed as it points to L1 load miss */ + FAIL_IF(event_open(&event)); + event_close(&event); + + /* Init the event to test hardware cache event */ + event_init_opts(&event, EventCode_2, PERF_TYPE_HW_CACHE, "event"); + + /* Expected to fail as the corresponding cache event entry has 0 in that index */ + FAIL_IF(!event_open(&event)); + event_close(&event); + + /* Init the event to test hardware cache event */ + event_init_opts(&event, EventCode_3, PERF_TYPE_HW_CACHE, "event"); + + /* Expected to fail as the corresponding cache event entry has -1 in that index */ + FAIL_IF(!event_open(&event)); + event_close(&event); + + /* Init the event to test hardware cache event */ + event_init_opts(&event, EventCode_4, PERF_TYPE_HW_CACHE, "event"); + + /* Expected to fail as hardware cache event result type is Invalid */ + FAIL_IF(!event_open(&event)); + event_close(&event); + + return 0; +} + +int main(void) +{ + return test_harness(hw_cache_event_type_test, "hw_cache_event_type_test"); +} -- cgit v1.2.3 From 863fdccdc5ed1e187a30a4a103340be4569904c8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 22 Jun 2022 01:00:45 -0700 Subject: tools/testing/cxl: Fix cxl_hdm_decode_init() calling convention This failing signature: [ 8.392669] cxl_bus_probe: cxl_port endpoint2: probe: 970997760 [ 8.392670] cxl_port: probe of endpoint2 failed with error 970997760 [ 8.392719] create_endpoint: cxl_mem mem0: add: endpoint2 [ 8.392721] cxl_mem mem0: endpoint2 failed probe [ 8.392725] cxl_bus_probe: cxl_mem mem0: probe: -6 ...shows cxl_hdm_decode_init() resulting in a return code ("970997760") that looks like stack corruption. The problem goes away if cxl_hdm_decode_init() is not mocked via __wrap_cxl_hdm_decode_init().
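As a contrived two-file sketch (unrelated to the CXL code itself, all names made up) of why declaring an int return over a bool definition can yield garbage of exactly this kind on x86-64:

/* lib.c: the real definition returns bool, which the x86-64 ABI only
 * guarantees to pass back in %al. */
#include <stdbool.h>
bool probe(void)
{
	return true;
}

/* main.c: the caller declares the same symbol as returning int and
 * therefore reads the full 32 bits of %eax. */
#include <stdio.h>
int probe(void);	/* mismatched declaration */

int main(void)
{
	/* Bits 8-31 of %eax are whatever happened to be left there, so
	 * this can print a large bogus value instead of 0 or 1. */
	printf("%d\n", probe());
	return 0;
}

Whether garbage actually shows up depends on what the compiler left in the upper bits of the return register, which is why this class of bug can appear and disappear with unrelated changes.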
The corruption results from the mismatch that the calling convention for cxl_hdm_decode_init() is: int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm) ...and __wrap_cxl_hdm_decode_init() is: bool __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm) ...i.e. an int is expected but __wrap_cxl_hdm_decode_init() returns bool. Fix the convention and clean up the organization to match __wrap_cxl_await_media_ready() as the difference was a red herring that distracted from finding the bug. Fixes: 92804edb11f0 ("cxl/pci: Drop @info argument to cxl_hdm_decode_init()") Reviewed-by: Jonathan Cameron Reviewed-by: Adam Manzanares Link: https://lore.kernel.org/r/165603870776.551046.8709990108936497723.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- tools/testing/cxl/test/mock.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index f1f8c40948c5..bce6a21df0d5 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -208,13 +208,15 @@ int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_await_media_ready, CXL); -bool __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, - struct cxl_hdm *cxlhdm) +int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, + struct cxl_hdm *cxlhdm) { int rc = 0, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - if (!ops || !ops->is_mock_dev(cxlds->dev)) + if (ops && ops->is_mock_dev(cxlds->dev)) + rc = 0; + else rc = cxl_hdm_decode_init(cxlds, cxlhdm); put_cxl_mock_ops(index); -- cgit v1.2.3 From dd3549c5032d588bf444bc8b8eae5108d0aa055a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 16 Jun 2022 17:07:05 +1000 Subject: selftests/powerpc: Add missing files to .gitignores These were missed when the respective tests were added; add them now.
Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220616070705.1941829-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/math/.gitignore | 1 + tools/testing/selftests/powerpc/mce/.gitignore | 1 + tools/testing/selftests/powerpc/pmu/ebb/.gitignore | 1 + tools/testing/selftests/powerpc/security/.gitignore | 1 + 4 files changed, 4 insertions(+) create mode 100644 tools/testing/selftests/powerpc/mce/.gitignore (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore index d0c23b2e4b60..07b4893ef7af 100644 --- a/tools/testing/selftests/powerpc/math/.gitignore +++ b/tools/testing/selftests/powerpc/math/.gitignore @@ -7,3 +7,4 @@ fpu_signal vmx_signal vsx_preempt fpu_denormal +mma diff --git a/tools/testing/selftests/powerpc/mce/.gitignore b/tools/testing/selftests/powerpc/mce/.gitignore new file mode 100644 index 000000000000..f5921462a495 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/.gitignore @@ -0,0 +1 @@ +inject-ra-err diff --git a/tools/testing/selftests/powerpc/pmu/ebb/.gitignore b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore index 2920fb39439b..64d8dfdac74a 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/.gitignore +++ b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore @@ -21,3 +21,4 @@ back_to_back_ebbs_test lost_exception_test no_handler_test cycles_with_mmcr2_test +regs_access_pmccext_test diff --git a/tools/testing/selftests/powerpc/security/.gitignore b/tools/testing/selftests/powerpc/security/.gitignore index 93614b125ded..9357b186b13c 100644 --- a/tools/testing/selftests/powerpc/security/.gitignore +++ b/tools/testing/selftests/powerpc/security/.gitignore @@ -2,3 +2,4 @@ rfi_flush entry_flush spectre_v2 +uaccess_flush -- cgit v1.2.3 From 04cfbc1d89d4cc73b5b328e3bacf24d43e9aa4b7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 28 Jun 2022 12:37:44 +0200 Subject: selftests: forwarding: ethtool_extended_state: Convert to busywait Currently, this script sets up the test scenario, which is supposed to end in an inability of the system to negotiate a link. It then waits for a bit, and verifies that the system can diagnose why the link was not established. The wait time for the scenario where different link speeds are forced on the two ends of a loopback cable was set to 4 seconds, which exactly covered it. As of a recent mlxsw firmware update, this time gets longer, and this test starts failing. The time that selftests wait for links to be established is currently $WAIT_TIMEOUT, or 20 seconds. It seems reasonable that if this is the time necessary to establish and bring up a link, it should also be enough to determine that a link cannot be established and why. Therefore in this patch, convert the sleeps to busywaits, so that if a failure is established sooner (as is expected), the test runs quicker. And use $WAIT_TIMEOUT as the time to wait. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: David S.
Miller --- .../net/forwarding/ethtool_extended_state.sh | 43 ++++++++++++++-------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh index 4b42dfd4efd1..072faa77f53b 100755 --- a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh +++ b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh @@ -11,6 +11,8 @@ NUM_NETIFS=2 source lib.sh source ethtool_lib.sh +TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms + setup_prepare() { swp1=${NETIFS[p1]} @@ -18,7 +20,7 @@ setup_prepare() swp3=$NETIF_NO_CABLE } -ethtool_extended_state_check() +ethtool_ext_state() { local dev=$1; shift local expected_ext_state=$1; shift @@ -30,21 +32,27 @@ ethtool_extended_state_check() | sed -e 's/^[[:space:]]*//') ext_state=$(echo $ext_state | cut -d "," -f1) - [[ $ext_state == $expected_ext_state ]] - check_err $? "Expected \"$expected_ext_state\", got \"$ext_state\"" - - [[ $ext_substate == $expected_ext_substate ]] - check_err $? "Expected \"$expected_ext_substate\", got \"$ext_substate\"" + if [[ $ext_state != $expected_ext_state ]]; then + echo "Expected \"$expected_ext_state\", got \"$ext_state\"" + return 1 + fi + if [[ $ext_substate != $expected_ext_substate ]]; then + echo "Expected \"$expected_ext_substate\", got \"$ext_substate\"" + return 1 + fi } autoneg() { + local msg + RET=0 ip link set dev $swp1 up - sleep 4 - ethtool_extended_state_check $swp1 "Autoneg" "No partner detected" + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected") + check_err $? "$msg" log_test "Autoneg, No partner detected" @@ -53,6 +61,8 @@ autoneg() autoneg_force_mode() { + local msg + RET=0 ip link set dev $swp1 up @@ -65,12 +75,13 @@ autoneg_force_mode() ethtool_set $swp1 speed $speed1 autoneg off ethtool_set $swp2 speed $speed2 autoneg off - sleep 4 - ethtool_extended_state_check $swp1 "Autoneg" \ - "No partner detected during force mode" + msg=$(busywait $TIMEOUT ethtool_ext_state $swp1 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" - ethtool_extended_state_check $swp2 "Autoneg" \ - "No partner detected during force mode" + msg=$(busywait $TIMEOUT ethtool_ext_state $swp2 \ + "Autoneg" "No partner detected during force mode") + check_err $? "$msg" log_test "Autoneg, No partner detected during force mode" @@ -83,12 +94,14 @@ autoneg_force_mode() no_cable() { + local msg + RET=0 ip link set dev $swp3 up - sleep 1 - ethtool_extended_state_check $swp3 "No cable" + msg=$(busywait $TIMEOUT ethtool_ext_state $swp3 "No cable") + check_err $? "$msg" log_test "No cable" -- cgit v1.2.3 From f43b9876e857c739d407bc56df288b0ebe1a9164 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2022 22:21:17 +0000 Subject: x86/retbleed: Add fine grained Kconfig knobs Do fine-grained Kconfig for all the various retbleed parts. NOTE: if your compiler doesn't support return thunks this will silently 'upgrade' your mitigation to IBPB; you might not like this.
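As an aside, one way to confirm at runtime which retbleed mitigation the kernel actually picked is to read the vulnerabilities sysfs file; a minimal sketch, assuming a kernel new enough to expose the retbleed entry:

#include <stdio.h>

int main(void)
{
	char buf[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/retbleed", "r");

	if (!f) {
		perror("retbleed sysfs");	/* likely a pre-retbleed kernel */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);	/* e.g. "Mitigation: untrained return thunk" */
	fclose(f);
	return 0;
}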
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov --- arch/x86/Kconfig | 111 +++++++++++++++++++++++-------- arch/x86/Makefile | 8 ++- arch/x86/entry/calling.h | 4 ++ arch/x86/include/asm/disabled-features.h | 18 +++-- arch/x86/include/asm/linkage.h | 4 +- arch/x86/include/asm/nospec-branch.h | 10 ++- arch/x86/include/asm/static_call.h | 2 +- arch/x86/kernel/alternative.c | 5 ++ arch/x86/kernel/cpu/amd.c | 2 + arch/x86/kernel/cpu/bugs.c | 42 +++++++----- arch/x86/kernel/static_call.c | 2 +- arch/x86/kvm/emulate.c | 4 +- arch/x86/lib/retpoline.S | 4 ++ scripts/Makefile.lib | 1 + scripts/Makefile.vmlinux_o | 2 +- security/Kconfig | 11 --- tools/objtool/builtin-check.c | 7 ++ tools/objtool/check.c | 9 ++- tools/objtool/include/objtool/builtin.h | 1 + 19 files changed, 178 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e35eecfb74f2..e58798f636d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -462,32 +462,6 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config RETPOLINE - bool "Avoid speculative indirect branches in kernel" - select OBJTOOL if HAVE_OBJTOOL - default y - help - Compile kernel with the retpoline compiler options to guard against - kernel-to-user data leaks by avoiding speculative indirect - branches. Requires a compiler with -mindirect-branch=thunk-extern - support for full protection. The kernel may run slower. - -config CC_HAS_SLS - def_bool $(cc-option,-mharden-sls=all) - -config CC_HAS_RETURN_THUNK - def_bool $(cc-option,-mfunction-return=thunk-extern) - -config SLS - bool "Mitigate Straight-Line-Speculation" - depends on CC_HAS_SLS && X86_64 - select OBJTOOL if HAVE_OBJTOOL - default n - help - Compile the kernel with straight-line-speculation options to guard - against straight line speculation. The kernel image might be slightly - larger. - config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) @@ -2456,6 +2430,91 @@ source "kernel/livepatch/Kconfig" endmenu +config CC_HAS_SLS + def_bool $(cc-option,-mharden-sls=all) + +config CC_HAS_RETURN_THUNK + def_bool $(cc-option,-mfunction-return=thunk-extern) + +menuconfig SPECULATION_MITIGATIONS + bool "Mitigations for speculative execution vulnerabilities" + default y + help + Say Y here to enable options which enable mitigations for + speculative execution hardware vulnerabilities. + + If you say N, all mitigations will be disabled. You really + should know what you are doing to say so. + +if SPECULATION_MITIGATIONS + +config PAGE_TABLE_ISOLATION + bool "Remove the kernel mapping in user mode" + default y + depends on (X86_64 || X86_PAE) + help + This feature reduces the number of hardware side channels by + ensuring that the majority of kernel addresses are not mapped + into userspace. + + See Documentation/x86/pti.rst for more details. + +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + select OBJTOOL if HAVE_OBJTOOL + default y + help + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. 
+ +config RETHUNK + bool "Enable return-thunks" + depends on RETPOLINE && CC_HAS_RETURN_THUNK + select OBJTOOL if HAVE_OBJTOOL + default y + help + Compile the kernel with the return-thunks compiler option to guard + against kernel-to-user data leaks by avoiding return speculation. + Requires a compiler with -mfunction-return=thunk-extern + support for full protection. The kernel may run slower. + +config CPU_UNRET_ENTRY + bool "Enable UNRET on kernel entry" + depends on CPU_SUP_AMD && RETHUNK + default y + help + Compile the kernel with support for the retbleed=unret mitigation. + +config CPU_IBPB_ENTRY + bool "Enable IBPB on kernel entry" + depends on CPU_SUP_AMD + default y + help + Compile the kernel with support for the retbleed=ibpb mitigation. + +config CPU_IBRS_ENTRY + bool "Enable IBRS on kernel entry" + depends on CPU_SUP_INTEL + default y + help + Compile the kernel with support for the spectre_v2=ibrs mitigation. + This mitigates both spectre_v2 and retbleed at great cost to + performance. + +config SLS + bool "Mitigate Straight-Line-Speculation" + depends on CC_HAS_SLS && X86_64 + select OBJTOOL if HAVE_OBJTOOL + default n + help + Compile the kernel with straight-line-speculation options to guard + against straight line speculation. The kernel image might be slightly + larger. + +endif + config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 6e16057737e5..1f40dad30d50 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -15,14 +15,18 @@ endif ifdef CONFIG_CC_IS_GCC RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix) -RETPOLINE_CFLAGS += $(call cc-option,-mfunction-return=thunk-extern) RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register) endif ifdef CONFIG_CC_IS_CLANG RETPOLINE_CFLAGS := -mretpoline-external-thunk RETPOLINE_VDSO_CFLAGS := -mretpoline -RETPOLINE_CFLAGS += $(call cc-option,-mfunction-return=thunk-extern) endif + +ifdef CONFIG_RETHUNK +RETHUNK_CFLAGS := -mfunction-return=thunk-extern +RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS) +endif + export RETPOLINE_CFLAGS export RETPOLINE_VDSO_CFLAGS diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 074d04e434de..f6907627172b 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -297,6 +297,7 @@ For 32-bit we have the following conventions - kernel is built with * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. */ .macro IBRS_ENTER save_reg +#ifdef CONFIG_CPU_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -317,6 +318,7 @@ For 32-bit we have the following conventions - kernel is built with shr $32, %rdx wrmsr .Lend_\@: +#endif .endm /* @@ -324,6 +326,7 @@ For 32-bit we have the following conventions - kernel is built with * regs. Must be called after the last RET. 
*/ .macro IBRS_EXIT save_reg +#ifdef CONFIG_CPU_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -338,6 +341,7 @@ For 32-bit we have the following conventions - kernel is built with shr $32, %rdx wrmsr .Lend_\@: +#endif .endm /* diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index db75da511a36..33d2cd04d254 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -54,9 +54,19 @@ # define DISABLE_RETPOLINE 0 #else # define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ - (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31)) | \ - (1 << (X86_FEATURE_RETHUNK & 31)) | \ - (1 << (X86_FEATURE_UNRET & 31))) + (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) +#endif + +#ifdef CONFIG_RETHUNK +# define DISABLE_RETHUNK 0 +#else +# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) +#endif + +#ifdef CONFIG_CPU_UNRET_ENTRY +# define DISABLE_UNRET 0 +#else +# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif #ifdef CONFIG_INTEL_IOMMU_SVM @@ -91,7 +101,7 @@ #define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE) +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index e3ae331cabb1..73ca20049835 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -19,7 +19,7 @@ #define __ALIGN_STR __stringify(__ALIGN) #endif -#if defined(CONFIG_RETPOLINE) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk #else /* CONFIG_RETPOLINE */ #ifdef CONFIG_SLS @@ -31,7 +31,7 @@ #else /* __ASSEMBLY__ */ -#if defined(CONFIG_RETPOLINE) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" #else /* CONFIG_RETPOLINE */ #ifdef CONFIG_SLS diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ccde87e6eabb..bb05ed4f46bd 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -127,6 +127,12 @@ .Lskip_rsb_\@: .endm +#ifdef CONFIG_CPU_UNRET_ENTRY +#define CALL_ZEN_UNTRAIN_RET "call zen_untrain_ret" +#else +#define CALL_ZEN_UNTRAIN_RET "" +#endif + /* * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the * return thunk isn't mapped into the userspace tables (then again, AMD @@ -139,10 +145,10 @@ * where we have a stack but before any RET instruction. */ .macro UNTRAIN_RET -#ifdef CONFIG_RETPOLINE +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) ANNOTATE_UNRET_END ALTERNATIVE_2 "", \ - "call zen_untrain_ret", X86_FEATURE_UNRET, \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB #endif .endm diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 70cc9ccb8029..343b722ccaf2 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -46,7 +46,7 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. 
+ 4)") -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_RETHUNK #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk") #else diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cf447ee18b3c..d6858533e6e5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -508,6 +508,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } +#ifdef CONFIG_RETHUNK /* * Rewrite the compiler generated return thunk tail-calls. * @@ -569,6 +570,10 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) } } } +#else +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* CONFIG_RETHUNK */ + #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9cfd11f7ba11..35d5288394cb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -864,6 +864,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c) void init_spectral_chicken(struct cpuinfo_x86 *c) { +#ifdef CONFIG_CPU_UNRET_ENTRY u64 value; /* @@ -880,6 +881,7 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } +#endif } static void init_amd_zn(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 182f8b2e8a3c..cf08a1b8f3c7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -842,7 +842,6 @@ static int __init retbleed_parse_cmdline(char *str) early_param("retbleed", retbleed_parse_cmdline); #define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" -#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler; falling back to IBPB!\n" #define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" static void __init retbleed_select_mitigation(void) @@ -857,18 +856,33 @@ static void __init retbleed_select_mitigation(void) return; case RETBLEED_CMD_UNRET: - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + } else { + pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + goto do_cmd_auto; + } break; case RETBLEED_CMD_IBPB: - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto do_cmd_auto; + } break; +do_cmd_auto: case RETBLEED_CMD_AUTO: default: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } /* * The Intel mitigation (IBRS or eIBRS) was already selected in @@ -881,14 +895,6 @@ static void __init retbleed_select_mitigation(void) switch (retbleed_mitigation) { case RETBLEED_MITIGATION_UNRET: - - if (!IS_ENABLED(CONFIG_RETPOLINE) || - !IS_ENABLED(CONFIG_CC_HAS_RETURN_THUNK)) { - pr_err(RETBLEED_COMPILER_MSG); - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; 
- goto retbleed_force_ibpb; - } - setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); @@ -900,7 +906,6 @@ static void __init retbleed_select_mitigation(void) break; case RETBLEED_MITIGATION_IBPB: -retbleed_force_ibpb: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); mitigate_smt = true; break; @@ -1271,6 +1276,12 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", mitigation_options[i].option); @@ -1328,7 +1339,8 @@ static void __init spectre_v2_select_mitigation(void) break; } - if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && boot_cpu_has(X86_FEATURE_IBRS) && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index fe21fe778185..be7038a0da4d 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -126,7 +126,7 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b01437015f99..db96bf7d1122 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -439,10 +439,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); * * ENDBR [4 bytes; CONFIG_X86_KERNEL_IBT] * SETcc %al [3 bytes] - * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETPOLINE] + * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK] * INT3 [1 byte; CONFIG_SLS] */ -#define RET_LENGTH (1 + (4 * IS_ENABLED(CONFIG_RETPOLINE)) + \ +#define RET_LENGTH (1 + (4 * IS_ENABLED(CONFIG_RETHUNK)) + \ IS_ENABLED(CONFIG_SLS)) #define SETCC_LENGTH (ENDBR_INSN_SIZE + 3 + RET_LENGTH) #define SETCC_ALIGN (4 << ((SETCC_LENGTH > 4) & 1) << ((SETCC_LENGTH > 8) & 1)) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index fdd16163b996..073289a55f84 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -72,6 +72,8 @@ SYM_CODE_END(__x86_indirect_thunk_array) * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. 
*/ +#ifdef CONFIG_RETHUNK + .section .text.__x86.return_thunk /* @@ -136,3 +138,5 @@ SYM_FUNC_END(zen_untrain_ret) __EXPORT_THUNK(zen_untrain_ret) EXPORT_SYMBOL(__x86_return_thunk) + +#endif /* CONFIG_RETHUNK */ diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index d1425778664b..3fb6a99e78c4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -236,6 +236,7 @@ objtool_args = \ $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount) \ $(if $(CONFIG_UNWINDER_ORC), --orc) \ $(if $(CONFIG_RETPOLINE), --retpoline) \ + $(if $(CONFIG_RETHUNK), --rethunk) \ $(if $(CONFIG_SLS), --sls) \ $(if $(CONFIG_STACK_VALIDATION), --stackval) \ $(if $(CONFIG_HAVE_STATIC_CALL_INLINE), --static-call) \ diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index bc67748044a6..84019814f33f 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -44,7 +44,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION)) objtool_args := \ $(if $(delay-objtool),$(objtool_args)) \ - $(if $(CONFIG_NOINSTR_VALIDATION), --noinstr $(if $(CONFIG_RETPOLINE), --unret)) \ + $(if $(CONFIG_NOINSTR_VALIDATION), --noinstr $(if $(CONFIG_CPU_UNRET_ENTRY), --unret)) \ $(if $(CONFIG_GCOV_KERNEL), --no-unreachable) \ --link diff --git a/security/Kconfig b/security/Kconfig index f29e4c656983..e6db09a779b7 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -54,17 +54,6 @@ config SECURITY_NETWORK implement socket and networking access controls. If you are unsure how to answer this question, answer N. -config PAGE_TABLE_ISOLATION - bool "Remove the kernel mapping in user mode" - default y - depends on (X86_64 || X86_PAE) && !UML - help - This feature reduces the number of hardware side channels by - ensuring that the majority of kernel addresses are not mapped - into userspace. - - See Documentation/x86/pti.rst for more details. 
- config SECURITY_INFINIBAND bool "Infiniband Security Hooks" depends on SECURITY && INFINIBAND diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index c063e1ff96b2..24fbe803a0d3 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -68,6 +68,7 @@ const struct option check_options[] = { OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), OPT_BOOLEAN('o', "orc", &opts.orc, "generate ORC metadata"), OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), + OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), @@ -124,6 +125,7 @@ static bool opts_valid(void) opts.noinstr || opts.orc || opts.retpoline || + opts.rethunk || opts.sls || opts.stackval || opts.static_call || @@ -136,6 +138,11 @@ static bool opts_valid(void) return true; } + if (opts.unret && !opts.rethunk) { + ERROR("--unret requires --rethunk"); + return false; + } + if (opts.dump_orc) return true; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ddfdd138cc2a..7bebdb8867cd 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3732,8 +3732,11 @@ static int validate_retpoline(struct objtool_file *file) continue; if (insn->type == INSN_RETURN) { - WARN_FUNC("'naked' return found in RETPOLINE build", - insn->sec, insn->offset); + if (opts.rethunk) { + WARN_FUNC("'naked' return found in RETHUNK build", + insn->sec, insn->offset); + } else + continue; } else { WARN_FUNC("indirect %s found in RETPOLINE build", insn->sec, insn->offset, @@ -4264,7 +4267,9 @@ int check(struct objtool_file *file) if (ret < 0) goto out; warnings += ret; + } + if (opts.rethunk) { ret = create_return_sites_sections(file); if (ret < 0) goto out; diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 0c476b0b40a3..42a52f1a0add 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -19,6 +19,7 @@ struct opts { bool noinstr; bool orc; bool retpoline; + bool rethunk; bool unret; bool sls; bool stackval; -- cgit v1.2.3 From 69fd337a975c7e690dfe49d9cb4fe5ba1e6db44e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:06 -0700 Subject: bpf: per-cgroup lsm flavor Allow attaching to lsm hooks in the cgroup context. Attaching to per-cgroup LSM works exactly like attaching to other per-cgroup hooks. New BPF_LSM_CGROUP is added to trigger new mode; the actual lsm hook we attach to is signaled via existing attach_btf_id. For the hooks that have 'struct socket' or 'struct sock' as their first argument, we use the cgroup associated with that socket. For the rest, we use 'current' cgroup (this is all on default hierarchy == v2 only). Note that for some hooks that work on 'struct sock' we still take the cgroup from 'current' because some of them work on the socket that hasn't been properly initialized yet. Behind the scenes, we allocate a shim program that is attached to the trampoline and runs the cgroup's effective BPF program array. This shim has some rudimentary ref counting and can be shared between several programs attaching to the same lsm hook from different cgroups. Note that this patch bloats cgroup size because we add 211 cgroup_bpf_attach_type(s) for simplicity's sake.
This will be addressed in the subsequent patch. Also note that we only add non-sleepable flavor for now. To enable sleepable use-cases, bpf_prog_run_array_cg has to grab trace rcu, shim programs have to be freed via trace rcu, cgroup_bpf.effective should be also trace-rcu-managed + maybe some other changes that I'm not aware of. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-4-sdf@google.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 24 +++-- include/linux/bpf-cgroup-defs.h | 8 ++ include/linux/bpf-cgroup.h | 7 ++ include/linux/bpf.h | 24 +++++ include/linux/bpf_lsm.h | 13 +++ include/linux/btf_ids.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_lsm.c | 48 ++++++++++ kernel/bpf/btf.c | 11 +++ kernel/bpf/cgroup.c | 136 ++++++++++++++++++++++++--- kernel/bpf/core.c | 2 + kernel/bpf/syscall.c | 10 ++ kernel/bpf/trampoline.c | 198 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 32 +++++++ tools/include/uapi/linux/bpf.h | 1 + 15 files changed, 498 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2c51ca9f7cec..2f460c67f9c7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1770,6 +1770,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, int run_ctx_off, bool save_ret) { + void (*exit)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit; + u64 (*enter)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter; u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); @@ -1788,15 +1792,21 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); + if (p->aux->sleepable) { + enter = __bpf_prog_enter_sleepable; + exit = __bpf_prog_exit_sleepable; + } else if (p->expected_attach_type == BPF_LSM_CGROUP) { + enter = __bpf_prog_enter_lsm_cgroup; + exit = __bpf_prog_exit_lsm_cgroup; + } + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? __bpf_prog_enter_sleepable : - __bpf_prog_enter, prog)) - return -EINVAL; + if (emit_call(&prog, enter, prog)) + return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1840,10 +1850,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? 
__bpf_prog_exit_sleepable : - __bpf_prog_exit, prog)) - return -EINVAL; + if (emit_call(&prog, exit, prog)) + return -EINVAL; *pprog = prog; return 0; diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index 5d268e76d8e6..b99f8c3e37ea 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -10,6 +10,12 @@ struct bpf_prog_array; +#ifdef CONFIG_BPF_LSM +#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */ +#else +#define CGROUP_LSM_NUM 0 +#endif + enum cgroup_bpf_attach_type { CGROUP_BPF_ATTACH_TYPE_INVALID = -1, CGROUP_INET_INGRESS = 0, @@ -35,6 +41,8 @@ enum cgroup_bpf_attach_type { CGROUP_INET4_GETSOCKNAME, CGROUP_INET6_GETSOCKNAME, CGROUP_INET_SOCK_RELEASE, + CGROUP_LSM_START, + CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1, MAX_CGROUP_BPF_ATTACH_TYPE }; diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 6673acfbf2ef..2bd1b5f8de9b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,6 +23,13 @@ struct ctl_table; struct ctl_table_header; struct task_struct; +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn); +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn); +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn); + #ifdef CONFIG_CGROUP_BPF #define CGROUP_ATYPE(type) \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d547be9db75f..77cd613a00bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -794,6 +794,10 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); @@ -1060,6 +1064,7 @@ struct bpf_prog_aux { struct user_struct *user; u64 load_time; /* ns since boottime */ u32 verified_insns; + int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY @@ -1167,6 +1172,11 @@ struct bpf_tramp_link { u64 cookie; }; +struct bpf_shim_tramp_link { + struct bpf_tramp_link link; + struct bpf_trampoline *trampoline; +}; + struct bpf_tracing_link { struct bpf_tramp_link link; enum bpf_attach_type attach_type; @@ -1245,6 +1255,9 @@ struct bpf_dummy_ops { int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype); +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { @@ -1268,6 +1281,14 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + return -EOPNOTSUPP; +} +static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ +} #endif struct bpf_array { @@ -2368,6 +2389,8 @@ extern const 
struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; +extern const struct bpf_func_proto bpf_set_retval_proto; +extern const struct bpf_func_proto bpf_get_retval_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2485,6 +2508,7 @@ int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); +int btf_id_set_index(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 479c101546ad..61787a5f6af9 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -42,6 +42,9 @@ extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +int bpf_lsm_hook_idx(u32 btf_id); +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); + #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) @@ -65,6 +68,16 @@ static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ +} + +static inline int bpf_lsm_hook_idx(u32 btf_id) +{ + return -EINVAL; +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 335a19092368..252a4befeab1 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -179,7 +179,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCKET, socket) enum { #define BTF_SOCK_TYPE(name, str) name, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e81362891596..b7479898c879 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -998,6 +998,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c1351df9f7ee..0f72020bfdcf 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -16,6 +16,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -35,6 +36,44 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +/* List of LSM hooks that should operate on 'current' cgroup regardless + * of function signature. 
+ */ +BTF_SET_START(bpf_lsm_current_hooks) +/* operate on freshly allocated sk without any cgroup association */ +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_SET_END(bpf_lsm_current_hooks) + +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ + const struct btf_param *args; + + if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || + btf_id_set_contains(&bpf_lsm_current_hooks, + prog->aux->attach_btf_id)) { + *bpf_func = __cgroup_bpf_run_lsm_current; + return; + } + + args = btf_params(prog->aux->attach_func_proto); + +#ifdef CONFIG_NET + if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) + *bpf_func = __cgroup_bpf_run_lsm_socket; + else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) + *bpf_func = __cgroup_bpf_run_lsm_sock; + else +#endif + *bpf_func = __cgroup_bpf_run_lsm_current; +} + +int bpf_lsm_hook_idx(u32 btf_id) +{ + return btf_id_set_index(&bpf_lsm_hooks, btf_id); +} + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -158,6 +197,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; + case BPF_FUNC_get_local_storage: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_local_storage_proto : NULL; + case BPF_FUNC_set_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_set_retval_proto : NULL; + case BPF_FUNC_get_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_retval_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2e2066d6af94..7c1fe422ed3f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5363,6 +5363,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { + case BPF_LSM_CGROUP: case BPF_LSM_MAC: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks @@ -6842,6 +6843,16 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } +int btf_id_set_index(const struct btf_id_set *set, u32 id) +{ + const u32 *p; + + p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func); + if (!p) + return -1; + return p - set->ids; +} + bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4adb4f3ecb7f..9cf41dd4f96f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -61,6 +63,87 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, return run_ctx.retval; } +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct sock *sk; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sk = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int 
__cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct socket *sock; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sock = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct cgroup *cgrp; + int ret = 0; + + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ + cgrp = task_dfl_cgroup(current); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +#ifdef CONFIG_BPF_LSM +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); +} +#else +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_LSM */ + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -163,10 +246,16 @@ static void cgroup_bpf_release(struct work_struct *work) hlist_for_each_entry_safe(pl, pltmp, progs, node) { hlist_del(&pl->node); - if (pl->prog) + if (pl->prog) { + if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); - if (pl->link) + } + if (pl->link) { + if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } @@ -479,6 +568,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog *new_prog = prog ? 
: link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; struct hlist_head *progs; @@ -495,7 +585,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - atype = to_cgroup_bpf_attach_type(type); + atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -549,17 +639,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; + if (type == BPF_LSM_CGROUP) { + err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + if (err) + goto cleanup; + } + err = update_effective_progs(cgrp, atype); if (err) - goto cleanup; + goto cleanup_trampoline; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); - else + } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); + } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; +cleanup_trampoline: + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(new_prog); + cleanup: if (old_prog) { pl->prog = old_prog; @@ -651,7 +754,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = to_cgroup_bpf_attach_type(link->type); + atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -803,9 +906,15 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; + u32 attach_btf_id = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); + if (prog) + attach_btf_id = prog->aux->attach_btf_id; + if (link) + attach_btf_id = link->link.prog->aux->attach_btf_id; + + atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL; @@ -839,8 +948,11 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); + } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; } @@ -999,6 +1111,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + if (cg_link->type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; @@ -1343,7 +1457,7 @@ BPF_CALL_0(bpf_get_retval) return ctx->retval; } -static const struct bpf_func_proto bpf_get_retval_proto = { +const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1358,7 +1472,7 @@ BPF_CALL_1(bpf_set_retval, int, retval) return 0; } -static const struct bpf_func_proto bpf_set_retval_proto = { +const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f023cb399e3f..4cc10b942a3c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2666,6 +2666,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; 
+const struct bpf_func_proto bpf_set_retval_proto __weak; +const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d5af5b99f0d..626b8f7d237b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3416,6 +3416,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; + case BPF_LSM_CGROUP: + return BPF_PROG_TYPE_LSM; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3469,6 +3471,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: + if (ptype == BPF_PROG_TYPE_LSM && + prog->expected_attach_type != BPF_LSM_CGROUP) + return -EINVAL; + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: @@ -3506,6 +3513,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; @@ -4540,6 +4548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_raw_tp_link_attach(prog, NULL); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); + else if (prog->expected_attach_type == BPF_LSM_CGROUP) + ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5466e15be61f..d7c251d7fbcd 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* dummy _ops. The verifier will operate on target program's ops. 
*/ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -496,6 +498,177 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin return err; } +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +static void bpf_shim_tramp_link_release(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ + if (!shim_link->trampoline) + return; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + bpf_trampoline_put(shim_link->trampoline); +} + +static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + kfree(shim_link); +} + +static const struct bpf_link_ops bpf_shim_tramp_link_lops = { + .release = bpf_shim_tramp_link_release, + .dealloc = bpf_shim_tramp_link_dealloc, +}; + +static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, + bpf_func_t bpf_func, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_prog *p; + + shim_link = kzalloc(sizeof(*shim_link), GFP_USER); + if (!shim_link) + return NULL; + + p = bpf_prog_alloc(1, 0); + if (!p) { + kfree(shim_link); + return NULL; + } + + p->jited = false; + p->bpf_func = bpf_func; + + p->aux->cgroup_atype = cgroup_atype; + p->aux->attach_func_proto = prog->aux->attach_func_proto; + p->aux->attach_btf_id = prog->aux->attach_btf_id; + p->aux->attach_btf = prog->aux->attach_btf; + btf_get(p->aux->attach_btf); + p->type = BPF_PROG_TYPE_LSM; + p->expected_attach_type = BPF_LSM_MAC; + bpf_prog_inc(p); + bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p); + + return shim_link; +} + +static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, + bpf_func_t bpf_func) +{ + struct bpf_tramp_link *link; + int kind; + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = link->link.prog; + + if (p->bpf_func == bpf_func) + return container_of(link, struct bpf_shim_tramp_link, link); + } + } + + return NULL; +} + +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_attach_target_info tgt_info = {}; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + int err; + + err = bpf_check_attach_target(NULL, prog, NULL, + prog->aux->attach_btf_id, + &tgt_info); + if (err) + return err; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + mutex_lock(&tr->mutex); + + shim_link = cgroup_shim_find(tr, bpf_func); + if (shim_link) { + /* Reusing existing shim attached by the other program. */ + bpf_link_inc(&shim_link->link.link); + + mutex_unlock(&tr->mutex); + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return 0; + } + + /* Allocate and install new shim. 
*/ + + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + if (!shim_link) { + err = -ENOMEM; + goto err; + } + + err = __bpf_trampoline_link_prog(&shim_link->link, tr); + if (err) + goto err; + + shim_link->trampoline = tr; + /* note, we're still holding tr refcnt from above */ + + mutex_unlock(&tr->mutex); + + return 0; +err: + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + /* have to release tr while _not_ holding its mutex */ + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + + return err; +} + +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_lookup(key); + if (WARN_ON_ONCE(!tr)) + return; + + mutex_lock(&tr->mutex); + shim_link = cgroup_shim_find(tr, bpf_func); + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ +} +#endif + struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { @@ -628,6 +801,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + /* Runtime stats are exported via actual BPF_LSM_CGROUP + * programs, not the shims. + */ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return NO_START_TIME; +} + +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + migrate_enable(); + rcu_read_unlock(); +} + u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4938477912cd..df3ec6b05f05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7322,6 +7322,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES; } + break; + case BPF_FUNC_set_retval: + if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); + return -EINVAL; + } + } + break; } if (err) @@ -10527,6 +10539,22 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SK_LOOKUP: range = tnum_range(SK_DROP, SK_PASS); break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* Regular BPF_PROG_TYPE_LSM programs can return + * any value. + */ + return 0; + } + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + range = tnum_range(1, 1); + } + break; + case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. 
@@ -10543,6 +10571,9 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + if (prog->expected_attach_type == BPF_LSM_CGROUP && + !prog->aux->attach_func_proto->type) + verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } @@ -14902,6 +14933,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: + case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e81362891596..b7479898c879 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -998,6 +998,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 3b34bcb946c2a8240ef6761be2ee404ceb7e1079 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:10 -0700 Subject: tools/bpf: Sync btf_ids.h to tools Has been slowly getting out of sync, let's update it. resolve_btfids usage has been updated to match the header changes. Also bring new parts of tools/include/uapi/linux/bpf.h. Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-8-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/include/linux/btf_ids.h | 35 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 3 ++ .../selftests/bpf/prog_tests/resolve_btfids.c | 2 +- 3 files changed, 32 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 57890b357f85..71e54b1e3796 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -73,7 +73,7 @@ asm( \ __BTF_ID_LIST(name, local) \ extern u32 name[]; -#define BTF_ID_LIST_GLOBAL(name) \ +#define BTF_ID_LIST_GLOBAL(name, n) \ __BTF_ID_LIST(name, globl) /* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with @@ -82,6 +82,9 @@ __BTF_ID_LIST(name, globl) #define BTF_ID_LIST_SINGLE(name, prefix, typename) \ BTF_ID_LIST(name) \ BTF_ID(prefix, typename) +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \ + BTF_ID_LIST_GLOBAL(name, 1) \ + BTF_ID(prefix, typename) /* * The BTF_ID_UNUSED macro defines 4 zero bytes. 
@@ -143,13 +146,14 @@ extern struct btf_id_set name; #else -#define BTF_ID_LIST(name) static u32 name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name) u32 name[1]; -#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; -#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; -#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 __maybe_unused name[1]; +#define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ @@ -172,7 +176,10 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCKET, socket) enum { #define BTF_SOCK_TYPE(name, str) name, @@ -184,4 +191,18 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif +#define BTF_TRACING_TYPE_xxx \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_TASK, task_struct) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_FILE, file) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct) + +enum { +#define BTF_TRACING_TYPE(name, type) name, +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE +MAX_BTF_TRACING_TYPE, +}; + +extern u32 btf_tracing_ids[]; + #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b7479898c879..ad9e7311c4cf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1432,6 +1432,7 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; + __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ @@ -6076,6 +6077,8 @@ struct bpf_prog_info { __u64 run_cnt; __u64 recursion_misses; __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index f4a13d9dd5c8..c197261d02e2 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -44,7 +44,7 @@ BTF_ID(union, U) BTF_ID(func, func) extern __u32 test_list_global[]; -BTF_ID_LIST_GLOBAL(test_list_global) +BTF_ID_LIST_GLOBAL(test_list_global, 1) BTF_ID_UNUSED BTF_ID(typedef, S) BTF_ID(typedef, T) -- cgit v1.2.3 From bffcf34878b1c16c322c6606e4a48d82bd5f7f24 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:11 -0700 Subject: libbpf: add lsm_cgroup_sock type lsm_cgroup/ is the prefix for BPF_LSM_CGROUP.
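For illustration, a minimal sketch of a program using the new section prefix; the hook name and the allow/deny return convention follow the lsm_cgroup selftest added later in this series, while the program and file names are made up:

	/* minimal_lsm_cgroup.c: hypothetical example, not part of this patch */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	/* The "lsm_cgroup/" prefix resolves the BTF id of the named LSM hook
	 * (SEC_ATTACH_BTF) and sets expected_attach_type = BPF_LSM_CGROUP.
	 */
	SEC("lsm_cgroup/socket_post_create")
	int BPF_PROG(allow_all, struct socket *sock, int family, int type,
		     int protocol, int kern)
	{
		return 1; /* 1 = allow; 0 becomes -EPERM for this cgroup */
	}

Note the section is registered without an auto-attach callback, so attachment goes through the cgroup attach path (bpf_prog_attach/bpf_link_create with a cgroup fd) rather than skeleton auto-attach.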
Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-9-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e994797bcd48..8a45a84eb9b2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -106,6 +106,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_FEXIT] = "trace_fexit", [BPF_MODIFY_RETURN] = "modify_return", [BPF_LSM_MAC] = "lsm_mac", + [BPF_LSM_CGROUP] = "lsm_cgroup", [BPF_SK_LOOKUP] = "sk_lookup", [BPF_TRACE_ITER] = "trace_iter", [BPF_XDP_DEVMAP] = "xdp_devmap", @@ -8414,6 +8415,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), + SEC_DEF("lsm_cgroup+", LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF), SEC_DEF("iter+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter), SEC_DEF("iter.s+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter), SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), @@ -8851,6 +8853,7 @@ void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, *kind = BTF_KIND_TYPEDEF; break; case BPF_LSM_MAC: + case BPF_LSM_CGROUP: *prefix = BTF_LSM_PREFIX; *kind = BTF_KIND_FUNC; break; -- cgit v1.2.3 From a4b2f3cf699f8ec3964722e7ef3f37f809701172 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:12 -0700 Subject: libbpf: implement bpf_prog_query_opts Implement bpf_prog_query_opts as a more extensible version of bpf_prog_query.
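For illustration, a sketch of the new call pattern, using the prog_attach_flags output array described below (the surrounding function, cgroup fd and array size are made up; error handling is minimal):

	/* hypothetical caller, not part of this patch */
	#include <bpf/bpf.h>

	static int count_cgroup_progs(int cgroup_fd, enum bpf_attach_type type)
	{
		__u32 prog_ids[64] = {};
		__u32 prog_attach_flags[64] = {};
		LIBBPF_OPTS(bpf_prog_query_opts, opts);

		opts.prog_ids = prog_ids;
		opts.prog_attach_flags = prog_attach_flags;
		opts.prog_cnt = 64;	/* in: array capacity, out: number attached */

		if (bpf_prog_query_opts(cgroup_fd, type, &opts))
			return -1;

		/* prog_attach_flags[i] now holds the per-program flags for
		 * prog_ids[i]; opts.attach_flags keeps the legacy aggregate.
		 */
		return opts.prog_cnt;
	}

Because the opts struct leads with its size (sz), fields such as prog_attach_flags can keep being appended without breaking existing callers.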
Expose new prog_attach_flags and attach_btf_func_id as well: * prog_attach_flags returns per-program attach_flags; relevant only for lsm cgroup programs, which might have different attach_flags per attach_btf_id * attach_btf_func_id is a new field exposed for prog_query which specifies the real btf function id for lsm cgroup attachments Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-10-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 38 +++++++++++++++++++++++++++++++------- tools/lib/bpf/bpf.h | 15 +++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 47 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 6d848b02c1e0..5eb0df90eb2b 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -795,24 +795,48 @@ int bpf_iter_create(int link_fd) return libbpf_err_errno(fd); } -int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, - __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts) { union bpf_attr attr; int ret; + if (!OPTS_VALID(opts, bpf_prog_query_opts)) + return libbpf_err(-EINVAL); + memset(&attr, 0, sizeof(attr)); + attr.query.target_fd = target_fd; attr.query.attach_type = type; - attr.query.query_flags = query_flags; - attr.query.prog_cnt = *prog_cnt; - attr.query.prog_ids = ptr_to_u64(prog_ids); + attr.query.query_flags = OPTS_GET(opts, query_flags, 0); + attr.query.prog_cnt = OPTS_GET(opts, prog_cnt, 0); + attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); + attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr)); + OPTS_SET(opts, attach_flags, attr.query.attach_flags); + OPTS_SET(opts, prog_cnt, attr.query.prog_cnt); + + return libbpf_err_errno(ret); +} + +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +{ + LIBBPF_OPTS(bpf_prog_query_opts, opts); + int ret; + + opts.query_flags = query_flags; + opts.prog_ids = prog_ids; + opts.prog_cnt = *prog_cnt; + + ret = bpf_prog_query_opts(target_fd, type, &opts); + if (attach_flags) - *attach_flags = attr.query.attach_flags; - *prog_cnt = attr.query.prog_cnt; + *attach_flags = opts.attach_flags; + *prog_cnt = opts.prog_cnt; return libbpf_err_errno(ret); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index af7baf5da74d..88a7cc4bd76f 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -359,9 +359,24 @@ LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); + +struct bpf_prog_query_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 query_flags; + __u32 attach_flags; /* output argument */ + __u32 *prog_ids; + __u32 prog_cnt; /* input+output argument */ + __u32 *prog_attach_flags; +}; +#define bpf_prog_query_opts__last_field prog_attach_flags + +LIBBPF_API int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts); LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); + LIBBPF_API int
bpf_raw_tracepoint_open(const char *name, int prog_fd); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 43c6697a6d27..94b589ecfeaa 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -355,6 +355,7 @@ LIBBPF_0.8.0 { LIBBPF_1.0.0 { global: + bpf_prog_query_opts; btf__add_enum64; btf__add_enum64_value; libbpf_bpf_attach_type_str; -- cgit v1.2.3 From 596f5fb2ea2a38032abb8043606ce7168f21f6ab Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:13 -0700 Subject: bpftool: implement cgroup tree for BPF_LSM_CGROUP $ bpftool --nomount prog loadall $KDIR/tools/testing/selftests/bpf/lsm_cgroup.o /sys/fs/bpf/x $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_alloc $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_bind $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_clone $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_post_create $ bpftool cgroup tree CgroupPath ID AttachType AttachFlags Name /sys/fs/cgroup 6 lsm_cgroup socket_post_create bpf_lsm_socket_post_create 8 lsm_cgroup socket_bind bpf_lsm_socket_bind 10 lsm_cgroup socket_alloc bpf_lsm_sk_alloc_security 11 lsm_cgroup socket_clone bpf_lsm_inet_csk_clone $ bpftool cgroup detach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_post_create $ bpftool cgroup tree CgroupPath ID AttachType AttachFlags Name /sys/fs/cgroup 8 lsm_cgroup socket_bind bpf_lsm_socket_bind 10 lsm_cgroup socket_alloc bpf_lsm_sk_alloc_security 11 lsm_cgroup socket_clone bpf_lsm_inet_csk_clone Reviewed-by: Quentin Monnet Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-11-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/cgroup.c | 109 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 42421fe47a58..cced668fb2a3 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -15,6 +15,7 @@ #include #include +#include #include "main.h" @@ -36,6 +37,8 @@ " cgroup_inet_sock_release }" static unsigned int query_flags; +static struct btf *btf_vmlinux; +static __u32 btf_vmlinux_id; static enum bpf_attach_type parse_attach_type(const char *str) { @@ -64,11 +67,38 @@ static enum bpf_attach_type parse_attach_type(const char *str) return __MAX_BPF_ATTACH_TYPE; } +static void guess_vmlinux_btf_id(__u32 attach_btf_obj_id) +{ + struct bpf_btf_info btf_info = {}; + __u32 btf_len = sizeof(btf_info); + char name[16] = {}; + int err; + int fd; + + btf_info.name = ptr_to_u64(name); + btf_info.name_len = sizeof(name); + + fd = bpf_btf_get_fd_by_id(attach_btf_obj_id); + if (fd < 0) + return; + + err = bpf_obj_get_info_by_fd(fd, &btf_info, &btf_len); + if (err) + goto out; + + if (btf_info.kernel_btf && strncmp(name, "vmlinux", sizeof(name)) == 0) + btf_vmlinux_id = btf_info.id; + +out: + close(fd); +} + static int show_bpf_prog(int id, enum bpf_attach_type attach_type, const char *attach_flags_str, int level) { char prog_name[MAX_PROG_FULL_NAME]; + const char *attach_btf_name = NULL; struct bpf_prog_info info = {}; const char *attach_type_str; __u32 info_len = sizeof(info); @@ -84,6 +114,20 @@ static int show_bpf_prog(int id, enum bpf_attach_type 
attach_type, } attach_type_str = libbpf_bpf_attach_type_str(attach_type); + + if (btf_vmlinux) { + if (!btf_vmlinux_id) + guess_vmlinux_btf_id(info.attach_btf_obj_id); + + if (btf_vmlinux_id == info.attach_btf_obj_id && + info.attach_btf_id < btf__type_cnt(btf_vmlinux)) { + const struct btf_type *t = + btf__type_by_id(btf_vmlinux, info.attach_btf_id); + attach_btf_name = + btf__name_by_offset(btf_vmlinux, t->name_off); + } + } + get_prog_full_name(&info, prog_fd, prog_name, sizeof(prog_name)); if (json_output) { jsonw_start_object(json_wtr); @@ -95,6 +139,10 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, jsonw_string_field(json_wtr, "attach_flags", attach_flags_str); jsonw_string_field(json_wtr, "name", prog_name); + if (attach_btf_name) + jsonw_string_field(json_wtr, "attach_btf_name", attach_btf_name); + jsonw_uint_field(json_wtr, "attach_btf_obj_id", info.attach_btf_obj_id); + jsonw_uint_field(json_wtr, "attach_btf_id", info.attach_btf_id); jsonw_end_object(json_wtr); } else { printf("%s%-8u ", level ? " " : "", info.id); @@ -102,7 +150,13 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, printf("%-15s", attach_type_str); else printf("type %-10u", attach_type); - printf(" %-15s %-15s\n", attach_flags_str, prog_name); + printf(" %-15s %-15s", attach_flags_str, prog_name); + if (attach_btf_name) + printf(" %-15s", attach_btf_name); + else if (info.attach_btf_id) + printf(" attach_btf_obj_id=%d attach_btf_id=%d", + info.attach_btf_obj_id, info.attach_btf_id); + printf("\n"); } close(prog_fd); @@ -144,40 +198,49 @@ static int cgroup_has_attached_progs(int cgroup_fd) static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, int level) { + LIBBPF_OPTS(bpf_prog_query_opts, p); + __u32 prog_attach_flags[1024] = {0}; const char *attach_flags_str; __u32 prog_ids[1024] = {0}; - __u32 prog_cnt, iter; - __u32 attach_flags; char buf[32]; + __u32 iter; int ret; - prog_cnt = ARRAY_SIZE(prog_ids); - ret = bpf_prog_query(cgroup_fd, type, query_flags, &attach_flags, - prog_ids, &prog_cnt); + p.query_flags = query_flags; + p.prog_cnt = ARRAY_SIZE(prog_ids); + p.prog_ids = prog_ids; + p.prog_attach_flags = prog_attach_flags; + + ret = bpf_prog_query_opts(cgroup_fd, type, &p); if (ret) return ret; - if (prog_cnt == 0) + if (p.prog_cnt == 0) return 0; - switch (attach_flags) { - case BPF_F_ALLOW_MULTI: - attach_flags_str = "multi"; - break; - case BPF_F_ALLOW_OVERRIDE: - attach_flags_str = "override"; - break; - case 0: - attach_flags_str = ""; - break; - default: - snprintf(buf, sizeof(buf), "unknown(%x)", attach_flags); - attach_flags_str = buf; - } + for (iter = 0; iter < p.prog_cnt; iter++) { + __u32 attach_flags; + + attach_flags = prog_attach_flags[iter] ?: p.attach_flags; + + switch (attach_flags) { + case BPF_F_ALLOW_MULTI: + attach_flags_str = "multi"; + break; + case BPF_F_ALLOW_OVERRIDE: + attach_flags_str = "override"; + break; + case 0: + attach_flags_str = ""; + break; + default: + snprintf(buf, sizeof(buf), "unknown(%x)", attach_flags); + attach_flags_str = buf; + } - for (iter = 0; iter < prog_cnt; iter++) show_bpf_prog(prog_ids[iter], type, attach_flags_str, level); + } return 0; } @@ -233,6 +296,7 @@ static int do_show(int argc, char **argv) printf("%-8s %-15s %-15s %-15s\n", "ID", "AttachType", "AttachFlags", "Name"); + btf_vmlinux = libbpf_find_kernel_btf(); for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { /* * Not all attach types may be supported, so it's expected, @@ -296,6 +360,7 @@ static int do_show_tree_fn(const 
char *fpath, const struct stat *sb, printf("%s\n", fpath); } + btf_vmlinux = libbpf_find_kernel_btf(); for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) show_attached_bpf_progs(cgroup_fd, type, ftw->level); -- cgit v1.2.3 From dca85aac8895708b74c7f2264e3ab5048c02b8b2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:14 -0700 Subject: selftests/bpf: lsm_cgroup functional test Functional test that exercises the following: 1. apply default sk_priority policy 2. permit TX-only AF_PACKET socket 3. cgroup attach/detach/replace 4. reusing trampoline shim Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220628174314.1216643-12-sdf@google.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/lsm_cgroup.c | 293 +++++++++++++++++++++ .../testing/selftests/bpf/progs/bpf_tracing_net.h | 1 + tools/testing/selftests/bpf/progs/lsm_cgroup.c | 180 +++++++++++++ 3 files changed, 474 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c create mode 100644 tools/testing/selftests/bpf/progs/lsm_cgroup.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c new file mode 100644 index 000000000000..d40810a742fa --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include "lsm_cgroup.skel.h" +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static struct btf *btf; + +static __u32 query_prog_cnt(int cgroup_fd, const char *attach_func) +{ + LIBBPF_OPTS(bpf_prog_query_opts, p); + int cnt = 0; + int i; + + ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_LSM_CGROUP, &p), "prog_query"); + + if (!attach_func) + return p.prog_cnt; + + /* When attach_func is provided, count the number of progs that + * attach to the given symbol. 
+ */ + + if (!btf) + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK(libbpf_get_error(btf), "btf_vmlinux")) + return -1; + + p.prog_ids = malloc(sizeof(u32) * p.prog_cnt); + p.prog_attach_flags = malloc(sizeof(u32) * p.prog_cnt); + ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_LSM_CGROUP, &p), "prog_query"); + + for (i = 0; i < p.prog_cnt; i++) { + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + int fd; + + fd = bpf_prog_get_fd_by_id(p.prog_ids[i]); + ASSERT_GE(fd, 0, "prog_get_fd_by_id"); + ASSERT_OK(bpf_obj_get_info_by_fd(fd, &info, &info_len), "prog_info_by_fd"); + close(fd); + + if (info.attach_btf_id == + btf__find_by_name_kind(btf, attach_func, BTF_KIND_FUNC)) + cnt++; + } + + free(p.prog_ids); + free(p.prog_attach_flags); + + return cnt; +} + +static void test_lsm_cgroup_functional(void) +{ + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, attach_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int cgroup_fd = -1, cgroup_fd2 = -1, cgroup_fd3 = -1; + int listen_fd, client_fd, accepted_fd; + struct lsm_cgroup *skel = NULL; + int post_create_prog_fd2 = -1; + int post_create_prog_fd = -1; + int bind_link_fd2 = -1; + int bind_prog_fd2 = -1; + int alloc_prog_fd = -1; + int bind_prog_fd = -1; + int bind_link_fd = -1; + int clone_prog_fd = -1; + int err, fd, prio; + socklen_t socklen; + + cgroup_fd3 = test__join_cgroup("/sock_policy_empty"); + if (!ASSERT_GE(cgroup_fd3, 0, "create empty cgroup")) + goto close_cgroup; + + cgroup_fd2 = test__join_cgroup("/sock_policy_reuse"); + if (!ASSERT_GE(cgroup_fd2, 0, "create cgroup for reuse")) + goto close_cgroup; + + cgroup_fd = test__join_cgroup("/sock_policy"); + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) + goto close_cgroup; + + skel = lsm_cgroup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + goto close_cgroup; + + post_create_prog_fd = bpf_program__fd(skel->progs.socket_post_create); + post_create_prog_fd2 = bpf_program__fd(skel->progs.socket_post_create2); + bind_prog_fd = bpf_program__fd(skel->progs.socket_bind); + bind_prog_fd2 = bpf_program__fd(skel->progs.socket_bind2); + alloc_prog_fd = bpf_program__fd(skel->progs.socket_alloc); + clone_prog_fd = bpf_program__fd(skel->progs.socket_clone); + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 0, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 0, "total prog count"); + err = bpf_prog_attach(alloc_prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0); + if (!ASSERT_OK(err, "attach alloc_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 1, "total prog count"); + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_inet_csk_clone"), 0, "prog count"); + err = bpf_prog_attach(clone_prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0); + if (!ASSERT_OK(err, "attach clone_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_inet_csk_clone"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 2, "total prog count"); + + /* Make sure replacing works. 
*/ + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_post_create"), 0, "prog count"); + err = bpf_prog_attach(post_create_prog_fd, cgroup_fd, + BPF_LSM_CGROUP, 0); + if (!ASSERT_OK(err, "attach post_create_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_post_create"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 3, "total prog count"); + + attach_opts.replace_prog_fd = post_create_prog_fd; + err = bpf_prog_attach_opts(post_create_prog_fd2, cgroup_fd, + BPF_LSM_CGROUP, &attach_opts); + if (!ASSERT_OK(err, "prog replace post_create_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_post_create"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 3, "total prog count"); + + /* Try the same attach/replace via link API. */ + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 0, "prog count"); + bind_link_fd = bpf_link_create(bind_prog_fd, cgroup_fd, + BPF_LSM_CGROUP, NULL); + if (!ASSERT_GE(bind_link_fd, 0, "link create bind_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 4, "total prog count"); + + update_opts.old_prog_fd = bind_prog_fd; + update_opts.flags = BPF_F_REPLACE; + + err = bpf_link_update(bind_link_fd, bind_prog_fd2, &update_opts); + if (!ASSERT_OK(err, "link update bind_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 4, "total prog count"); + + /* Attach another instance of bind program to another cgroup. + * This should trigger the reuse of the trampoline shim (two + * programs attaching to the same btf_id). + */ + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd2, "bpf_lsm_socket_bind"), 0, "prog count"); + bind_link_fd2 = bpf_link_create(bind_prog_fd2, cgroup_fd2, + BPF_LSM_CGROUP, NULL); + if (!ASSERT_GE(bind_link_fd2, 0, "link create bind_prog_fd2")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd2, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 4, "total prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd2, NULL), 1, "total prog count"); + + /* AF_UNIX is prohibited. */ + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LT(fd, 0, "socket(AF_UNIX)"); + close(fd); + + /* AF_INET6 gets default policy (sk_priority). */ + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(fd, 0, "socket(SOCK_STREAM)")) + goto detach_cgroup; + + prio = 0; + socklen = sizeof(prio); + ASSERT_GE(getsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, &socklen), 0, + "getsockopt"); + ASSERT_EQ(prio, 123, "sk_priority"); + + close(fd); + + /* TX-only AF_PACKET is allowed. */ + + ASSERT_LT(socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)), 0, + "socket(AF_PACKET, ..., ETH_P_ALL)"); + + fd = socket(AF_PACKET, SOCK_RAW, 0); + ASSERT_GE(fd, 0, "socket(AF_PACKET, ..., 0)"); + + /* TX-only AF_PACKET can not be rebound. */ + + struct sockaddr_ll sa = { + .sll_family = AF_PACKET, + .sll_protocol = htons(ETH_P_ALL), + }; + ASSERT_LT(bind(fd, (struct sockaddr *)&sa, sizeof(sa)), 0, + "bind(ETH_P_ALL)"); + + close(fd); + + /* Trigger passive open. 
*/ + + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + ASSERT_GE(listen_fd, 0, "start_server"); + client_fd = connect_to_fd(listen_fd, 0); + ASSERT_GE(client_fd, 0, "connect_to_fd"); + accepted_fd = accept(listen_fd, NULL, NULL); + ASSERT_GE(accepted_fd, 0, "accept"); + + prio = 0; + socklen = sizeof(prio); + ASSERT_GE(getsockopt(accepted_fd, SOL_SOCKET, SO_PRIORITY, &prio, &socklen), 0, + "getsockopt"); + ASSERT_EQ(prio, 234, "sk_priority"); + + /* These are replaced and never called. */ + ASSERT_EQ(skel->bss->called_socket_post_create, 0, "called_create"); + ASSERT_EQ(skel->bss->called_socket_bind, 0, "called_bind"); + + /* AF_INET6+SOCK_STREAM + * AF_PACKET+SOCK_RAW + * listen_fd + * client_fd + * accepted_fd + */ + ASSERT_EQ(skel->bss->called_socket_post_create2, 5, "called_create2"); + + /* start_server + * bind(ETH_P_ALL) + */ + ASSERT_EQ(skel->bss->called_socket_bind2, 2, "called_bind2"); + /* Single accept(). */ + ASSERT_EQ(skel->bss->called_socket_clone, 1, "called_clone"); + + /* AF_UNIX+SOCK_STREAM (failed) + * AF_INET6+SOCK_STREAM + * AF_PACKET+SOCK_RAW (failed) + * AF_PACKET+SOCK_RAW + * listen_fd + * client_fd + * accepted_fd + */ + ASSERT_EQ(skel->bss->called_socket_alloc, 7, "called_alloc"); + + close(listen_fd); + close(client_fd); + close(accepted_fd); + + /* Make sure other cgroup doesn't trigger the programs. */ + + if (!ASSERT_OK(join_cgroup("/sock_policy_empty"), "join root cgroup")) + goto detach_cgroup; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(fd, 0, "socket(SOCK_STREAM)")) + goto detach_cgroup; + + prio = 0; + socklen = sizeof(prio); + ASSERT_GE(getsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, &socklen), 0, + "getsockopt"); + ASSERT_EQ(prio, 0, "sk_priority"); + + close(fd); + +detach_cgroup: + ASSERT_GE(bpf_prog_detach2(post_create_prog_fd2, cgroup_fd, + BPF_LSM_CGROUP), 0, "detach_create"); + close(bind_link_fd); + /* Don't close bind_link_fd2, exercise cgroup release cleanup. 
*/ + ASSERT_GE(bpf_prog_detach2(alloc_prog_fd, cgroup_fd, + BPF_LSM_CGROUP), 0, "detach_alloc"); + ASSERT_GE(bpf_prog_detach2(clone_prog_fd, cgroup_fd, + BPF_LSM_CGROUP), 0, "detach_clone"); + +close_cgroup: + close(cgroup_fd); + close(cgroup_fd2); + close(cgroup_fd3); + lsm_cgroup__destroy(skel); +} + +void test_lsm_cgroup(void) +{ + if (test__start_subtest("functional")) + test_lsm_cgroup_functional(); + btf__free(btf); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 1c1289ba5fc5..98dd2c4815f0 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -8,6 +8,7 @@ #define SOL_SOCKET 1 #define SO_SNDBUF 7 #define __SO_ACCEPTCON (1 << 16) +#define SO_PRIORITY 12 #define SOL_TCP 6 #define TCP_CONGESTION 13 diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c new file mode 100644 index 000000000000..89f3b1e961a8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#ifndef AF_PACKET +#define AF_PACKET 17 +#endif + +#ifndef AF_UNIX +#define AF_UNIX 1 +#endif + +#ifndef EPERM +#define EPERM 1 +#endif + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, __u64); + __type(value, __u64); +} cgroup_storage SEC(".maps"); + +int called_socket_post_create; +int called_socket_post_create2; +int called_socket_bind; +int called_socket_bind2; +int called_socket_alloc; +int called_socket_clone; + +static __always_inline int test_local_storage(void) +{ + __u64 *val; + + val = bpf_get_local_storage(&cgroup_storage, 0); + if (!val) + return 0; + *val += 1; + + return 1; +} + +static __always_inline int real_create(struct socket *sock, int family, + int protocol) +{ + struct sock *sk; + int prio = 123; + + /* Reject non-tx-only AF_PACKET. */ + if (family == AF_PACKET && protocol != 0) + return 0; /* EPERM */ + + sk = sock->sk; + if (!sk) + return 1; + + /* The rest of the sockets get default policy. */ + if (bpf_setsockopt(sk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + + /* Make sure bpf_getsockopt is allowed and works. */ + prio = 0; + if (bpf_getsockopt(sk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + if (prio != 123) + return 0; /* EPERM */ + + /* Can access cgroup local storage. */ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_post_create") +int BPF_PROG(socket_post_create, struct socket *sock, int family, + int type, int protocol, int kern) +{ + called_socket_post_create++; + return real_create(sock, family, protocol); +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_post_create") +int BPF_PROG(socket_post_create2, struct socket *sock, int family, + int type, int protocol, int kern) +{ + called_socket_post_create2++; + return real_create(sock, family, protocol); +} + +static __always_inline int real_bind(struct socket *sock, + struct sockaddr *address, + int addrlen) +{ + struct sockaddr_ll sa = {}; + + if (sock->sk->__sk_common.skc_family != AF_PACKET) + return 1; + + if (sock->sk->sk_kern_sock) + return 1; + + bpf_probe_read_kernel(&sa, sizeof(sa), address); + if (sa.sll_protocol) + return 0; /* EPERM */ + + /* Can access cgroup local storage. 
*/ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_bind") +int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, + int addrlen) +{ + called_socket_bind++; + return real_bind(sock, address, addrlen); +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_bind") +int BPF_PROG(socket_bind2, struct socket *sock, struct sockaddr *address, + int addrlen) +{ + called_socket_bind2++; + return real_bind(sock, address, addrlen); +} + +/* __cgroup_bpf_run_lsm_current (via bpf_lsm_current_hooks) */ +SEC("lsm_cgroup/sk_alloc_security") +int BPF_PROG(socket_alloc, struct sock *sk, int family, gfp_t priority) +{ + called_socket_alloc++; + if (family == AF_UNIX) + return 0; /* EPERM */ + + /* Can access cgroup local storage. */ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} + +/* __cgroup_bpf_run_lsm_sock */ +SEC("lsm_cgroup/inet_csk_clone") +int BPF_PROG(socket_clone, struct sock *newsk, const struct request_sock *req) +{ + int prio = 234; + + called_socket_clone++; + + if (!newsk) + return 1; + + /* Accepted request sockets get a different priority. */ + if (bpf_setsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + + /* Make sure bpf_getsockopt is allowed and works. */ + prio = 0; + if (bpf_getsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + if (prio != 234) + return 0; /* EPERM */ + + /* Can access cgroup local storage. */ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} -- cgit v1.2.3 From 4c41cb46a732fe82575df0015b7d0d8a52efb59b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Jun 2022 11:25:02 -0700 Subject: perf python: Prefer python3 The PYTHON_AUTO code orders the preference for the PYTHON command to be python2, python and then python3. python3 makes a more logical preference as python2 is no longer supported: https://www.python.org/doc/sunset-python-2/ Reorder the priority of the PYTHON command to be python3, python and then python2. Reported-by: John Garry Signed-off-by: Ian Rogers Tested-by: John Garry Tested-by: Thomas Richter Tested-by: Xing Zhengjun Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Andi Kleen Cc: Andrew Kilroy Cc: Caleb Biggers Cc: Felix Fietkau Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Like Xu Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Nick Forrington Cc: Paul Clarke Cc: Perry Taylor Cc: Peter Zijlstra Cc: Qi Liu Cc: Ravi Bangoria Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Will Deacon Link: https://lore.kernel.org/r/20220629182505.406269-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 73e0762092fe..16c1a87444b8 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -241,15 +241,15 @@ endif # Try different combinations to accommodate systems that only have # python[2][3]-config in weird combinations in the following order of # priority from lowest to highest: -# * python3-config -# * python-config # * python2-config as per pep-0394.
+# * python-config +# * python3-config # * $(PYTHON)-config (If PYTHON is user supplied but PYTHON_CONFIG isn't) # PYTHON_AUTO := python-config -PYTHON_AUTO := $(if $(call get-executable,python3-config),python3-config,$(PYTHON_AUTO)) -PYTHON_AUTO := $(if $(call get-executable,python-config),python-config,$(PYTHON_AUTO)) PYTHON_AUTO := $(if $(call get-executable,python2-config),python2-config,$(PYTHON_AUTO)) +PYTHON_AUTO := $(if $(call get-executable,python-config),python-config,$(PYTHON_AUTO)) +PYTHON_AUTO := $(if $(call get-executable,python3-config),python3-config,$(PYTHON_AUTO)) # If PYTHON is defined but PYTHON_CONFIG isn't, then take $(PYTHON)-config as if it was the user # supplied value for PYTHON_CONFIG. Because it's "user supplied", error out if it doesn't exist. -- cgit v1.2.3 From f0cf642c56b76dfbbb5f2be67fa180191d5ab0ef Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Jun 2022 12:13:51 +0100 Subject: bpftool: Probe for memcg-based accounting before bumping rlimit Bpftool used to bump the memlock rlimit to make sure to be able to load BPF objects. After the kernel has switched to memcg-based memory accounting [0] in 5.11, bpftool has relied on libbpf to probe the system for memcg-based accounting support and for raising the rlimit if necessary [1]. But this was later reverted, because the probe would sometimes fail, resulting in bpftool not being able to load all required objects [2]. Here we add a more efficient probe, in bpftool itself. We first lower the rlimit to 0, then we attempt to load a BPF object (and finally reset the rlimit): if the load succeeds, then memcg-based memory accounting is supported. This approach was earlier proposed for the probe in libbpf itself [3], but given that the library may be used in multithreaded applications, the probe could have undesirable consequences if one thread attempts to lock kernel memory while memlock rlimit is at 0. Since bpftool is single-threaded and the rlimit is process-based, this is fine to do in bpftool itself. This probe was inspired by the similar one from the cilium/ebpf Go library [4]. [0] commit 97306be45fbe ("Merge branch 'switch to memcg-based memory accounting'") [1] commit a777e18f1bcd ("bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK") [2] commit 6b4384ff1088 ("Revert "bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK"") [3] https://lore.kernel.org/bpf/20220609143614.97837-1-quentin@isovalent.com/t/#u [4] https://github.com/cilium/ebpf/blob/v0.9.0/rlimit/rlimit.go#L39 Suggested-by: Daniel Borkmann Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: Yafang Shao Link: https://lore.kernel.org/bpf/20220629111351.47699-1-quentin@isovalent.com --- tools/bpf/bpftool/common.c | 71 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index a0d4acd7c54a..fc8172a4969a 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -13,14 +13,17 @@ #include #include #include -#include -#include #include #include #include #include #include +#include +#include +#include +#include + #include #include #include /* libbpf_num_possible_cpus */ @@ -73,11 +76,73 @@ static bool is_bpffs(char *path) return (unsigned long)st_fs.f_type == BPF_FS_MAGIC; } +/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to + * memcg-based memory accounting for BPF maps and programs. 
This was done in + * commit 97306be45fbe ("Merge branch 'switch to memcg-based memory + * accounting'"), in Linux 5.11. + * + * Libbpf also offers to probe for memcg-based accounting vs rlimit, but does + * so by checking for the availability of a given BPF helper and this has + * failed on some kernels with backports in the past, see commit 6b4384ff1088 + * ("Revert "bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK""). + * Instead, we can probe by lowering the process-based rlimit to 0, trying to + * load a BPF object, and resetting the rlimit. If the load succeeds then + * memcg-based accounting is supported. + * + * This would be too dangerous to do in the library, because multithreaded + * applications might attempt to load items while the rlimit is at 0. Given + * that bpftool is single-threaded, this is fine to do here. + */ +static bool known_to_need_rlimit(void) +{ + struct rlimit rlim_init, rlim_cur_zero = {}; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + size_t insn_cnt = ARRAY_SIZE(insns); + union bpf_attr attr; + int prog_fd, err; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insn_cnt; + attr.license = ptr_to_u64("GPL"); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim_init)) + return false; + + /* Drop the soft limit to zero. We maintain the hard limit to its + * current value, because lowering it would be a permanent operation + * for unprivileged users. + */ + rlim_cur_zero.rlim_max = rlim_init.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rlim_cur_zero)) + return false; + + /* Do not use bpf_prog_load() from libbpf here, because it calls + * bump_rlimit_memlock(), interfering with the current probe. + */ + prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + err = errno; + + /* reset soft rlimit to its initial value */ + setrlimit(RLIMIT_MEMLOCK, &rlim_init); + + if (prog_fd < 0) + return err == EPERM; + + close(prog_fd); + return false; +} + void set_max_rlimit(void) { struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; - setrlimit(RLIMIT_MEMLOCK, &rinf); + if (known_to_need_rlimit()) + setrlimit(RLIMIT_MEMLOCK, &rinf); } static int -- cgit v1.2.3 From ffc606ada3d7d8b200a27fc0c3a1cf0e000d75b2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Jun 2022 11:25:03 -0700 Subject: perf jevents: Add python converter script jevents.c is large, has a dependency on an old forked version of jsmn, and is challenging to work upon. A lot of jevents.c's complexity comes from needing to write json and csv parsing from first principles. In contrast python has this functionality in standard libraries and is already a build pre-requisite for tools like asciidoc (that builds all of the perf man pages). Introduce jevents.py that produces identical output to jevents.c. Add a test that runs both converter tools and validates there are no output differences. The test can be invoked with a phony build target like: $ make -C tools/perf jevents-py-test The python code deliberately tries to replicate the behavior of jevents.c so that the output matches and transitioning tools shouldn't introduce regressions. In some cases the code isn't as elegant as hoped, but fixing this can be done as follow up. Committer testing: $ make -C tools/perf jevents-py-test make: Entering directory '/var/home/acme/git/perf/tools/perf' BUILD: Doing 'make -j32' parallel build HOSTCC fixdep.o HOSTLD fixdep-in.o LINK fixdep Auto-detecting system features: ... dwarf: [ on ] ... 
dwarf_getlocations: [ on ] ... glibc: [ on ] ... libbfd: [ on ] ... libbfd-buildid: [ on ] ... libcap: [ on ] ... libelf: [ on ] ... libnuma: [ on ] ... numa_num_possible_cpus: [ on ] ... libperl: [ on ] ... libpython: [ on ] ... libcrypto: [ OFF ] ... libunwind: [ on ] ... libdw-dwarf-unwind: [ on ] ... zlib: [ on ] ... lzma: [ on ] ... get_cpuid: [ on ] ... bpf: [ on ] ... libaio: [ on ] ... libzstd: [ on ] ... disassembler-four-args: [ on ] HOSTCC pmu-events/json.o HOSTCC pmu-events/jsmn.o HOSTCC pmu-events/jevents.o HOSTLD pmu-events/jevents-in.o LINK pmu-events/jevents Checking architecture: arm64 Generating using jevents.c Generating using jevents.py Diffing Checking architecture: nds32 Generating using jevents.c Generating using jevents.py Diffing Checking architecture: powerpc Generating using jevents.c Generating using jevents.py Diffing Checking architecture: s390 Generating using jevents.c Generating using jevents.py Diffing Checking architecture: x86 Generating using jevents.c Generating using jevents.py Diffing make: Leaving directory '/var/home/acme/git/perf/tools/perf' $ Signed-off-by: Ian Rogers Tested-by: John Garry Tested-by: Thomas Richter Tested-by: Xing Zhengjun Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Andi Kleen Cc: Andrew Kilroy Cc: Caleb Biggers Cc: Felix Fietkau Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Like Xu Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Nick Forrington Cc: Paul Clarke Cc: Perry Taylor Cc: Peter Zijlstra Cc: Qi Liu Cc: Ravi Bangoria Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Will Deacon Link: https://lore.kernel.org/r/20220629182505.406269-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 6 + tools/perf/pmu-events/jevents-test.sh | 33 +++ tools/perf/pmu-events/jevents.py | 409 ++++++++++++++++++++++++++++++++++ 3 files changed, 448 insertions(+) create mode 100755 tools/perf/pmu-events/jevents-test.sh create mode 100755 tools/perf/pmu-events/jevents.py (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 8f738e11356d..1e29c8936f71 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -669,6 +669,12 @@ $(JEVENTS_IN): FORCE $(JEVENTS): $(JEVENTS_IN) $(QUIET_LINK)$(HOSTCC) $(JEVENTS_IN) -o $@ +JEVENTS_PY := pmu-events/jevents.py +JEVENTS_PY_TEST := pmu-events/jevents-test.sh +.PHONY: jevents-py-test +jevents-py-test: $(JEVENTS) + $(Q)$(call echo-cmd,gen)$(JEVENTS_PY_TEST) $(JEVENTS) $(JEVENTS_PY) pmu-events/arch + $(PMU_EVENTS_IN): $(JEVENTS) FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=pmu-events obj=pmu-events diff --git a/tools/perf/pmu-events/jevents-test.sh b/tools/perf/pmu-events/jevents-test.sh new file mode 100755 index 000000000000..9ae852292576 --- /dev/null +++ b/tools/perf/pmu-events/jevents-test.sh @@ -0,0 +1,33 @@ +#!/bin/sh +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +# Validate that the legacy jevents and jevents.py produce identical output. 
+set -e + +JEVENTS="$1" +JEVENTS_PY="$2" +ARCH_PATH="$3" +JEVENTS_C_GENERATED=$(mktemp /tmp/jevents_c.XXXXX.c) +JEVENTS_PY_GENERATED=$(mktemp /tmp/jevents_py.XXXXX.c) + +cleanup() { + rm "$JEVENTS_C_GENERATED" "$JEVENTS_PY_GENERATED" + trap - exit term int +} +trap cleanup exit term int + +for path in "$ARCH_PATH"/* +do + arch=$(basename $path) + if [ "$arch" = "test" ] + then + continue + fi + echo "Checking architecture: $arch" + echo "Generating using jevents.c" + "$JEVENTS" "$arch" "$ARCH_PATH" "$JEVENTS_C_GENERATED" + echo "Generating using jevents.py" + "$JEVENTS_PY" "$arch" "$ARCH_PATH" "$JEVENTS_PY_GENERATED" + echo "Diffing" + diff -u "$JEVENTS_C_GENERATED" "$JEVENTS_PY_GENERATED" +done +cleanup diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py new file mode 100755 index 000000000000..83e0dcbeac9a --- /dev/null +++ b/tools/perf/pmu-events/jevents.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +"""Convert directories of JSON events to C code.""" +import argparse +import csv +import json +import os +import sys +from typing import Callable +from typing import Sequence + +# Global command line arguments. +_args = None +# List of event tables generated from "/sys" directories. +_sys_event_tables = [] +# Map from an event name to an architecture standard +# JsonEvent. Architecture standard events are in json files in the top +# f'{_args.starting_dir}/{_args.arch}' directory. +_arch_std_events = {} +# Track whether an events table is currently being defined and needs closing. +_close_table = False + + +def removesuffix(s: str, suffix: str) -> str: + """Remove the suffix from a string + + The removesuffix function is added to str in Python 3.9. We aim for 3.6 + compatibility and so provide our own function here. + """ + return s[0:-len(suffix)] if s.endswith(suffix) else s + + +def file_name_to_table_name(parents: Sequence[str], dirname: str) -> str: + """Generate a C table name from directory names.""" + tblname = 'pme' + for p in parents: + tblname += '_' + p + tblname += '_' + dirname + return tblname.replace('-', '_') + + +class JsonEvent: + """Representation of an event loaded from a json file dictionary.""" + + def __init__(self, jd: dict): + """Constructor passed the dictionary of parsed json values.""" + + def llx(x: int) -> str: + """Convert an int to a string similar to a printf modifier of %#llx.""" + return '0' if x == 0 else hex(x) + + def fixdesc(s: str) -> str: + """Fix formatting issue for the desc string.""" + if s is None: + return None + return removesuffix(removesuffix(removesuffix(s, '. '), + '. 
'), '.').replace('\n', '\\n').replace( + '\"', '\\"').replace('\r', '\\r') + + def convert_aggr_mode(aggr_mode: str) -> str: + """Returns the aggr_mode_class enum value associated with the JSON string.""" + if not aggr_mode: + return None + aggr_mode_to_enum = { + 'PerChip': '1', + 'PerCore': '2', + } + return aggr_mode_to_enum[aggr_mode] + + def lookup_msr(num: str) -> str: + """Converts the msr number, or first in a list to the appropriate event field.""" + if not num: + return None + msrmap = { + 0x3F6: 'ldlat=', + 0x1A6: 'offcore_rsp=', + 0x1A7: 'offcore_rsp=', + 0x3F7: 'frontend=', + } + return msrmap[int(num.split(',', 1)[0], 0)] + + def real_event(name: str, event: str) -> str: + """Convert well known event names to an event string otherwise use the event argument.""" + fixed = { + 'inst_retired.any': 'event=0xc0,period=2000003', + 'inst_retired.any_p': 'event=0xc0,period=2000003', + 'cpu_clk_unhalted.ref': 'event=0x0,umask=0x03,period=2000003', + 'cpu_clk_unhalted.thread': 'event=0x3c,period=2000003', + 'cpu_clk_unhalted.core': 'event=0x3c,period=2000003', + 'cpu_clk_unhalted.thread_any': 'event=0x3c,any=1,period=2000003', + } + if not name: + return None + if name.lower() in fixed: + return fixed[name.lower()] + return event + + def unit_to_pmu(unit: str) -> str: + """Convert a JSON Unit to Linux PMU name.""" + if not unit: + return None + # Comment brought over from jevents.c: + # it's not realistic to keep adding these, we need something more scalable ... + table = { + 'CBO': 'uncore_cbox', + 'QPI LL': 'uncore_qpi', + 'SBO': 'uncore_sbox', + 'iMPH-U': 'uncore_arb', + 'CPU-M-CF': 'cpum_cf', + 'CPU-M-SF': 'cpum_sf', + 'UPI LL': 'uncore_upi', + 'hisi_sicl,cpa': 'hisi_sicl,cpa', + 'hisi_sccl,ddrc': 'hisi_sccl,ddrc', + 'hisi_sccl,hha': 'hisi_sccl,hha', + 'hisi_sccl,l3c': 'hisi_sccl,l3c', + 'imx8_ddr': 'imx8_ddr', + 'L3PMC': 'amd_l3', + 'DFPMC': 'amd_df', + 'cpu_core': 'cpu_core', + 'cpu_atom': 'cpu_atom', + } + return table[unit] if unit in table else f'uncore_{unit.lower()}' + + eventcode = 0 + if 'EventCode' in jd: + eventcode = int(jd['EventCode'].split(',', 1)[0], 0) + if 'ExtSel' in jd: + eventcode |= int(jd['ExtSel']) << 8 + configcode = int(jd['ConfigCode'], 0) if 'ConfigCode' in jd else None + self.name = jd['EventName'].lower() if 'EventName' in jd else None + self.compat = jd.get('Compat') + self.desc = fixdesc(jd.get('BriefDescription')) + self.long_desc = fixdesc(jd.get('PublicDescription')) + precise = jd.get('PEBS') + msr = lookup_msr(jd.get('MSRIndex')) + msrval = jd.get('MSRValue') + extra_desc = '' + if 'Data_LA' in jd: + extra_desc += ' Supports address when precise' + if 'Errata' in jd: + extra_desc += '.' 
+ if 'Errata' in jd: + extra_desc += ' Spec update: ' + jd['Errata'] + self.pmu = unit_to_pmu(jd.get('Unit')) + filter = jd.get('Filter') + self.unit = jd.get('ScaleUnit') + self.perpkg = jd.get('PerPkg') + self.aggr_mode = convert_aggr_mode(jd.get('AggregationMode')) + self.deprecated = jd.get('Deprecated') + self.metric_name = jd.get('MetricName') + self.metric_group = jd.get('MetricGroup') + self.metric_constraint = jd.get('MetricConstraint') + self.metric_expr = jd.get('MetricExpr') + if self.metric_expr: + self.metric_expr = self.metric_expr.replace('\\', '\\\\') + arch_std = jd.get('ArchStdEvent') + if precise and self.desc and not '(Precise Event)' in self.desc: + extra_desc += ' (Must be precise)' if precise == '2' else (' (Precise ' + 'event)') + event = f'config={llx(configcode)}' if configcode is not None else f'event={llx(eventcode)}' + event_fields = [ + ('AnyThread', 'any='), + ('PortMask', 'ch_mask='), + ('CounterMask', 'cmask='), + ('EdgeDetect', 'edge='), + ('FCMask', 'fc_mask='), + ('Invert', 'inv='), + ('SampleAfterValue', 'period='), + ('UMask', 'umask='), + ] + for key, value in event_fields: + if key in jd and jd[key] != '0': + event += ',' + value + jd[key] + if filter: + event += f',{filter}' + if msr: + event += f',{msr}{msrval}' + if self.desc and extra_desc: + self.desc += extra_desc + if self.long_desc and extra_desc: + self.long_desc += extra_desc + if self.pmu: + if self.desc and not self.desc.endswith('. '): + self.desc += '. ' + self.desc = (self.desc if self.desc else '') + ('Unit: ' + self.pmu + ' ') + if arch_std and arch_std.lower() in _arch_std_events: + event = _arch_std_events[arch_std.lower()].event + # Copy from the architecture standard event to self for undefined fields. + for attr, value in _arch_std_events[arch_std.lower()].__dict__.items(): + if hasattr(self, attr) and not getattr(self, attr): + setattr(self, attr, value) + + self.event = real_event(self.name, event) + + def __repr__(self) -> str: + """String representation primarily for debugging.""" + s = '{\n' + for attr, value in self.__dict__.items(): + if value: + s += f'\t{attr} = {value},\n' + return s + '}' + + def to_c_string(self, topic_local: str) -> str: + """Representation of the event as a C struct initializer.""" + + def attr_string(attr: str, value: str) -> str: + return '\t.%s = \"%s\",\n' % (attr, value) + + def str_if_present(self, attr: str) -> str: + if not getattr(self, attr): + return '' + return attr_string(attr, getattr(self, attr)) + + s = '{\n' + for attr in ['name', 'event']: + s += str_if_present(self, attr) + if self.desc is not None: + s += attr_string('desc', self.desc) + else: + s += attr_string('desc', '(null)') + s += str_if_present(self, 'compat') + s += f'\t.topic = "{topic_local}",\n' + for attr in [ + 'long_desc', 'pmu', 'unit', 'perpkg', 'aggr_mode', 'metric_expr', + 'metric_name', 'metric_group', 'deprecated', 'metric_constraint' + ]: + s += str_if_present(self, attr) + s += '},\n' + return s + + +def read_json_events(path: str) -> Sequence[JsonEvent]: + """Read json events from the specified file.""" + return json.load(open(path), object_hook=lambda d: JsonEvent(d)) + + +def preprocess_arch_std_files(archpath: str) -> None: + """Read in all architecture standard events.""" + global _arch_std_events + for item in os.scandir(archpath): + if item.is_file() and item.name.endswith('.json'): + for event in read_json_events(item.path): + if event.name: + _arch_std_events[event.name.lower()] = event + + +def print_events_table_prefix(tblname: str) -> None: + 
"""Called when a new events table is started.""" + global _close_table + if _close_table: + raise IOError('Printing table prefix but last table has no suffix') + _args.output_file.write(f'static const struct pmu_event {tblname}[] = {{\n') + _close_table = True + + +def print_events_table_entries(item: os.DirEntry, topic: str) -> None: + """Create contents of an events table.""" + if not _close_table: + raise IOError('Table entries missing prefix') + for event in read_json_events(item.path): + _args.output_file.write(event.to_c_string(topic)) + + +def print_events_table_suffix() -> None: + """Optionally close events table.""" + global _close_table + if _close_table: + _args.output_file.write("""{ +\t.name = 0, +\t.event = 0, +\t.desc = 0, +}, +}; +""") + _close_table = False + + +def process_one_file(parents: Sequence[str], item: os.DirEntry) -> None: + """Process a JSON file during the main walk.""" + global _sys_event_tables + + def get_topic(topic: str) -> str: + return removesuffix(topic, '.json').replace('-', ' ') + + def is_leaf_dir(path: str) -> bool: + for item in os.scandir(path): + if item.is_dir(): + return False + return True + + # model directory, reset topic + if item.is_dir() and is_leaf_dir(item.path): + print_events_table_suffix() + + tblname = file_name_to_table_name(parents, item.name) + if item.name == 'sys': + _sys_event_tables.append(tblname) + print_events_table_prefix(tblname) + return + + # base dir or too deep + level = len(parents) + if level == 0 or level > 4: + return + + # Ignore other directories. If the file name does not have a .json + # extension, ignore it. It could be a readme.txt for instance. + if not item.is_file() or not item.name.endswith('.json'): + return + + print_events_table_entries(item, get_topic(item.name)) + + +def print_mapping_table() -> None: + """Read the mapfile and generate the struct from cpuid string to event table.""" + with open(f'{_args.starting_dir}/{_args.arch}/mapfile.csv') as csvfile: + table = csv.reader(csvfile) + _args.output_file.write( + 'const struct pmu_events_map pmu_events_map[] = {\n') + first = True + for row in table: + # Skip the first row or any row beginning with #. 
+ if not first and len(row) > 0 and not row[0].startswith('#'): + tblname = file_name_to_table_name([], row[2].replace('/', '_')) + _args.output_file.write("""{ +\t.cpuid = \"%s\", +\t.version = \"%s\", +\t.type = \"%s\", +\t.table = %s +}, +""" % (row[0].replace('\\', '\\\\'), row[1], row[3], tblname)) + first = False + + _args.output_file.write("""{ +\t.cpuid = "testcpu", +\t.version = "v1", +\t.type = "core", +\t.table = pme_test_soc_cpu, +}, +{ +\t.cpuid = 0, +\t.version = 0, +\t.type = 0, +\t.table = 0, +}, +}; +""") + + +def print_system_mapping_table() -> None: + """C struct mapping table array for tables from /sys directories.""" + _args.output_file.write( + '\nconst struct pmu_sys_events pmu_sys_event_tables[] = {\n') + for tblname in _sys_event_tables: + _args.output_file.write(f"""\t{{ +\t\t.table = {tblname}, +\t\t.name = \"{tblname}\", +\t}}, +""") + _args.output_file.write("""\t{ +\t\t.table = 0 +\t}, +}; +""") + + +def main() -> None: + global _args + + def dir_path(path: str) -> str: + """Validate path is a directory for argparse.""" + if os.path.isdir(path): + return path + raise argparse.ArgumentTypeError(f'\'{path}\' is not a valid directory') + + def ftw(path: str, parents: Sequence[str], + action: Callable[[Sequence[str], os.DirEntry], None]) -> None: + """Replicate the directory/file walking behavior of C's file tree walk.""" + for item in os.scandir(path): + action(parents, item) + if item.is_dir(): + ftw(item.path, parents + [item.name], action) + + ap = argparse.ArgumentParser() + ap.add_argument('arch', help='Architecture name like x86') + ap.add_argument( + 'starting_dir', + type=dir_path, + help='Root of tree containing architecture directories containing json files' + ) + ap.add_argument( + 'output_file', type=argparse.FileType('w'), nargs='?', default=sys.stdout) + _args = ap.parse_args() + + _args.output_file.write("#include \"pmu-events/pmu-events.h\"\n") + for path in [_args.arch, 'test']: + arch_path = f'{_args.starting_dir}/{path}' + if not os.path.isdir(arch_path): + raise IOError(f'Missing architecture directory in \'{arch_path}\'') + preprocess_arch_std_files(arch_path) + ftw(arch_path, [], process_one_file) + print_events_table_suffix() + + print_mapping_table() + print_system_mapping_table() + + +if __name__ == '__main__': + main() -- cgit v1.2.3 From 00facc760903be6675870c2749e2cd72140e396e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Jun 2022 11:25:04 -0700 Subject: perf jevents: Switch build to use jevents.py Generate pmu-events.c using jevents.py rather than the binary built from jevents.c. Add a new config variable NO_JEVENTS that is set when there is no architecture json or an appropriate python interpreter isn't present. When NO_JEVENTS is defined the file pmu-events/empty-pmu-events.c is copied and used as the pmu-events.c file. 
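A minimal sketch of exercising both build paths, assuming a kernel source
tree with python3 on the PATH (Makefile.perf exports NO_JEVENTS, so the
fallback can also be forced on the make command line; the x86 argument is
only an example):

    # Normal path: requires pmu-events/arch/$SRCARCH/mapfile.csv and a
    # Python >= 3.6 interpreter; pmu-events.c is then generated roughly as:
    #   python3 pmu-events/jevents.py x86 pmu-events/arch pmu-events.c
    make -C tools/perf

    # Forced fallback: pmu-events.c is a plain copy of empty-pmu-events.c,
    # so only the test cpu/soc tables are compiled in.
    make -C tools/perf NO_JEVENTS=1

Either way a pmu-events.c is produced and linked into perf; the fallback
merely limits the event tables to the built-in test entries.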
Signed-off-by: Ian Rogers Tested-by: John Garry Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Andi Kleen Cc: Andrew Kilroy Cc: Caleb Biggers Cc: Felix Fietkau Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Like Xu Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Nick Forrington Cc: Paul Clarke Cc: Perry Taylor Cc: Peter Zijlstra Cc: Qi Liu Cc: Ravi Bangoria Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Will Deacon Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20220629182505.406269-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 19 ++++ tools/perf/Makefile.perf | 1 + tools/perf/pmu-events/Build | 13 ++- tools/perf/pmu-events/empty-pmu-events.c | 158 +++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 tools/perf/pmu-events/empty-pmu-events.c (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 16c1a87444b8..153c18909ff5 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -889,6 +889,25 @@ else endif endif +ifneq ($(NO_JEVENTS),1) + ifeq ($(wildcard pmu-events/arch/$(SRCARCH)/mapfile.csv),) + NO_JEVENTS := 1 + endif +endif +ifneq ($(NO_JEVENTS),1) + NO_JEVENTS := 0 + ifndef PYTHON + $(warning No python interpreter disabling jevent generation) + NO_JEVENTS := 1 + else + # jevents.py uses f-strings present in Python 3.6 released in Dec. 2016. + JEVENTS_PYTHON_GOOD := $(shell $(PYTHON) -c 'import sys;print("1" if(sys.version_info.major >= 3 and sys.version_info.minor >= 6) else "0")' 2> /dev/null) + ifneq ($(JEVENTS_PYTHON_GOOD), 1) + $(warning Python interpreter too old (older than 3.6) disabling jevent generation) + NO_JEVENTS := 1 + endif + endif +endif ifndef NO_LIBBFD ifeq ($(feature-libbfd), 1) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 1e29c8936f71..dc6b177ac1de 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -655,6 +655,7 @@ JEVENTS := $(OUTPUT)pmu-events/jevents JEVENTS_IN := $(OUTPUT)pmu-events/jevents-in.o PMU_EVENTS_IN := $(OUTPUT)pmu-events/pmu-events-in.o +export NO_JEVENTS export JEVENTS diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index a055dee6a46a..5ec5ce8c31ba 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -9,10 +9,19 @@ JSON = $(shell [ -d $(JDIR) ] && \ JDIR_TEST = pmu-events/arch/test JSON_TEST = $(shell [ -d $(JDIR_TEST) ] && \ find $(JDIR_TEST) -name '*.json') +JEVENTS_PY = pmu-events/jevents.py # # Locate/process JSON files in pmu-events/arch/ # directory and create tables in pmu-events.c. 
# -$(OUTPUT)pmu-events/pmu-events.c: $(JSON) $(JSON_TEST) $(JEVENTS) - $(Q)$(call echo-cmd,gen)$(JEVENTS) $(SRCARCH) pmu-events/arch $(OUTPUT)pmu-events/pmu-events.c $(V) + +ifeq ($(NO_JEVENTS),1) +$(OUTPUT)pmu-events/pmu-events.c: pmu-events/empty-pmu-events.c + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)cp $< $@ +else +$(OUTPUT)pmu-events/pmu-events.c: $(JSON) $(JSON_TEST) $(JEVENTS_PY) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(SRCARCH) pmu-events/arch $@ +endif diff --git a/tools/perf/pmu-events/empty-pmu-events.c b/tools/perf/pmu-events/empty-pmu-events.c new file mode 100644 index 000000000000..77e655c6f116 --- /dev/null +++ b/tools/perf/pmu-events/empty-pmu-events.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * An empty pmu-events.c file used when there is no architecture json files in + * arch or when the jevents.py script cannot be run. + * + * The test cpu/soc is provided for testing. + */ +#include "pmu-events/pmu-events.h" + +static const struct pmu_event pme_test_soc_cpu[] = { + { + .name = "l3_cache_rd", + .event = "event=0x40", + .desc = "L3 cache access, read", + .topic = "cache", + .long_desc = "Attributable Level 3 cache access, read", + }, + { + .name = "segment_reg_loads.any", + .event = "event=0x6,period=200000,umask=0x80", + .desc = "Number of segment register loads", + .topic = "other", + }, + { + .name = "dispatch_blocked.any", + .event = "event=0x9,period=200000,umask=0x20", + .desc = "Memory cluster signals to block micro-op dispatch for any reason", + .topic = "other", + }, + { + .name = "eist_trans", + .event = "event=0x3a,period=200000,umask=0x0", + .desc = "Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions", + .topic = "other", + }, + { + .name = "uncore_hisi_ddrc.flux_wcmd", + .event = "event=0x2", + .desc = "DDRC write commands. Unit: hisi_sccl,ddrc ", + .topic = "uncore", + .long_desc = "DDRC write commands", + .pmu = "hisi_sccl,ddrc", + }, + { + .name = "unc_cbo_xsnp_response.miss_eviction", + .event = "event=0x22,umask=0x81", + .desc = "A cross-core snoop resulted from L3 Eviction which misses in some processor core. Unit: uncore_cbox ", + .topic = "uncore", + .long_desc = "A cross-core snoop resulted from L3 Eviction which misses in some processor core", + .pmu = "uncore_cbox", + }, + { + .name = "event-hyphen", + .event = "event=0xe0,umask=0x00", + .desc = "UNC_CBO_HYPHEN. Unit: uncore_cbox ", + .topic = "uncore", + .long_desc = "UNC_CBO_HYPHEN", + .pmu = "uncore_cbox", + }, + { + .name = "event-two-hyph", + .event = "event=0xc0,umask=0x00", + .desc = "UNC_CBO_TWO_HYPH. Unit: uncore_cbox ", + .topic = "uncore", + .long_desc = "UNC_CBO_TWO_HYPH", + .pmu = "uncore_cbox", + }, + { + .name = "uncore_hisi_l3c.rd_hit_cpipe", + .event = "event=0x7", + .desc = "Total read hits. Unit: hisi_sccl,l3c ", + .topic = "uncore", + .long_desc = "Total read hits", + .pmu = "hisi_sccl,l3c", + }, + { + .name = "uncore_imc_free_running.cache_miss", + .event = "event=0x12", + .desc = "Total cache misses. Unit: uncore_imc_free_running ", + .topic = "uncore", + .long_desc = "Total cache misses", + .pmu = "uncore_imc_free_running", + }, + { + .name = "uncore_imc.cache_hits", + .event = "event=0x34", + .desc = "Total cache hits. 
Unit: uncore_imc ", + .topic = "uncore", + .long_desc = "Total cache hits", + .pmu = "uncore_imc", + }, + { + .name = "bp_l1_btb_correct", + .event = "event=0x8a", + .desc = "L1 BTB Correction", + .topic = "branch", + }, + { + .name = "bp_l2_btb_correct", + .event = "event=0x8b", + .desc = "L2 BTB Correction", + .topic = "branch", + }, + { + .name = 0, + .event = 0, + .desc = 0, + }, +}; + +const struct pmu_events_map pmu_events_map[] = { + { + .cpuid = "testcpu", + .version = "v1", + .type = "core", + .table = pme_test_soc_cpu, + }, + { + .cpuid = 0, + .version = 0, + .type = 0, + .table = 0, + }, +}; + +static const struct pmu_event pme_test_soc_sys[] = { + { + .name = "sys_ddr_pmu.write_cycles", + .event = "event=0x2b", + .desc = "ddr write-cycles event. Unit: uncore_sys_ddr_pmu ", + .compat = "v8", + .topic = "uncore", + .pmu = "uncore_sys_ddr_pmu", + }, + { + .name = "sys_ccn_pmu.read_cycles", + .event = "config=0x2c", + .desc = "ccn read-cycles event. Unit: uncore_sys_ccn_pmu ", + .compat = "0x01", + .topic = "uncore", + .pmu = "uncore_sys_ccn_pmu", + }, + { + .name = 0, + .event = 0, + .desc = 0, + }, +}; + +const struct pmu_sys_events pmu_sys_event_tables[] = { + { + .table = pme_test_soc_sys, + .name = "pme_test_soc_sys", + }, + { + .table = 0 + }, +}; -- cgit v1.2.3 From 5a059790afe8a6e11dbe45475eb45401ec9937ea Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Jun 2022 11:25:05 -0700 Subject: perf jevents: Remove jevents.c Remove files and build rules. Remove test for comparing with jevents.py as there is no longer a binary to compare with. Signed-off-by: Ian Rogers Tested-by: John Garry Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Andi Kleen Cc: Andrew Kilroy Cc: Caleb Biggers Cc: Felix Fietkau Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Kshipra Bopardikar Cc: Like Xu Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Nick Forrington Cc: Paul Clarke Cc: Perry Taylor Cc: Peter Zijlstra Cc: Qi Liu Cc: Ravi Bangoria Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Will Deacon Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20220629182505.406269-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 21 +- tools/perf/pmu-events/Build | 4 - tools/perf/pmu-events/jevents-test.sh | 33 - tools/perf/pmu-events/jevents.c | 1342 --------------------------------- tools/perf/pmu-events/jsmn.c | 352 --------- tools/perf/pmu-events/jsmn.h | 68 -- tools/perf/pmu-events/json.c | 162 ---- tools/perf/pmu-events/json.h | 39 - 8 files changed, 2 insertions(+), 2019 deletions(-) delete mode 100755 tools/perf/pmu-events/jevents-test.sh delete mode 100644 tools/perf/pmu-events/jevents.c delete mode 100644 tools/perf/pmu-events/jsmn.c delete mode 100644 tools/perf/pmu-events/jsmn.h delete mode 100644 tools/perf/pmu-events/json.c delete mode 100644 tools/perf/pmu-events/json.h (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index dc6b177ac1de..8f0b1fb39984 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -651,32 +651,15 @@ strip: $(PROGRAMS) $(OUTPUT)perf PERF_IN := $(OUTPUT)perf-in.o -JEVENTS := $(OUTPUT)pmu-events/jevents -JEVENTS_IN := $(OUTPUT)pmu-events/jevents-in.o - PMU_EVENTS_IN := $(OUTPUT)pmu-events/pmu-events-in.o export NO_JEVENTS -export JEVENTS - build := -f $(srctree)/tools/build/Makefile.build dir=. 
obj $(PERF_IN): prepare FORCE $(Q)$(MAKE) $(build)=perf -$(JEVENTS_IN): FORCE - $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=pmu-events obj=jevents - -$(JEVENTS): $(JEVENTS_IN) - $(QUIET_LINK)$(HOSTCC) $(JEVENTS_IN) -o $@ - -JEVENTS_PY := pmu-events/jevents.py -JEVENTS_PY_TEST := pmu-events/jevents-test.sh -.PHONY: jevents-py-test -jevents-py-test: $(JEVENTS) - $(Q)$(call echo-cmd,gen)$(JEVENTS_PY_TEST) $(JEVENTS) $(JEVENTS_PY) pmu-events/arch - -$(PMU_EVENTS_IN): $(JEVENTS) FORCE +$(PMU_EVENTS_IN): FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=pmu-events obj=pmu-events $(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) $(LIBTRACEEVENT_DYNAMIC_LIST) @@ -1096,7 +1079,7 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-iostat $(LANG_BINDINGS) $(Q)find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected - $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so + $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)$(LIBJVMTI).so $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \ $(OUTPUT)util/intel-pt-decoder/inat-tables.c \ $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index 5ec5ce8c31ba..28a9d01b08af 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -1,7 +1,3 @@ -hostprogs := jevents - -jevents-y += json.o jsmn.o jevents.o -HOSTCFLAGS_jevents.o = -I$(srctree)/tools/include pmu-events-y += pmu-events.o JDIR = pmu-events/arch/$(SRCARCH) JSON = $(shell [ -d $(JDIR) ] && \ diff --git a/tools/perf/pmu-events/jevents-test.sh b/tools/perf/pmu-events/jevents-test.sh deleted file mode 100755 index 9ae852292576..000000000000 --- a/tools/perf/pmu-events/jevents-test.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -# Validate that the legacy jevents and jevents.py produce identical output. -set -e - -JEVENTS="$1" -JEVENTS_PY="$2" -ARCH_PATH="$3" -JEVENTS_C_GENERATED=$(mktemp /tmp/jevents_c.XXXXX.c) -JEVENTS_PY_GENERATED=$(mktemp /tmp/jevents_py.XXXXX.c) - -cleanup() { - rm "$JEVENTS_C_GENERATED" "$JEVENTS_PY_GENERATED" - trap - exit term int -} -trap cleanup exit term int - -for path in "$ARCH_PATH"/* -do - arch=$(basename $path) - if [ "$arch" = "test" ] - then - continue - fi - echo "Checking architecture: $arch" - echo "Generating using jevents.c" - "$JEVENTS" "$arch" "$ARCH_PATH" "$JEVENTS_C_GENERATED" - echo "Generating using jevents.py" - "$JEVENTS_PY" "$arch" "$ARCH_PATH" "$JEVENTS_PY_GENERATED" - echo "Diffing" - diff -u "$JEVENTS_C_GENERATED" "$JEVENTS_PY_GENERATED" -done -cleanup diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c deleted file mode 100644 index e597e4bac90f..000000000000 --- a/tools/perf/pmu-events/jevents.c +++ /dev/null @@ -1,1342 +0,0 @@ -#define _XOPEN_SOURCE 500 /* needed for nftw() */ -#define _GNU_SOURCE /* needed for asprintf() */ - -/* Parse event JSON files */ - -/* - * Copyright (c) 2014, Intel Corporation - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* getrlimit */ -#include /* getrlimit */ -#include -#include -#include -#include -#include "jsmn.h" -#include "json.h" -#include "pmu-events.h" - -int verbose; -char *prog; - -struct json_event { - char *name; - char *compat; - char *event; - char *desc; - char *long_desc; - char *pmu; - char *unit; - char *perpkg; - char *aggr_mode; - char *metric_expr; - char *metric_name; - char *metric_group; - char *deprecated; - char *metric_constraint; -}; - -static enum aggr_mode_class convert(const char *aggr_mode) -{ - if (!strcmp(aggr_mode, "PerCore")) - return PerCore; - else if (!strcmp(aggr_mode, "PerChip")) - return PerChip; - - pr_err("%s: Wrong AggregationMode value '%s'\n", prog, aggr_mode); - return -1; -} - -static LIST_HEAD(sys_event_tables); - -struct sys_event_table { - struct list_head list; - char *soc_id; -}; - -static void free_sys_event_tables(void) -{ - struct sys_event_table *et, *next; - - list_for_each_entry_safe(et, next, &sys_event_tables, list) { - free(et->soc_id); - free(et); - } -} - -int eprintf(int level, int var, const char *fmt, ...) -{ - - int ret; - va_list args; - - if (var < level) - return 0; - - va_start(args, fmt); - - ret = vfprintf(stderr, fmt, args); - - va_end(args); - - return ret; -} - -static void addfield(char *map, char **dst, const char *sep, - const char *a, jsmntok_t *bt) -{ - unsigned int len = strlen(a) + 1 + strlen(sep); - int olen = *dst ? strlen(*dst) : 0; - int blen = bt ? json_len(bt) : 0; - char *out; - - out = realloc(*dst, len + olen + blen); - if (!out) { - /* Don't add field in this case */ - return; - } - *dst = out; - - if (!olen) - *(*dst) = 0; - else - strcat(*dst, sep); - strcat(*dst, a); - if (bt) - strncat(*dst, map + bt->start, blen); -} - -static void fixname(char *s) -{ - for (; *s; s++) - *s = tolower(*s); -} - -static void fixdesc(char *s) -{ - char *e = s + strlen(s); - - /* Remove trailing dots that look ugly in perf list */ - --e; - while (e >= s && isspace(*e)) - --e; - if (*e == '.') - *e = 0; -} - -/* Add escapes for '\' so they are proper C strings. 
*/ -static char *fixregex(char *s) -{ - int len = 0; - int esc_count = 0; - char *fixed = NULL; - char *p, *q; - - /* Count the number of '\' in string */ - for (p = s; *p; p++) { - ++len; - if (*p == '\\') - ++esc_count; - } - - if (esc_count == 0) - return s; - - /* allocate space for a new string */ - fixed = (char *) malloc(len + esc_count + 1); - if (!fixed) - return NULL; - - /* copy over the characters */ - q = fixed; - for (p = s; *p; p++) { - if (*p == '\\') { - *q = '\\'; - ++q; - } - *q = *p; - ++q; - } - *q = '\0'; - return fixed; -} - -static struct msrmap { - const char *num; - const char *pname; -} msrmap[] = { - { "0x3F6", "ldlat=" }, - { "0x1A6", "offcore_rsp=" }, - { "0x1A7", "offcore_rsp=" }, - { "0x3F7", "frontend=" }, - { NULL, NULL } -}; - -static void cut_comma(char *map, jsmntok_t *newval) -{ - int i; - - /* Cut off everything after comma */ - for (i = newval->start; i < newval->end; i++) { - if (map[i] == ',') - newval->end = i; - } -} - -static struct msrmap *lookup_msr(char *map, jsmntok_t *val) -{ - jsmntok_t newval = *val; - static bool warned; - int i; - - cut_comma(map, &newval); - for (i = 0; msrmap[i].num; i++) - if (json_streq(map, &newval, msrmap[i].num)) - return &msrmap[i]; - if (!warned) { - warned = true; - pr_err("%s: Unknown MSR in event file %.*s\n", prog, - json_len(val), map + val->start); - } - return NULL; -} - -static struct map { - const char *json; - const char *perf; -} unit_to_pmu[] = { - { "CBO", "uncore_cbox" }, - { "QPI LL", "uncore_qpi" }, - { "SBO", "uncore_sbox" }, - { "iMPH-U", "uncore_arb" }, - { "CPU-M-CF", "cpum_cf" }, - { "CPU-M-SF", "cpum_sf" }, - { "UPI LL", "uncore_upi" }, - { "hisi_sicl,cpa", "hisi_sicl,cpa"}, - { "hisi_sccl,ddrc", "hisi_sccl,ddrc" }, - { "hisi_sccl,hha", "hisi_sccl,hha" }, - { "hisi_sccl,l3c", "hisi_sccl,l3c" }, - /* it's not realistic to keep adding these, we need something more scalable ... 
*/ - { "imx8_ddr", "imx8_ddr" }, - { "L3PMC", "amd_l3" }, - { "DFPMC", "amd_df" }, - { "cpu_core", "cpu_core" }, - { "cpu_atom", "cpu_atom" }, - {} -}; - -static const char *field_to_perf(struct map *table, char *map, jsmntok_t *val) -{ - int i; - - for (i = 0; table[i].json; i++) { - if (json_streq(map, val, table[i].json)) - return table[i].perf; - } - return NULL; -} - -#define EXPECT(e, t, m) do { if (!(e)) { \ - jsmntok_t *loc = (t); \ - if (!(t)->start && (t) > tokens) \ - loc = (t) - 1; \ - pr_err("%s:%d: " m ", got %s\n", fn, \ - json_line(map, loc), \ - json_name(t)); \ - err = -EIO; \ - goto out_free; \ -} } while (0) - -static char *topic; - -static char *get_topic(void) -{ - char *tp; - int i; - - /* tp is free'd in process_one_file() */ - i = asprintf(&tp, "%s", topic); - if (i < 0) { - pr_info("%s: asprintf() error %s\n", prog); - return NULL; - } - - for (i = 0; i < (int) strlen(tp); i++) { - char c = tp[i]; - - if (c == '-') - tp[i] = ' '; - else if (c == '.') { - tp[i] = '\0'; - break; - } - } - - return tp; -} - -static int add_topic(char *bname) -{ - free(topic); - topic = strdup(bname); - if (!topic) { - pr_info("%s: strdup() error %s for file %s\n", prog, - strerror(errno), bname); - return -ENOMEM; - } - return 0; -} - -struct perf_entry_data { - FILE *outfp; - char *topic; -}; - -static int close_table; - -static void print_events_table_prefix(FILE *fp, const char *tblname) -{ - fprintf(fp, "static const struct pmu_event %s[] = {\n", tblname); - close_table = 1; -} - -static int print_events_table_entry(void *data, struct json_event *je) -{ - struct perf_entry_data *pd = data; - FILE *outfp = pd->outfp; - char *topic_local = pd->topic; - - /* - * TODO: Remove formatting chars after debugging to reduce - * string lengths. - */ - fprintf(outfp, "{\n"); - - if (je->name) - fprintf(outfp, "\t.name = \"%s\",\n", je->name); - if (je->event) - fprintf(outfp, "\t.event = \"%s\",\n", je->event); - fprintf(outfp, "\t.desc = \"%s\",\n", je->desc); - if (je->compat) - fprintf(outfp, "\t.compat = \"%s\",\n", je->compat); - fprintf(outfp, "\t.topic = \"%s\",\n", topic_local); - if (je->long_desc && je->long_desc[0]) - fprintf(outfp, "\t.long_desc = \"%s\",\n", je->long_desc); - if (je->pmu) - fprintf(outfp, "\t.pmu = \"%s\",\n", je->pmu); - if (je->unit) - fprintf(outfp, "\t.unit = \"%s\",\n", je->unit); - if (je->perpkg) - fprintf(outfp, "\t.perpkg = \"%s\",\n", je->perpkg); - if (je->aggr_mode) - fprintf(outfp, "\t.aggr_mode = \"%d\",\n", convert(je->aggr_mode)); - if (je->metric_expr) - fprintf(outfp, "\t.metric_expr = \"%s\",\n", je->metric_expr); - if (je->metric_name) - fprintf(outfp, "\t.metric_name = \"%s\",\n", je->metric_name); - if (je->metric_group) - fprintf(outfp, "\t.metric_group = \"%s\",\n", je->metric_group); - if (je->deprecated) - fprintf(outfp, "\t.deprecated = \"%s\",\n", je->deprecated); - if (je->metric_constraint) - fprintf(outfp, "\t.metric_constraint = \"%s\",\n", je->metric_constraint); - fprintf(outfp, "},\n"); - - return 0; -} - -struct event_struct { - struct list_head list; - char *name; - char *event; - char *compat; - char *desc; - char *long_desc; - char *pmu; - char *unit; - char *perpkg; - char *aggr_mode; - char *metric_expr; - char *metric_name; - char *metric_group; - char *deprecated; - char *metric_constraint; -}; - -#define ADD_EVENT_FIELD(field) do { if (je->field) { \ - es->field = strdup(je->field); \ - if (!es->field) \ - goto out_free; \ -} } while (0) - -#define FREE_EVENT_FIELD(field) free(es->field) - -#define 
TRY_FIXUP_FIELD(field) do { if (es->field && !je->field) {\ - je->field = strdup(es->field); \ - if (!je->field) \ - return -ENOMEM; \ -} } while (0) - -#define FOR_ALL_EVENT_STRUCT_FIELDS(op) do { \ - op(name); \ - op(event); \ - op(desc); \ - op(long_desc); \ - op(pmu); \ - op(unit); \ - op(perpkg); \ - op(aggr_mode); \ - op(metric_expr); \ - op(metric_name); \ - op(metric_group); \ - op(deprecated); \ -} while (0) - -static LIST_HEAD(arch_std_events); - -static void free_arch_std_events(void) -{ - struct event_struct *es, *next; - - list_for_each_entry_safe(es, next, &arch_std_events, list) { - FOR_ALL_EVENT_STRUCT_FIELDS(FREE_EVENT_FIELD); - list_del_init(&es->list); - free(es); - } -} - -static int save_arch_std_events(void *data __maybe_unused, struct json_event *je) -{ - struct event_struct *es; - - es = malloc(sizeof(*es)); - if (!es) - return -ENOMEM; - memset(es, 0, sizeof(*es)); - FOR_ALL_EVENT_STRUCT_FIELDS(ADD_EVENT_FIELD); - list_add_tail(&es->list, &arch_std_events); - return 0; -out_free: - FOR_ALL_EVENT_STRUCT_FIELDS(FREE_EVENT_FIELD); - free(es); - return -ENOMEM; -} - -static void print_events_table_suffix(FILE *outfp) -{ - fprintf(outfp, "{\n"); - - fprintf(outfp, "\t.name = 0,\n"); - fprintf(outfp, "\t.event = 0,\n"); - fprintf(outfp, "\t.desc = 0,\n"); - - fprintf(outfp, "},\n"); - fprintf(outfp, "};\n"); - close_table = 0; -} - -static struct fixed { - const char *name; - const char *event; -} fixed[] = { - { "inst_retired.any", "event=0xc0,period=2000003" }, - { "inst_retired.any_p", "event=0xc0,period=2000003" }, - { "cpu_clk_unhalted.ref", "event=0x0,umask=0x03,period=2000003" }, - { "cpu_clk_unhalted.thread", "event=0x3c,period=2000003" }, - { "cpu_clk_unhalted.core", "event=0x3c,period=2000003" }, - { "cpu_clk_unhalted.thread_any", "event=0x3c,any=1,period=2000003" }, - { NULL, NULL}, -}; - -/* - * Handle different fixed counter encodings between JSON and perf. 
- */ -static char *real_event(const char *name, char *event) -{ - int i; - - if (!name) - return NULL; - - for (i = 0; fixed[i].name; i++) - if (!strcasecmp(name, fixed[i].name)) - return (char *)fixed[i].event; - return event; -} - -static int -try_fixup(const char *fn, char *arch_std, struct json_event *je, char **event) -{ - /* try to find matching event from arch standard values */ - struct event_struct *es; - - list_for_each_entry(es, &arch_std_events, list) { - if (!strcmp(arch_std, es->name)) { - FOR_ALL_EVENT_STRUCT_FIELDS(TRY_FIXUP_FIELD); - *event = je->event; - return 0; - } - } - - pr_err("%s: could not find matching %s for %s\n", - prog, arch_std, fn); - return -1; -} - -/* Call func with each event in the json file */ -static int json_events(const char *fn, - int (*func)(void *data, struct json_event *je), - void *data) -{ - int err; - size_t size; - jsmntok_t *tokens, *tok; - int i, j, len; - char *map; - char buf[128]; - - if (!fn) - return -ENOENT; - - tokens = parse_json(fn, &map, &size, &len); - if (!tokens) - return -EIO; - EXPECT(tokens->type == JSMN_ARRAY, tokens, "expected top level array"); - tok = tokens + 1; - for (i = 0; i < tokens->size; i++) { - char *event = NULL; - char *extra_desc = NULL; - char *filter = NULL; - struct json_event je = {}; - char *arch_std = NULL; - unsigned long long eventcode = 0; - unsigned long long configcode = 0; - struct msrmap *msr = NULL; - jsmntok_t *msrval = NULL; - jsmntok_t *precise = NULL; - jsmntok_t *obj = tok++; - bool configcode_present = false; - char *umask = NULL; - char *cmask = NULL; - char *inv = NULL; - char *any = NULL; - char *edge = NULL; - char *period = NULL; - char *fc_mask = NULL; - char *ch_mask = NULL; - - EXPECT(obj->type == JSMN_OBJECT, obj, "expected object"); - for (j = 0; j < obj->size; j += 2) { - jsmntok_t *field, *val; - int nz; - char *s; - - field = tok + j; - EXPECT(field->type == JSMN_STRING, tok + j, - "Expected field name"); - val = tok + j + 1; - EXPECT(val->type == JSMN_STRING, tok + j + 1, - "Expected string value"); - - nz = !json_streq(map, val, "0"); - /* match_field */ - if (json_streq(map, field, "UMask") && nz) { - addfield(map, &umask, "", "umask=", val); - } else if (json_streq(map, field, "CounterMask") && nz) { - addfield(map, &cmask, "", "cmask=", val); - } else if (json_streq(map, field, "Invert") && nz) { - addfield(map, &inv, "", "inv=", val); - } else if (json_streq(map, field, "AnyThread") && nz) { - addfield(map, &any, "", "any=", val); - } else if (json_streq(map, field, "EdgeDetect") && nz) { - addfield(map, &edge, "", "edge=", val); - } else if (json_streq(map, field, "SampleAfterValue") && nz) { - addfield(map, &period, "", "period=", val); - } else if (json_streq(map, field, "FCMask") && nz) { - addfield(map, &fc_mask, "", "fc_mask=", val); - } else if (json_streq(map, field, "PortMask") && nz) { - addfield(map, &ch_mask, "", "ch_mask=", val); - } else if (json_streq(map, field, "EventCode")) { - char *code = NULL; - addfield(map, &code, "", "", val); - eventcode |= strtoul(code, NULL, 0); - free(code); - } else if (json_streq(map, field, "ConfigCode")) { - char *code = NULL; - addfield(map, &code, "", "", val); - configcode |= strtoul(code, NULL, 0); - free(code); - configcode_present = true; - } else if (json_streq(map, field, "ExtSel")) { - char *code = NULL; - addfield(map, &code, "", "", val); - eventcode |= strtoul(code, NULL, 0) << 8; - free(code); - } else if (json_streq(map, field, "EventName")) { - addfield(map, &je.name, "", "", val); - } else if 
(json_streq(map, field, "Compat")) { - addfield(map, &je.compat, "", "", val); - } else if (json_streq(map, field, "BriefDescription")) { - addfield(map, &je.desc, "", "", val); - fixdesc(je.desc); - } else if (json_streq(map, field, - "PublicDescription")) { - addfield(map, &je.long_desc, "", "", val); - fixdesc(je.long_desc); - } else if (json_streq(map, field, "PEBS") && nz) { - precise = val; - } else if (json_streq(map, field, "MSRIndex") && nz) { - msr = lookup_msr(map, val); - } else if (json_streq(map, field, "MSRValue")) { - msrval = val; - } else if (json_streq(map, field, "Errata") && - !json_streq(map, val, "null")) { - addfield(map, &extra_desc, ". ", - " Spec update: ", val); - } else if (json_streq(map, field, "Data_LA") && nz) { - addfield(map, &extra_desc, ". ", - " Supports address when precise", - NULL); - } else if (json_streq(map, field, "Unit")) { - const char *ppmu; - - ppmu = field_to_perf(unit_to_pmu, map, val); - if (ppmu) { - je.pmu = strdup(ppmu); - } else { - if (!je.pmu) - je.pmu = strdup("uncore_"); - addfield(map, &je.pmu, "", "", val); - for (s = je.pmu; *s; s++) - *s = tolower(*s); - } - } else if (json_streq(map, field, "Filter")) { - addfield(map, &filter, "", "", val); - } else if (json_streq(map, field, "ScaleUnit")) { - addfield(map, &je.unit, "", "", val); - } else if (json_streq(map, field, "PerPkg")) { - addfield(map, &je.perpkg, "", "", val); - } else if (json_streq(map, field, "AggregationMode")) { - addfield(map, &je.aggr_mode, "", "", val); - } else if (json_streq(map, field, "Deprecated")) { - addfield(map, &je.deprecated, "", "", val); - } else if (json_streq(map, field, "MetricName")) { - addfield(map, &je.metric_name, "", "", val); - } else if (json_streq(map, field, "MetricGroup")) { - addfield(map, &je.metric_group, "", "", val); - } else if (json_streq(map, field, "MetricConstraint")) { - addfield(map, &je.metric_constraint, "", "", val); - } else if (json_streq(map, field, "MetricExpr")) { - addfield(map, &je.metric_expr, "", "", val); - } else if (json_streq(map, field, "ArchStdEvent")) { - addfield(map, &arch_std, "", "", val); - for (s = arch_std; *s; s++) - *s = tolower(*s); - } - /* ignore unknown fields */ - } - if (precise && je.desc && !strstr(je.desc, "(Precise Event)")) { - if (json_streq(map, precise, "2")) - addfield(map, &extra_desc, " ", - "(Must be precise)", NULL); - else - addfield(map, &extra_desc, " ", - "(Precise event)", NULL); - } - if (configcode_present) - snprintf(buf, sizeof buf, "config=%#llx", configcode); - else - snprintf(buf, sizeof buf, "event=%#llx", eventcode); - addfield(map, &event, ",", buf, NULL); - if (any) - addfield(map, &event, ",", any, NULL); - if (ch_mask) - addfield(map, &event, ",", ch_mask, NULL); - if (cmask) - addfield(map, &event, ",", cmask, NULL); - if (edge) - addfield(map, &event, ",", edge, NULL); - if (fc_mask) - addfield(map, &event, ",", fc_mask, NULL); - if (inv) - addfield(map, &event, ",", inv, NULL); - if (period) - addfield(map, &event, ",", period, NULL); - if (umask) - addfield(map, &event, ",", umask, NULL); - - if (je.desc && extra_desc) - addfield(map, &je.desc, " ", extra_desc, NULL); - if (je.long_desc && extra_desc) - addfield(map, &je.long_desc, " ", extra_desc, NULL); - if (je.pmu) { - addfield(map, &je.desc, ". 
", "Unit: ", NULL); - addfield(map, &je.desc, "", je.pmu, NULL); - addfield(map, &je.desc, "", " ", NULL); - } - if (filter) - addfield(map, &event, ",", filter, NULL); - if (msr != NULL) - addfield(map, &event, ",", msr->pname, msrval); - if (je.name) - fixname(je.name); - - if (arch_std) { - /* - * An arch standard event is referenced, so try to - * fixup any unassigned values. - */ - err = try_fixup(fn, arch_std, &je, &event); - if (err) - goto free_strings; - } - je.event = real_event(je.name, event); - err = func(data, &je); -free_strings: - free(umask); - free(cmask); - free(inv); - free(any); - free(edge); - free(period); - free(fc_mask); - free(ch_mask); - free(event); - free(je.desc); - free(je.name); - free(je.compat); - free(je.long_desc); - free(extra_desc); - free(je.pmu); - free(filter); - free(je.perpkg); - free(je.aggr_mode); - free(je.deprecated); - free(je.unit); - free(je.metric_expr); - free(je.metric_name); - free(je.metric_group); - free(je.metric_constraint); - free(arch_std); - - if (err) - break; - tok += j; - } - EXPECT(tok - tokens == len, tok, "unexpected objects at end"); - err = 0; -out_free: - free_json(map, size, tokens); - return err; -} - -static char *file_name_to_table_name(char *fname) -{ - unsigned int i; - int n; - int c; - char *tblname; - - /* - * Ensure tablename starts with alphabetic character. - * Derive rest of table name from basename of the JSON file, - * replacing hyphens and stripping out .json suffix. - */ - n = asprintf(&tblname, "pme_%s", fname); - if (n < 0) { - pr_info("%s: asprintf() error %s for file %s\n", prog, - strerror(errno), fname); - return NULL; - } - - for (i = 0; i < strlen(tblname); i++) { - c = tblname[i]; - - if (c == '-' || c == '/') - tblname[i] = '_'; - else if (c == '.') { - tblname[i] = '\0'; - break; - } else if (!isalnum(c) && c != '_') { - pr_err("%s: Invalid character '%c' in file name %s\n", - prog, c, basename(fname)); - free(tblname); - tblname = NULL; - break; - } - } - - return tblname; -} - -static bool is_sys_dir(char *fname) -{ - size_t len = strlen(fname), len2 = strlen("/sys"); - - if (len2 > len) - return false; - return !strcmp(fname+len-len2, "/sys"); -} - -static void print_mapping_table_prefix(FILE *outfp) -{ - fprintf(outfp, "const struct pmu_events_map pmu_events_map[] = {\n"); -} - -static void print_mapping_table_suffix(FILE *outfp) -{ - /* - * Print the terminating, NULL entry. - */ - fprintf(outfp, "{\n"); - fprintf(outfp, "\t.cpuid = 0,\n"); - fprintf(outfp, "\t.version = 0,\n"); - fprintf(outfp, "\t.type = 0,\n"); - fprintf(outfp, "\t.table = 0,\n"); - fprintf(outfp, "},\n"); - - /* and finally, the closing curly bracket for the struct */ - fprintf(outfp, "};\n"); -} - -static void print_mapping_test_table(FILE *outfp) -{ - /* - * Print the terminating, NULL entry. 
- */ - fprintf(outfp, "{\n"); - fprintf(outfp, "\t.cpuid = \"testcpu\",\n"); - fprintf(outfp, "\t.version = \"v1\",\n"); - fprintf(outfp, "\t.type = \"core\",\n"); - fprintf(outfp, "\t.table = pme_test_soc_cpu,\n"); - fprintf(outfp, "},\n"); -} - -static void print_system_event_mapping_table_prefix(FILE *outfp) -{ - fprintf(outfp, "\nconst struct pmu_sys_events pmu_sys_event_tables[] = {"); -} - -static void print_system_event_mapping_table_suffix(FILE *outfp) -{ - fprintf(outfp, "\n\t{\n\t\t.table = 0\n\t},"); - fprintf(outfp, "\n};\n"); -} - -static int process_system_event_tables(FILE *outfp) -{ - struct sys_event_table *sys_event_table; - - print_system_event_mapping_table_prefix(outfp); - - list_for_each_entry(sys_event_table, &sys_event_tables, list) { - fprintf(outfp, "\n\t{\n\t\t.table = %s,\n\t\t.name = \"%s\",\n\t},", - sys_event_table->soc_id, - sys_event_table->soc_id); - } - - print_system_event_mapping_table_suffix(outfp); - - return 0; -} - -static int process_mapfile(FILE *outfp, char *fpath) -{ - int n = 16384; - FILE *mapfp; - char *save = NULL; - char *line, *p; - int line_num; - char *tblname; - int ret = 0; - - pr_info("%s: Processing mapfile %s\n", prog, fpath); - - line = malloc(n); - if (!line) - return -1; - - mapfp = fopen(fpath, "r"); - if (!mapfp) { - pr_info("%s: Error %s opening %s\n", prog, strerror(errno), - fpath); - free(line); - return -1; - } - - print_mapping_table_prefix(outfp); - - /* Skip first line (header) */ - p = fgets(line, n, mapfp); - if (!p) - goto out; - - line_num = 1; - while (1) { - char *cpuid, *version, *type, *fname; - - line_num++; - p = fgets(line, n, mapfp); - if (!p) - break; - - if (line[0] == '#' || line[0] == '\n') - continue; - - if (line[strlen(line)-1] != '\n') { - /* TODO Deal with lines longer than 16K */ - pr_info("%s: Mapfile %s: line %d too long, aborting\n", - prog, fpath, line_num); - ret = -1; - goto out; - } - line[strlen(line)-1] = '\0'; - - cpuid = fixregex(strtok_r(p, ",", &save)); - version = strtok_r(NULL, ",", &save); - fname = strtok_r(NULL, ",", &save); - type = strtok_r(NULL, ",", &save); - - tblname = file_name_to_table_name(fname); - fprintf(outfp, "{\n"); - fprintf(outfp, "\t.cpuid = \"%s\",\n", cpuid); - fprintf(outfp, "\t.version = \"%s\",\n", version); - fprintf(outfp, "\t.type = \"%s\",\n", type); - - /* - * CHECK: We can't use the type (eg "core") field in the - * table name. For us to do that, we need to somehow tweak - * the other caller of file_name_to_table(), process_json() - * to determine the type. process_json() file has no way - * of knowing these are "core" events unless file name has - * core in it. If filename has core in it, we can safely - * ignore the type field here also. - */ - fprintf(outfp, "\t.table = %s\n", tblname); - fprintf(outfp, "},\n"); - } - -out: - print_mapping_test_table(outfp); - print_mapping_table_suffix(outfp); - fclose(mapfp); - free(line); - return ret; -} - -/* - * If we fail to locate/process JSON and map files, create a NULL mapping - * table. This would at least allow perf to build even if we can't find/use - * the aliases. 
- */ -static void create_empty_mapping(const char *output_file) -{ - FILE *outfp; - - pr_info("%s: Creating empty pmu_events_map[] table\n", prog); - - /* Truncate file to clear any partial writes to it */ - outfp = fopen(output_file, "w"); - if (!outfp) { - perror("fopen()"); - _Exit(1); - } - - fprintf(outfp, "#include \"pmu-events/pmu-events.h\"\n"); - print_mapping_table_prefix(outfp); - print_mapping_table_suffix(outfp); - print_system_event_mapping_table_prefix(outfp); - print_system_event_mapping_table_suffix(outfp); - fclose(outfp); -} - -static int get_maxfds(void) -{ - struct rlimit rlim; - - if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) - return min(rlim.rlim_max / 2, (rlim_t)512); - - return 512; -} - -/* - * nftw() doesn't let us pass an argument to the processing function, - * so use a global variables. - */ -static FILE *eventsfp; -static char *mapfile; - -static int is_leaf_dir(const char *fpath) -{ - DIR *d; - struct dirent *dir; - int res = 1; - - d = opendir(fpath); - if (!d) - return 0; - - while ((dir = readdir(d)) != NULL) { - if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) - continue; - - if (dir->d_type == DT_DIR) { - res = 0; - break; - } else if (dir->d_type == DT_UNKNOWN) { - char path[PATH_MAX]; - struct stat st; - - sprintf(path, "%s/%s", fpath, dir->d_name); - if (stat(path, &st)) - break; - - if (S_ISDIR(st.st_mode)) { - res = 0; - break; - } - } - } - - closedir(d); - - return res; -} - -static int is_json_file(const char *name) -{ - const char *suffix; - - if (strlen(name) < 5) - return 0; - - suffix = name + strlen(name) - 5; - - if (strncmp(suffix, ".json", 5) == 0) - return 1; - return 0; -} - -static int preprocess_arch_std_files(const char *fpath, const struct stat *sb, - int typeflag, struct FTW *ftwbuf) -{ - int level = ftwbuf->level; - int is_file = typeflag == FTW_F; - - if (level == 1 && is_file && is_json_file(fpath)) - return json_events(fpath, save_arch_std_events, (void *)sb); - - return 0; -} - -static int process_one_file(const char *fpath, const struct stat *sb, - int typeflag, struct FTW *ftwbuf) -{ - char *tblname, *bname; - int is_dir = typeflag == FTW_D; - int is_file = typeflag == FTW_F; - int level = ftwbuf->level; - int err = 0; - - if (level >= 2 && is_dir) { - int count = 0; - /* - * For level 2 directory, bname will include parent name, - * like vendor/platform. So search back from platform dir - * to find this. - * Something similar for level 3 directory, but we're a PMU - * category folder, like vendor/platform/cpu. - */ - bname = (char *) fpath + ftwbuf->base - 2; - for (;;) { - if (*bname == '/') - count++; - if (count == level - 1) - break; - bname--; - } - bname++; - } else - bname = (char *) fpath + ftwbuf->base; - - pr_debug("%s %d %7jd %-20s %s\n", - is_file ? "f" : is_dir ? "d" : "x", - level, sb->st_size, bname, fpath); - - /* base dir or too deep */ - if (level == 0 || level > 4) - return 0; - - - /* model directory, reset topic */ - if ((level == 1 && is_dir && is_leaf_dir(fpath)) || - (level >= 2 && is_dir && is_leaf_dir(fpath))) { - if (close_table) - print_events_table_suffix(eventsfp); - - /* - * Drop file name suffix. Replace hyphens with underscores. - * Fail if file name contains any alphanum characters besides - * underscores. 
- */ - tblname = file_name_to_table_name(bname); - if (!tblname) { - pr_info("%s: Error determining table name for %s\n", prog, - bname); - return -1; - } - - if (is_sys_dir(bname)) { - struct sys_event_table *sys_event_table; - - sys_event_table = malloc(sizeof(*sys_event_table)); - if (!sys_event_table) - return -1; - - sys_event_table->soc_id = strdup(tblname); - if (!sys_event_table->soc_id) { - free(sys_event_table); - return -1; - } - list_add_tail(&sys_event_table->list, - &sys_event_tables); - } - - print_events_table_prefix(eventsfp, tblname); - return 0; - } - - /* - * Save the mapfile name for now. We will process mapfile - * after processing all JSON files (so we can write out the - * mapping table after all PMU events tables). - * - */ - if (level == 1 && is_file) { - if (!strcmp(bname, "mapfile.csv")) { - mapfile = strdup(fpath); - return 0; - } - if (is_json_file(bname)) - pr_debug("%s: ArchStd json is preprocessed %s\n", prog, fpath); - else - pr_info("%s: Ignoring file %s\n", prog, fpath); - return 0; - } - - /* - * If the file name does not have a .json extension, - * ignore it. It could be a readme.txt for instance. - */ - if (is_file) { - if (!is_json_file(bname)) { - pr_info("%s: Ignoring file without .json suffix %s\n", prog, - fpath); - return 0; - } - } - - if (level > 1 && add_topic(bname)) - return -ENOMEM; - - /* - * Assume all other files are JSON files. - * - * If mapfile refers to 'power7_core.json', we create a table - * named 'power7_core'. Any inconsistencies between the mapfile - * and directory tree could result in build failure due to table - * names not being found. - * - * At least for now, be strict with processing JSON file names. - * i.e. if JSON file name cannot be mapped to C-style table name, - * fail. - */ - if (is_file) { - struct perf_entry_data data = { - .topic = get_topic(), - .outfp = eventsfp, - }; - - err = json_events(fpath, print_events_table_entry, &data); - - free(data.topic); - } - - return err; -} - -#ifndef PATH_MAX -#define PATH_MAX 4096 -#endif - -/* - * Starting in directory 'start_dirname', find the "mapfile.csv" and - * the set of JSON files for the architecture 'arch'. - * - * From each JSON file, create a C-style "PMU events table" from the - * JSON file (see struct pmu_event). - * - * From the mapfile, create a mapping between the CPU revisions and - * PMU event tables (see struct pmu_events_map). - * - * Write out the PMU events tables and the mapping table to pmu-event.c. 
- */ -int main(int argc, char *argv[]) -{ - int rc, ret = 0, empty_map = 0; - int maxfds; - char ldirname[PATH_MAX]; - const char *arch; - const char *output_file; - const char *start_dirname; - const char *err_string_ext = ""; - struct stat stbuf; - - prog = basename(argv[0]); - if (argc < 4) { - pr_err("Usage: %s \n", prog); - return 1; - } - - arch = argv[1]; - start_dirname = argv[2]; - output_file = argv[3]; - - if (argc > 4) - verbose = atoi(argv[4]); - - eventsfp = fopen(output_file, "w"); - if (!eventsfp) { - pr_err("%s Unable to create required file %s (%s)\n", - prog, output_file, strerror(errno)); - return 2; - } - - sprintf(ldirname, "%s/%s", start_dirname, arch); - - /* If architecture does not have any event lists, bail out */ - if (stat(ldirname, &stbuf) < 0) { - pr_info("%s: Arch %s has no PMU event lists\n", prog, arch); - empty_map = 1; - goto err_close_eventsfp; - } - - /* Include pmu-events.h first */ - fprintf(eventsfp, "#include \"pmu-events/pmu-events.h\"\n"); - - /* - * The mapfile allows multiple CPUids to point to the same JSON file, - * so, not sure if there is a need for symlinks within the pmu-events - * directory. - * - * For now, treat symlinks of JSON files as regular files and create - * separate tables for each symlink (presumably, each symlink refers - * to specific version of the CPU). - */ - - maxfds = get_maxfds(); - rc = nftw(ldirname, preprocess_arch_std_files, maxfds, 0); - if (rc) - goto err_processing_std_arch_event_dir; - - rc = nftw(ldirname, process_one_file, maxfds, 0); - if (rc) - goto err_processing_dir; - - sprintf(ldirname, "%s/test", start_dirname); - - rc = nftw(ldirname, preprocess_arch_std_files, maxfds, 0); - if (rc) - goto err_processing_std_arch_event_dir; - - rc = nftw(ldirname, process_one_file, maxfds, 0); - if (rc) - goto err_processing_dir; - - if (close_table) - print_events_table_suffix(eventsfp); - - if (!mapfile) { - pr_info("%s: No CPU->JSON mapping?\n", prog); - empty_map = 1; - goto err_close_eventsfp; - } - - rc = process_mapfile(eventsfp, mapfile); - if (rc) { - pr_info("%s: Error processing mapfile %s\n", prog, mapfile); - /* Make build fail */ - ret = 1; - goto err_close_eventsfp; - } - - rc = process_system_event_tables(eventsfp); - fclose(eventsfp); - if (rc) { - ret = 1; - goto err_out; - } - - free_arch_std_events(); - free_sys_event_tables(); - free(mapfile); - return 0; - -err_processing_std_arch_event_dir: - err_string_ext = " for std arch event"; -err_processing_dir: - if (verbose) { - pr_info("%s: Error walking file tree %s%s\n", prog, ldirname, - err_string_ext); - empty_map = 1; - } else if (rc < 0) { - ret = 1; - } else { - empty_map = 1; - } -err_close_eventsfp: - fclose(eventsfp); - if (empty_map) - create_empty_mapping(output_file); -err_out: - free_arch_std_events(); - free_sys_event_tables(); - free(mapfile); - return ret; -} diff --git a/tools/perf/pmu-events/jsmn.c b/tools/perf/pmu-events/jsmn.c deleted file mode 100644 index 831dc44c4558..000000000000 --- a/tools/perf/pmu-events/jsmn.c +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Copyright (c) 2010 Serge A. 
Zaitsev - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - * Slightly modified by AK to not assume 0 terminated input. - */ - -#include -#include "jsmn.h" -#define JSMN_STRICT - -/* - * Allocates a fresh unused token from the token pool. - */ -static jsmntok_t *jsmn_alloc_token(jsmn_parser *parser, - jsmntok_t *tokens, size_t num_tokens) -{ - jsmntok_t *tok; - - if ((unsigned)parser->toknext >= num_tokens) - return NULL; - tok = &tokens[parser->toknext++]; - tok->start = tok->end = -1; - tok->size = 0; - return tok; -} - -/* - * Fills token type and boundaries. - */ -static void jsmn_fill_token(jsmntok_t *token, jsmntype_t type, - int start, int end) -{ - token->type = type; - token->start = start; - token->end = end; - token->size = 0; -} - -/* - * Fills next available token with JSON primitive. - */ -static jsmnerr_t jsmn_parse_primitive(jsmn_parser *parser, const char *js, - size_t len, - jsmntok_t *tokens, size_t num_tokens) -{ - jsmntok_t *token; - int start; - - start = parser->pos; - - for (; parser->pos < len; parser->pos++) { - switch (js[parser->pos]) { -#ifndef JSMN_STRICT - /* - * In strict mode primitive must be followed by "," - * or "}" or "]" - */ - case ':': -#endif - case '\t': - case '\r': - case '\n': - case ' ': - case ',': - case ']': - case '}': - goto found; - default: - break; - } - if (js[parser->pos] < 32 || js[parser->pos] >= 127) { - parser->pos = start; - return JSMN_ERROR_INVAL; - } - } -#ifdef JSMN_STRICT - /* - * In strict mode primitive must be followed by a - * comma/object/array. - */ - parser->pos = start; - return JSMN_ERROR_PART; -#endif - -found: - token = jsmn_alloc_token(parser, tokens, num_tokens); - if (token == NULL) { - parser->pos = start; - return JSMN_ERROR_NOMEM; - } - jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos); - parser->pos--; /* parent sees closing brackets */ - return JSMN_SUCCESS; -} - -/* - * Fills next token with JSON string. 
- */ -static jsmnerr_t jsmn_parse_string(jsmn_parser *parser, const char *js, - size_t len, - jsmntok_t *tokens, size_t num_tokens) -{ - jsmntok_t *token; - int start = parser->pos; - - /* Skip starting quote */ - parser->pos++; - - for (; parser->pos < len; parser->pos++) { - char c = js[parser->pos]; - - /* Quote: end of string */ - if (c == '\"') { - token = jsmn_alloc_token(parser, tokens, num_tokens); - if (token == NULL) { - parser->pos = start; - return JSMN_ERROR_NOMEM; - } - jsmn_fill_token(token, JSMN_STRING, start+1, - parser->pos); - return JSMN_SUCCESS; - } - - /* Backslash: Quoted symbol expected */ - if (c == '\\') { - parser->pos++; - switch (js[parser->pos]) { - /* Allowed escaped symbols */ - case '\"': - case '/': - case '\\': - case 'b': - case 'f': - case 'r': - case 'n': - case 't': - break; - /* Allows escaped symbol \uXXXX */ - case 'u': - /* TODO */ - break; - /* Unexpected symbol */ - default: - parser->pos = start; - return JSMN_ERROR_INVAL; - } - } - } - parser->pos = start; - return JSMN_ERROR_PART; -} - -/* - * Parse JSON string and fill tokens. - */ -jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, size_t len, - jsmntok_t *tokens, unsigned int num_tokens) -{ - jsmnerr_t r; - int i; - jsmntok_t *token; -#ifdef JSMN_STRICT - /* - * Keeps track of whether a new object/list/primitive is expected. New items are only - * allowed after an opening brace, comma or colon. A closing brace after a comma is not - * valid JSON. - */ - int expecting_item = 1; -#endif - - for (; parser->pos < len; parser->pos++) { - char c; - jsmntype_t type; - - c = js[parser->pos]; - switch (c) { - case '{': - case '[': -#ifdef JSMN_STRICT - if (!expecting_item) - return JSMN_ERROR_INVAL; -#endif - token = jsmn_alloc_token(parser, tokens, num_tokens); - if (token == NULL) - return JSMN_ERROR_NOMEM; - if (parser->toksuper != -1) - tokens[parser->toksuper].size++; - token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY); - token->start = parser->pos; - parser->toksuper = parser->toknext - 1; - break; - case '}': - case ']': -#ifdef JSMN_STRICT - if (expecting_item) - return JSMN_ERROR_INVAL; -#endif - type = (c == '}' ? JSMN_OBJECT : JSMN_ARRAY); - for (i = parser->toknext - 1; i >= 0; i--) { - token = &tokens[i]; - if (token->start != -1 && token->end == -1) { - if (token->type != type) - return JSMN_ERROR_INVAL; - parser->toksuper = -1; - token->end = parser->pos + 1; - break; - } - } - /* Error if unmatched closing bracket */ - if (i == -1) - return JSMN_ERROR_INVAL; - for (; i >= 0; i--) { - token = &tokens[i]; - if (token->start != -1 && token->end == -1) { - parser->toksuper = i; - break; - } - } - break; - case '\"': -#ifdef JSMN_STRICT - if (!expecting_item) - return JSMN_ERROR_INVAL; - expecting_item = 0; -#endif - r = jsmn_parse_string(parser, js, len, tokens, - num_tokens); - if (r < 0) - return r; - if (parser->toksuper != -1) - tokens[parser->toksuper].size++; - break; - case '\t': - case '\r': - case '\n': - case ' ': - break; -#ifdef JSMN_STRICT - case ':': - case ',': - if (expecting_item) - return JSMN_ERROR_INVAL; - expecting_item = 1; - break; - /* - * In strict mode primitives are: - * numbers and booleans. - */ - case '-': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case 't': - case 'f': - case 'n': -#else - case ':': - case ',': - break; - /* - * In non-strict mode every unquoted value - * is a primitive. 
- */ - /*FALL THROUGH */ - default: -#endif - -#ifdef JSMN_STRICT - if (!expecting_item) - return JSMN_ERROR_INVAL; - expecting_item = 0; -#endif - r = jsmn_parse_primitive(parser, js, len, tokens, - num_tokens); - if (r < 0) - return r; - if (parser->toksuper != -1) - tokens[parser->toksuper].size++; - break; - -#ifdef JSMN_STRICT - /* Unexpected char in strict mode */ - default: - return JSMN_ERROR_INVAL; -#endif - } - } - - for (i = parser->toknext - 1; i >= 0; i--) { - /* Unmatched opened object or array */ - if (tokens[i].start != -1 && tokens[i].end == -1) - return JSMN_ERROR_PART; - } - -#ifdef JSMN_STRICT - return expecting_item ? JSMN_ERROR_INVAL : JSMN_SUCCESS; -#else - return JSMN_SUCCESS; -#endif -} - -/* - * Creates a new parser based over a given buffer with an array of tokens - * available. - */ -void jsmn_init(jsmn_parser *parser) -{ - parser->pos = 0; - parser->toknext = 0; - parser->toksuper = -1; -} - -const char *jsmn_strerror(jsmnerr_t err) -{ - switch (err) { - case JSMN_ERROR_NOMEM: - return "No enough tokens"; - case JSMN_ERROR_INVAL: - return "Invalid character inside JSON string"; - case JSMN_ERROR_PART: - return "The string is not a full JSON packet, more bytes expected"; - case JSMN_SUCCESS: - return "Success"; - default: - return "Unknown json error"; - } -} diff --git a/tools/perf/pmu-events/jsmn.h b/tools/perf/pmu-events/jsmn.h deleted file mode 100644 index 1bdfd55fff30..000000000000 --- a/tools/perf/pmu-events/jsmn.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -#ifndef __JSMN_H_ -#define __JSMN_H_ - -/* - * JSON type identifier. Basic types are: - * o Object - * o Array - * o String - * o Other primitive: number, boolean (true/false) or null - */ -typedef enum { - JSMN_PRIMITIVE = 0, - JSMN_OBJECT = 1, - JSMN_ARRAY = 2, - JSMN_STRING = 3 -} jsmntype_t; - -typedef enum { - /* Not enough tokens were provided */ - JSMN_ERROR_NOMEM = -1, - /* Invalid character inside JSON string */ - JSMN_ERROR_INVAL = -2, - /* The string is not a full JSON packet, more bytes expected */ - JSMN_ERROR_PART = -3, - /* Everything was fine */ - JSMN_SUCCESS = 0 -} jsmnerr_t; - -/* - * JSON token description. - * @param type type (object, array, string etc.) - * @param start start position in JSON data string - * @param end end position in JSON data string - */ -typedef struct { - jsmntype_t type; - int start; - int end; - int size; -} jsmntok_t; - -/* - * JSON parser. Contains an array of token blocks available. Also stores - * the string being parsed now and current position in that string - */ -typedef struct { - unsigned int pos; /* offset in the JSON string */ - int toknext; /* next token to allocate */ - int toksuper; /* superior token node, e.g parent object or array */ -} jsmn_parser; - -/* - * Create JSON parser over an array of tokens - */ -void jsmn_init(jsmn_parser *parser); - -/* - * Run JSON parser. It parses a JSON data string into and array of tokens, - * each describing a single JSON object. - */ -jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, - size_t len, - jsmntok_t *tokens, unsigned int num_tokens); - -const char *jsmn_strerror(jsmnerr_t err); - -#endif /* __JSMN_H_ */ diff --git a/tools/perf/pmu-events/json.c b/tools/perf/pmu-events/json.c deleted file mode 100644 index 0544398d6e2d..000000000000 --- a/tools/perf/pmu-events/json.c +++ /dev/null @@ -1,162 +0,0 @@ -/* Parse JSON files using the JSMN parser. */ - -/* - * Copyright (c) 2014, Intel Corporation - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "jsmn.h" -#include "json.h" -#include - - -static char *mapfile(const char *fn, size_t *size) -{ - unsigned ps = sysconf(_SC_PAGESIZE); - struct stat st; - char *map = NULL; - int err; - int fd = open(fn, O_RDONLY); - - if (fd < 0 && verbose > 0 && fn) { - pr_err("Error opening events file '%s': %s\n", fn, - strerror(errno)); - } - - if (fd < 0) - return NULL; - err = fstat(fd, &st); - if (err < 0) - goto out; - *size = st.st_size; - map = mmap(NULL, - (st.st_size + ps - 1) & ~(ps - 1), - PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); - if (map == MAP_FAILED) - map = NULL; -out: - close(fd); - return map; -} - -static void unmapfile(char *map, size_t size) -{ - unsigned ps = sysconf(_SC_PAGESIZE); - munmap(map, roundup(size, ps)); -} - -/* - * Parse json file using jsmn. Return array of tokens, - * and mapped file. Caller needs to free array. - */ -jsmntok_t *parse_json(const char *fn, char **map, size_t *size, int *len) -{ - jsmn_parser parser; - jsmntok_t *tokens; - jsmnerr_t res; - unsigned sz; - - *map = mapfile(fn, size); - if (!*map) - return NULL; - /* Heuristic */ - sz = *size * 16; - tokens = malloc(sz); - if (!tokens) - goto error; - jsmn_init(&parser); - res = jsmn_parse(&parser, *map, *size, tokens, - sz / sizeof(jsmntok_t)); - if (res != JSMN_SUCCESS) { - pr_err("%s: json error %s\n", fn, jsmn_strerror(res)); - goto error_free; - } - if (len) - *len = parser.toknext; - return tokens; -error_free: - free(tokens); -error: - unmapfile(*map, *size); - return NULL; -} - -void free_json(char *map, size_t size, jsmntok_t *tokens) -{ - free(tokens); - unmapfile(map, size); -} - -static int countchar(char *map, char c, int end) -{ - int i; - int count = 0; - for (i = 0; i < end; i++) - if (map[i] == c) - count++; - return count; -} - -/* Return line number of a jsmn token */ -int json_line(char *map, jsmntok_t *t) -{ - return countchar(map, '\n', t->start) + 1; -} - -static const char * const jsmn_types[] = { - [JSMN_PRIMITIVE] = "primitive", - [JSMN_ARRAY] = "array", - [JSMN_OBJECT] = "object", - [JSMN_STRING] = "string" -}; - -#define LOOKUP(a, i) ((i) < (sizeof(a)/sizeof(*(a))) ? 
((a)[i]) : "?") - -/* Return type name of a jsmn token */ -const char *json_name(jsmntok_t *t) -{ - return LOOKUP(jsmn_types, t->type); -} - -int json_len(jsmntok_t *t) -{ - return t->end - t->start; -} - -/* Is string t equal to s? */ -int json_streq(char *map, jsmntok_t *t, const char *s) -{ - unsigned len = json_len(t); - return len == strlen(s) && !strncasecmp(map + t->start, s, len); -} diff --git a/tools/perf/pmu-events/json.h b/tools/perf/pmu-events/json.h deleted file mode 100644 index fbcd5a0590ad..000000000000 --- a/tools/perf/pmu-events/json.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef JSON_H -#define JSON_H 1 - -#include "jsmn.h" - -jsmntok_t *parse_json(const char *fn, char **map, size_t *size, int *len); -void free_json(char *map, size_t size, jsmntok_t *tokens); -int json_line(char *map, jsmntok_t *t); -const char *json_name(jsmntok_t *t); -int json_streq(char *map, jsmntok_t *t, const char *s); -int json_len(jsmntok_t *t); - -extern int verbose; - -#include - -extern int eprintf(int level, int var, const char *fmt, ...); -#define pr_fmt(fmt) fmt - -#define pr_err(fmt, ...) \ - eprintf(0, verbose, pr_fmt(fmt), ##__VA_ARGS__) - -#define pr_info(fmt, ...) \ - eprintf(1, verbose, pr_fmt(fmt), ##__VA_ARGS__) - -#define pr_debug(fmt, ...) \ - eprintf(2, verbose, pr_fmt(fmt), ##__VA_ARGS__) - -#ifndef roundup -#define roundup(x, y) ( \ -{ \ - const typeof(y) __y = y; \ - (((x) + (__y - 1)) / __y) * __y; \ -} \ -) -#endif - -#endif -- cgit v1.2.3 From b0cbd6154a9a3964490cafa0a9444be626271bd9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 30 Jun 2022 11:36:38 +0200 Subject: bpftool: Remove attach_type_name forward declaration The attach_type_name definition was removed in commit 1ba5ad36e00f ("bpftool: Use libbpf_bpf_attach_type_str"). Remove its forward declaration in main.h as well. Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220630093638.25916-1-tklauser@distanz.ch --- tools/bpf/bpftool/main.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 589cb76b227a..5e5060c2ac04 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -63,8 +63,6 @@ static inline void *u64_to_ptr(__u64 ptr) #define HELP_SPEC_LINK \ "LINK := { id LINK_ID | pinned FILE }" -extern const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE]; - /* keep in sync with the definition in skeleton/pid_iter.bpf.c */ enum bpf_obj_type { BPF_OBJ_UNKNOWN, -- cgit v1.2.3 From 27b3f70553432114b3d26f4d9c72cf02f38b84ee Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Jun 2022 21:36:36 +0100 Subject: bpftool: Add feature list (prog/map/link/attach types, helpers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a "bpftool feature list" subcommand to list BPF "features". Contrarily to "bpftool feature probe", this is not about the features available on the system. Instead, it lists all features known to bpftool from compilation time; in other words, all program, map, attach, link types known to the libbpf version in use, and all helpers found in the UAPI BPF header. The first use case for this feature is bash completion: running the command provides a list of types that can be used to produce the list of candidate map types, for example. 
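The enumeration itself is a short loop; as a sketch (condensed from the do_list() implementation in the diff below; the function-pointer cast mirrors what the patch does with libbpf's *_str() helpers, which return NULL past the last known value):

  const char *(*get_name)(unsigned int id) =
          (const char *(*)(unsigned int))libbpf_bpf_map_type_str;
  unsigned int id = 0;
  const char *name;

  /* NULL from the lookup marks the end of the known types */
  while ((name = get_name(id++)))
          printf("%s\n", name);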
Now that bpftool uses "standard" names provided by libbpf for the program, map, link, and attach types, having the ability to list these types and helpers could also be useful in scripts to loop over existing items. Sample output: # bpftool feature list prog_types | grep -vw unspec | head -n 6 socket_filter kprobe sched_cls sched_act tracepoint xdp # bpftool -p feature list map_types | jq '.[1]' "hash" # bpftool feature list attach_types | grep '^cgroup_' cgroup_inet_ingress cgroup_inet_egress [...] cgroup_inet_sock_release # bpftool feature list helpers | grep -vw bpf_unspec | wc -l 207 The "unspec" types and helpers are not filtered out by bpftool, so as to remain closer to the enums, and to preserve the indices in the JSON arrays (e.g. "hash" at index 1 == BPF_MAP_TYPE_HASH in map types list). Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Daniel Müller Link: https://lore.kernel.org/bpf/20220629203637.138944-2-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-feature.rst | 12 +++++ tools/bpf/bpftool/bash-completion/bpftool | 7 ++- tools/bpf/bpftool/feature.c | 55 ++++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 4ce9a77bc1e0..c08064628d39 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -24,9 +24,11 @@ FEATURE COMMANDS ================ | **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list** *GROUP* | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } DESCRIPTION =========== @@ -70,6 +72,16 @@ DESCRIPTION The keywords **full**, **macros** and **prefix** have the same role as when probing the kernel. + **bpftool feature list** *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper + functions (**helpers**). The command does not probe the system, but + simply lists the elements that bpftool knows from compilation time, + as provided from libbpf (for all object types) or from the BPF UAPI + header (list of helpers). This can be used in scripts to iterate over + BPF types or helpers. + **bpftool feature help** Print short help message. 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 91f89a9a5b36..9cef6516320b 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1175,9 +1175,14 @@ _bpftool() _bpftool_once_attr 'full unprivileged' return 0 ;; + list) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W 'prog_types map_types \ + attach_types link_types helpers' -- "$cur" ) ) + ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help probe' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help list probe' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index bac4ef428a02..576cc6b90c6a 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1258,6 +1258,58 @@ exit_close_json: return 0; } +static const char *get_helper_name(unsigned int id) +{ + if (id >= ARRAY_SIZE(helper_name)) + return NULL; + + return helper_name[id]; +} + +static int do_list(int argc, char **argv) +{ + const char *(*get_name)(unsigned int id); + unsigned int id = 0; + + if (argc < 1) + usage(); + + if (is_prefix(*argv, "prog_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_prog_type_str; + } else if (is_prefix(*argv, "map_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_map_type_str; + } else if (is_prefix(*argv, "attach_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_attach_type_str; + } else if (is_prefix(*argv, "link_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_link_type_str; + } else if (is_prefix(*argv, "helpers")) { + get_name = get_helper_name; + } else { + p_err("expected 'prog_types', 'map_types', 'attach_types', 'link_types' or 'helpers', got: %s", *argv); + return -1; + } + + if (json_output) + jsonw_start_array(json_wtr); /* root array */ + + while (true) { + const char *name; + + name = get_name(id++); + if (!name) + break; + if (json_output) + jsonw_string(json_wtr, name); + else + printf("%s\n", name); + } + + if (json_output) + jsonw_end_array(json_wtr); /* root array */ + + return 0; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -1267,9 +1319,11 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" + " %1$s %2$s list GROUP\n" " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" + " GROUP := { prog_types | map_types | attach_types | link_types | helpers }\n" " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); @@ -1279,6 +1333,7 @@ static int do_help(int argc, char **argv) static const struct cmd cmds[] = { { "probe", do_probe }, + { "list", do_list }, { "help", do_help }, { 0 } }; -- cgit v1.2.3 From 6d304871e3ef4c339c06aa9b4ab55b6c77642884 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Jun 2022 21:36:37 +0100 Subject: bpftool: Use feature list in bash completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that bpftool is able to produce a list of known program, map, attach types, let's use as much of this as we can in the bash completion file, so that we don't have to expand the list each time a new type is added to the kernel. Also update the relevant test script to remove some checks that are no longer needed. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Daniel Müller Link: https://lore.kernel.org/bpf/20220629203637.138944-3-quentin@isovalent.com --- tools/bpf/bpftool/bash-completion/bpftool | 21 ++++----------------- .../testing/selftests/bpf/test_bpftool_synctypes.py | 20 +++----------------- 2 files changed, 7 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 9cef6516320b..ee177f83b179 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -703,15 +703,8 @@ _bpftool() return 0 ;; type) - local BPFTOOL_MAP_CREATE_TYPES='hash array \ - prog_array perf_event_array percpu_hash \ - percpu_array stack_trace cgroup_array lru_hash \ - lru_percpu_hash lpm_trie array_of_maps \ - hash_of_maps devmap devmap_hash sockmap cpumap \ - xskmap sockhash cgroup_storage reuseport_sockarray \ - percpu_cgroup_storage queue stack sk_storage \ - struct_ops ringbuf inode_storage task_storage \ - bloom_filter' + local BPFTOOL_MAP_CREATE_TYPES="$(bpftool feature list map_types | \ + grep -v '^unspec$')" COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) return 0 ;; @@ -1039,14 +1032,8 @@ _bpftool() return 0 ;; attach|detach) - local BPFTOOL_CGROUP_ATTACH_TYPES='cgroup_inet_ingress cgroup_inet_egress \ - cgroup_inet_sock_create cgroup_sock_ops cgroup_device cgroup_inet4_bind \ - cgroup_inet6_bind cgroup_inet4_post_bind cgroup_inet6_post_bind \ - cgroup_inet4_connect cgroup_inet6_connect cgroup_inet4_getpeername \ - cgroup_inet6_getpeername cgroup_inet4_getsockname cgroup_inet6_getsockname \ - cgroup_udp4_sendmsg cgroup_udp6_sendmsg cgroup_udp4_recvmsg \ - cgroup_udp6_recvmsg cgroup_sysctl cgroup_getsockopt cgroup_setsockopt \ - cgroup_inet_sock_release' + local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list attach_types | \ + grep '^cgroup_')" local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' # Check for $prev = $command first diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index e443e6542cb9..a6410bebe603 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -471,12 +471,6 @@ class BashcompExtractor(FileExtractor): def get_prog_attach_types(self): return self.get_bashcomp_list('BPFTOOL_PROG_ATTACH_TYPES') - def get_map_types(self): - return self.get_bashcomp_list('BPFTOOL_MAP_CREATE_TYPES') - - def get_cgroup_attach_types(self): - return self.get_bashcomp_list('BPFTOOL_CGROUP_ATTACH_TYPES') - def verify(first_set, second_set, message): """ Print all values that differ between two sets. 
@@ -516,17 +510,12 @@ def main(): man_map_types = man_map_info.get_map_types() man_map_info.close() - bashcomp_info = BashcompExtractor() - bashcomp_map_types = bashcomp_info.get_map_types() - verify(source_map_types, help_map_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {MapFileExtractor.filename} (do_help() TYPE):') verify(source_map_types, man_map_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {ManMapExtractor.filename} (TYPE):') verify(help_map_options, man_map_options, f'Comparing {MapFileExtractor.filename} (do_help() OPTIONS) and {ManMapExtractor.filename} (OPTIONS):') - verify(source_map_types, bashcomp_map_types, - f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') # Attach types (names) @@ -542,8 +531,10 @@ def main(): man_prog_attach_types = man_prog_info.get_attach_types() man_prog_info.close() - bashcomp_info.reset_read() # We stopped at map types, rewind + + bashcomp_info = BashcompExtractor() bashcomp_prog_attach_types = bashcomp_info.get_prog_attach_types() + bashcomp_info.close() verify(source_prog_attach_types, help_prog_attach_types, f'Comparing {ProgFileExtractor.filename} (bpf_attach_type) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') @@ -568,17 +559,12 @@ def main(): man_cgroup_attach_types = man_cgroup_info.get_attach_types() man_cgroup_info.close() - bashcomp_cgroup_attach_types = bashcomp_info.get_cgroup_attach_types() - bashcomp_info.close() - verify(source_cgroup_attach_types, help_cgroup_attach_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') verify(source_cgroup_attach_types, man_cgroup_attach_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') verify(help_cgroup_options, man_cgroup_options, f'Comparing {CgroupFileExtractor.filename} (do_help() OPTIONS) and {ManCgroupExtractor.filename} (OPTIONS):') - verify(source_cgroup_attach_types, bashcomp_cgroup_attach_types, - f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') # Options for remaining commands -- cgit v1.2.3 From 24d2e5d9da608445e2e5c2c9ab5cf418f0fc4612 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:55 +0200 Subject: selftests/xsk: Avoid bpf_link probe for existing xsk Currently, the bpf_link probe is done for each call of xsk_socket__create(). For cases where the xsk context was previously created and the current socket creation reuses it, has_bpf_link will be overwritten even though it has already been initialized. Optimize this by moving the probe to xsk_create_ctx(), so that once xsk_get_ctx() finds an existing ctx, no further bpf_link probes are needed.
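As an illustration, the resulting control flow can be sketched like this (condensed; xsk_create_ctx()'s full argument list is elided here):

  ctx = xsk_get_ctx(umem, ifindex, queue_id);
  if (!ctx) {
          /* only a freshly created ctx runs the bpf_link probe now */
          ctx = xsk_create_ctx(xsk, umem, ifindex, ...);
  }
  xsk->ctx = ctx;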
Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-2-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index eb50c3f336f8..fa13d2c44517 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -958,6 +958,7 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, ctx->fill = fill; ctx->comp = comp; list_add(&ctx->list, &umem->ctx_list); + ctx->has_bpf_link = xsk_probe_bpf_link(); return ctx; } @@ -1059,7 +1060,6 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } } xsk->ctx = ctx; - xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); if (rx && !rx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, -- cgit v1.2.3 From 61333008d01e18716a7050fdc9479cc8d6e63883 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:56 +0200 Subject: selftests/xsk: Introduce XDP prog load based on existing AF_XDP socket Currently, xsk_setup_xdp_prog() uses anonymous xsk_socket struct which means that during xsk_create_bpf_link() call, xsk->config.xdp_flags is always 0. This in turn means that from xdpxceiver it is impossible to use xdpgeneric attachment, so since commit 3b22523bca02 ("selftests, xsk: Fix bpf_res cleanup test") we were not testing SKB mode at all. To fix this, introduce a function, called xsk_setup_xdp_prog_xsk(), that will load XDP prog based on the existing xsk_socket, so that xsk context's refcount is correctly bumped and flags from application side are respected. Use this from xdpxceiver side so we get coverage of generic and native XDP program attach points. 
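A minimal usage sketch for comparison (identifier names here are illustrative):

  /* before: only the ifindex was known, so the application's
   * xdp_flags (e.g. XDP_FLAGS_SKB_MODE) could not be honored */
  ret = xsk_setup_xdp_prog(ifindex, &xsk_map_fd);

  /* after: the socket carries its own config.xdp_flags, so the
   * requested attach mode is respected */
  ret = xsk_setup_xdp_prog_xsk(xsk, &xsk_map_fd);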
Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-3-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 2 +- tools/testing/selftests/bpf/xsk.c | 5 +++++ tools/testing/selftests/bpf/xsk.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 019c567b6b4e..c024aa91ea02 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -1130,7 +1130,7 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) if (!ifindex) exit_with_error(errno); - ret = xsk_setup_xdp_prog(ifindex, &ifobject->xsk_map_fd); + ret = xsk_setup_xdp_prog_xsk(ifobject->xsk->xsk, &ifobject->xsk_map_fd); if (ret) exit_with_error(-ret); diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index fa13d2c44517..db911127720e 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -880,6 +880,11 @@ static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) return err; } +int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd) +{ + return __xsk_setup_xdp_prog(xsk, xsks_map_fd); +} + static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, __u32 queue_id) { diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 915e7135337c..997723b0bfb2 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -269,6 +269,7 @@ struct xsk_umem_config { __u32 flags; }; +int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd); int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); -- cgit v1.2.3 From 6d4c767c032b165cc290c51f4e82bc54b14b1cf1 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:57 +0200 Subject: selftests/xsk: Verify correctness of XDP prog attach point To prevent the case we had previously where for TEST_MODE_SKB, XDP prog was attached in native mode, call bpf_xdp_query() after loading prog and make sure that attach_mode is as expected. 
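Condensed, the check looks like this (the full version, which covers both the SKB and DRV cases, is in the diff below):

  LIBBPF_OPTS(bpf_xdp_query_opts, opts);

  ret = bpf_xdp_query(ifindex, xdp_flags, &opts);
  if (!ret && (xdp_flags & XDP_FLAGS_SKB_MODE) &&
      opts.attach_mode != XDP_ATTACHED_SKB)
          /* asked for generic (SKB) mode, prog landed elsewhere */
          exit_with_error(-EINVAL);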
Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-4-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index c024aa91ea02..4c425a43e5b0 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -1085,6 +1085,7 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) { u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + LIBBPF_OPTS(bpf_xdp_query_opts, opts); int ret, ifindex; void *bufs; u32 i; @@ -1134,6 +1135,22 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) if (ret) exit_with_error(-ret); + ret = bpf_xdp_query(ifindex, ifobject->xdp_flags, &opts); + if (ret) + exit_with_error(-ret); + + if (ifobject->xdp_flags & XDP_FLAGS_SKB_MODE) { + if (opts.attach_mode != XDP_ATTACHED_SKB) { + ksft_print_msg("ERROR: [%s] XDP prog not in SKB mode\n"); + exit_with_error(-EINVAL); + } + } else if (ifobject->xdp_flags & XDP_FLAGS_DRV_MODE) { + if (opts.attach_mode != XDP_ATTACHED_DRV) { + ksft_print_msg("ERROR: [%s] XDP prog not in DRV mode\n"); + exit_with_error(-EINVAL); + } + } + ret = xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd); if (ret) exit_with_error(-ret); -- cgit v1.2.3 From 39e940d4abfabb08b6937a315546b24d10be67e3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:58 +0200 Subject: selftests/xsk: Destroy BPF resources only when ctx refcount drops to 0 Currently, xsk_socket__delete frees BPF resources regardless of the ctx refcount. Xdpxceiver has a test to verify that the underlying BPF resources are not wiped out after closing an XSK socket that was bound to an interface with other active sockets. From the perspective of the library's xsk code, this also means that the internal xsk context is shared and its refcount is bumped accordingly. After the switch to loading the XDP prog based on a previously opened XSK socket, the aforementioned xdpxceiver test fails with: not ok 16 [xdpxceiver.c:swap_xsk_resources:1334]: ERROR: 9/"Bad file descriptor which means that in swap_xsk_resources(), xsk_socket__delete() released the xskmap, which in turn caused a failure of xsk_socket__update_xskmap(). To fix this, when deleting a socket, decrement the ctx refcount before releasing BPF resources, and release them only when the refcount has dropped to 0, which means there are no more active sockets for this ctx, so the BPF resources can be freed safely.
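Conceptually, the teardown ordering in xsk_socket__delete() becomes (sketch; the decrement itself happens inside xsk_put_ctx()):

  xsk_put_ctx(ctx, true);           /* drop this socket's reference */

  if (!ctx->refcount) {             /* last user of this ctx... */
          xsk_delete_bpf_maps(xsk); /* ...may release BPF resources */
          close(ctx->prog_fd);
  }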
Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-5-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xsk.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index db911127720e..f2721a4ae7c5 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -1156,8 +1156,6 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, goto out_mmap_tx; } - ctx->prog_fd = -1; - if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { err = __xsk_setup_xdp_prog(xsk, NULL); if (err) @@ -1238,7 +1236,10 @@ void xsk_socket__delete(struct xsk_socket *xsk) ctx = xsk->ctx; umem = ctx->umem; - if (ctx->prog_fd != -1) { + + xsk_put_ctx(ctx, true); + + if (!ctx->refcount) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); if (ctx->has_bpf_link) @@ -1257,8 +1258,6 @@ void xsk_socket__delete(struct xsk_socket *xsk) } } - xsk_put_ctx(ctx, true); - umem->refcount--; /* Do not close an fd that also has an associated umem connected * to it. -- cgit v1.2.3 From 0e862838f290147ea9c16db852d8d494b552d38d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 24 Jun 2022 14:13:07 +0200 Subject: bitops: unify non-atomic bitops prototypes across architectures Currently, there is a mess with the prototypes of the non-atomic bitops across the different architectures: the return type is variously bool, int or unsigned long; the bit number 'nr' is int, long, unsigned int or unsigned long; and 'addr' is either volatile unsigned long * or volatile void *. Thankfully, this doesn't provoke any bugs, but it can sometimes make the compiler angry at the least convenient moment. Adjust all the prototypes to the following standard: the return type is bool (the retval can only be 0 or 1), 'nr' is unsigned long (the native width; a signed type makes no sense), and 'addr' is volatile unsigned long * (bitmaps are arrays of ulongs). Next, some architectures don't define 'arch_' versions as they don't support instrumentation, others do.
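For illustration, the standardized signatures, as they appear in the per-arch diffs that follow:

  static __always_inline void
  arch___set_bit(unsigned long nr, volatile unsigned long *addr);

  static __always_inline bool
  arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr);

  static __always_inline bool
  arch_test_bit(unsigned long nr, const volatile unsigned long *addr);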
To make sure there is always the same set of callables present and to ease any potential future changes, make them all follow the rule: * architecture-specific files define only 'arch_' versions; * non-prefixed versions can be defined only in asm-generic files; and place the non-prefixed definitions into a new file in asm-generic to be included by non-instrumented architectures. Finally, add some static assertions in order to prevent people from making a mess in this room again. I also used the %__always_inline attribute consistently, so that they always get resolved to the actual operations. Suggested-by: Andy Shevchenko Signed-off-by: Alexander Lobakin Acked-by: Mark Rutland Reviewed-by: Yury Norov Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- arch/alpha/include/asm/bitops.h | 32 ++++++------ arch/hexagon/include/asm/bitops.h | 24 +++++---- arch/ia64/include/asm/bitops.h | 42 ++++++++------- arch/m68k/include/asm/bitops.h | 49 +++++++++++------ arch/s390/include/asm/bitops.h | 61 +++++++++++----------- arch/sh/include/asm/bitops-op32.h | 34 +++++++----- arch/x86/include/asm/bitops.h | 22 ++++---- include/asm-generic/bitops/generic-non-atomic.h | 24 ++++----- .../asm-generic/bitops/instrumented-non-atomic.h | 21 +++++--- include/asm-generic/bitops/non-atomic.h | 13 +---- .../bitops/non-instrumented-non-atomic.h | 16 ++++++ include/linux/bitops.h | 17 ++++++ tools/include/asm-generic/bitops/non-atomic.h | 24 +++++---- 13 files changed, 229 insertions(+), 150 deletions(-) create mode 100644 include/asm-generic/bitops/non-instrumented-non-atomic.h (limited to 'tools') diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index e1d8483a45f2..492c7713ddae 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -46,8 +46,8 @@ set_bit(unsigned long nr, volatile void * addr) /* * WARNING: non atomic version. */ -static inline void -__set_bit(unsigned long nr, volatile void * addr) +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { int *m = ((int *) addr) + (nr >> 5); @@ -82,8 +82,8 @@ clear_bit_unlock(unsigned long nr, volatile void * addr) /* * WARNING: non atomic version. */ -static __inline__ void -__clear_bit(unsigned long nr, volatile void * addr) +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { int *m = ((int *) addr) + (nr >> 5); @@ -94,7 +94,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile void * addr) { smp_mb(); - __clear_bit(nr, addr); + arch___clear_bit(nr, addr); } static inline void @@ -118,8 +118,8 @@ change_bit(unsigned long nr, volatile void * addr) /* * WARNING: non atomic version. */ -static __inline__ void -__change_bit(unsigned long nr, volatile void * addr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { int *m = ((int *) addr) + (nr >> 5); @@ -186,8 +186,8 @@ test_and_set_bit_lock(unsigned long nr, volatile void *addr) /* * WARNING: non atomic version. */ -static inline int -__test_and_set_bit(unsigned long nr, volatile void * addr) +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); int *m = ((int *) addr) + (nr >> 5); @@ -230,8 +230,8 @@ test_and_clear_bit(unsigned long nr, volatile void * addr) /* * WARNING: non atomic version. 
*/ -static inline int -__test_and_clear_bit(unsigned long nr, volatile void * addr) +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); int *m = ((int *) addr) + (nr >> 5); @@ -272,8 +272,8 @@ test_and_change_bit(unsigned long nr, volatile void * addr) /* * WARNING: non atomic version. */ -static __inline__ int -__test_and_change_bit(unsigned long nr, volatile void * addr) +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); int *m = ((int *) addr) + (nr >> 5); @@ -283,8 +283,8 @@ __test_and_change_bit(unsigned long nr, volatile void * addr) return (old & mask) != 0; } -static inline int -test_bit(int nr, const volatile void * addr) +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { return (1UL & (((const int *) addr)[nr >> 5] >> (nr & 31))) != 0UL; } @@ -450,6 +450,8 @@ sched_find_first_bit(const unsigned long b[2]) return __ffs(tmp) + ofs; } +#include + #include #include diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 75d6ba3643b8..da500471ac73 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -127,38 +127,45 @@ static inline void change_bit(int nr, volatile void *addr) * be atomic, particularly for things like slab_lock and slab_unlock. * */ -static inline void __clear_bit(int nr, volatile unsigned long *addr) +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { test_and_clear_bit(nr, addr); } -static inline void __set_bit(int nr, volatile unsigned long *addr) +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { test_and_set_bit(nr, addr); } -static inline void __change_bit(int nr, volatile unsigned long *addr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { test_and_change_bit(nr, addr); } /* Apparently, at least some of these are allowed to be non-atomic */ -static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { return test_and_clear_bit(nr, addr); } -static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); } -static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { return test_and_change_bit(nr, addr); } -static inline int __test_bit(int nr, const volatile unsigned long *addr) +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { int retval; @@ -172,8 +179,6 @@ static inline int __test_bit(int nr, const volatile unsigned long *addr) return retval; } -#define test_bit(nr, addr) __test_bit(nr, addr) - /* * ffz - find first zero in word. 
* @word: The word to search @@ -271,6 +276,7 @@ static inline unsigned long __fls(unsigned long word) } #include +#include #include #include diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h index 577be93c0818..9f62af7fd7c4 100644 --- a/arch/ia64/include/asm/bitops.h +++ b/arch/ia64/include/asm/bitops.h @@ -53,7 +53,7 @@ set_bit (int nr, volatile void *addr) } /** - * __set_bit - Set a bit in memory + * arch___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @@ -61,8 +61,8 @@ set_bit (int nr, volatile void *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static __inline__ void -__set_bit (int nr, volatile void *addr) +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { *((__u32 *) addr + (nr >> 5)) |= (1 << (nr & 31)); } @@ -135,7 +135,7 @@ __clear_bit_unlock(int nr, void *addr) } /** - * __clear_bit - Clears a bit in memory (non-atomic version) + * arch___clear_bit - Clears a bit in memory (non-atomic version) * @nr: the bit to clear * @addr: the address to start counting from * @@ -143,8 +143,8 @@ __clear_bit_unlock(int nr, void *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static __inline__ void -__clear_bit (int nr, volatile void *addr) +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { *((__u32 *) addr + (nr >> 5)) &= ~(1 << (nr & 31)); } @@ -175,7 +175,7 @@ change_bit (int nr, volatile void *addr) } /** - * __change_bit - Toggle a bit in memory + * arch___change_bit - Toggle a bit in memory * @nr: the bit to toggle * @addr: the address to start counting from * @@ -183,8 +183,8 @@ change_bit (int nr, volatile void *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static __inline__ void -__change_bit (int nr, volatile void *addr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { *((__u32 *) addr + (nr >> 5)) ^= (1 << (nr & 31)); } @@ -224,7 +224,7 @@ test_and_set_bit (int nr, volatile void *addr) #define test_and_set_bit_lock test_and_set_bit /** - * __test_and_set_bit - Set a bit and return its old value + * arch___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * @@ -232,8 +232,8 @@ test_and_set_bit (int nr, volatile void *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static __inline__ int -__test_and_set_bit (int nr, volatile void *addr) +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { __u32 *p = (__u32 *) addr + (nr >> 5); __u32 m = 1 << (nr & 31); @@ -269,7 +269,7 @@ test_and_clear_bit (int nr, volatile void *addr) } /** - * __test_and_clear_bit - Clear a bit and return its old value + * arch___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * @@ -277,8 +277,8 @@ test_and_clear_bit (int nr, volatile void *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. 
*/ -static __inline__ int -__test_and_clear_bit(int nr, volatile void * addr) +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { __u32 *p = (__u32 *) addr + (nr >> 5); __u32 m = 1 << (nr & 31); @@ -314,14 +314,14 @@ test_and_change_bit (int nr, volatile void *addr) } /** - * __test_and_change_bit - Change a bit and return its old value + * arch___test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This operation is non-atomic and can be reordered. */ -static __inline__ int -__test_and_change_bit (int nr, void *addr) +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { __u32 old, bit = (1 << (nr & 31)); __u32 *m = (__u32 *) addr + (nr >> 5); @@ -331,8 +331,8 @@ __test_and_change_bit (int nr, void *addr) return (old & bit) != 0; } -static __inline__ int -test_bit (int nr, const volatile void *addr) +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { return 1 & (((const volatile __u32 *) addr)[nr >> 5] >> (nr & 31)); } @@ -443,6 +443,8 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x) #ifdef __KERNEL__ +#include + #include #include diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 51283db53667..71495faf2a90 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -65,8 +65,11 @@ static inline void bfset_mem_set_bit(int nr, volatile unsigned long *vaddr) bfset_mem_set_bit(nr, vaddr)) #endif -#define __set_bit(nr, vaddr) set_bit(nr, vaddr) - +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) +{ + set_bit(nr, addr); +} static inline void bclr_reg_clear_bit(int nr, volatile unsigned long *vaddr) { @@ -105,8 +108,11 @@ static inline void bfclr_mem_clear_bit(int nr, volatile unsigned long *vaddr) bfclr_mem_clear_bit(nr, vaddr)) #endif -#define __clear_bit(nr, vaddr) clear_bit(nr, vaddr) - +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) +{ + clear_bit(nr, addr); +} static inline void bchg_reg_change_bit(int nr, volatile unsigned long *vaddr) { @@ -145,14 +151,17 @@ static inline void bfchg_mem_change_bit(int nr, volatile unsigned long *vaddr) bfchg_mem_change_bit(nr, vaddr)) #endif -#define __change_bit(nr, vaddr) change_bit(nr, vaddr) - - -static inline int test_bit(int nr, const volatile unsigned long *vaddr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { - return (vaddr[nr >> 5] & (1UL << (nr & 31))) != 0; + change_bit(nr, addr); } +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) +{ + return (addr[nr >> 5] & (1UL << (nr & 31))) != 0; +} static inline int bset_reg_test_and_set_bit(int nr, volatile unsigned long *vaddr) @@ -201,8 +210,11 @@ static inline int bfset_mem_test_and_set_bit(int nr, bfset_mem_test_and_set_bit(nr, vaddr)) #endif -#define __test_and_set_bit(nr, vaddr) test_and_set_bit(nr, vaddr) - +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +{ + return test_and_set_bit(nr, addr); +} static inline int bclr_reg_test_and_clear_bit(int nr, volatile unsigned long *vaddr) @@ -251,8 +263,11 @@ static inline int bfclr_mem_test_and_clear_bit(int nr, bfclr_mem_test_and_clear_bit(nr, vaddr)) #endif -#define __test_and_clear_bit(nr, vaddr) test_and_clear_bit(nr, 
vaddr) - +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) +{ + return test_and_clear_bit(nr, addr); +} static inline int bchg_reg_test_and_change_bit(int nr, volatile unsigned long *vaddr) @@ -301,8 +316,11 @@ static inline int bfchg_mem_test_and_change_bit(int nr, bfchg_mem_test_and_change_bit(nr, vaddr)) #endif -#define __test_and_change_bit(nr, vaddr) test_and_change_bit(nr, vaddr) - +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) +{ + return test_and_change_bit(nr, addr); +} /* * The true 68020 and more advanced processors support the "bfffo" @@ -522,6 +540,7 @@ static inline int __fls(int x) #define clear_bit_unlock clear_bit #define __clear_bit_unlock clear_bit_unlock +#include #include #include #include diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 191dc7898b0f..9a7d15da966e 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -113,75 +113,76 @@ static inline bool arch_test_and_change_bit(unsigned long nr, return old & mask; } -static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr) +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long *addr = __bitops_word(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); unsigned long mask = __bitops_mask(nr); - *addr |= mask; + *p |= mask; } -static inline void arch___clear_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long *addr = __bitops_word(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); unsigned long mask = __bitops_mask(nr); - *addr &= ~mask; + *p &= ~mask; } -static inline void arch___change_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long *addr = __bitops_word(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); unsigned long mask = __bitops_mask(nr); - *addr ^= mask; + *p ^= mask; } -static inline bool arch___test_and_set_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long *addr = __bitops_word(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); unsigned long mask = __bitops_mask(nr); unsigned long old; - old = *addr; - *addr |= mask; + old = *p; + *p |= mask; return old & mask; } -static inline bool arch___test_and_clear_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long *addr = __bitops_word(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); unsigned long mask = __bitops_mask(nr); unsigned long old; - old = *addr; - *addr &= ~mask; + old = *p; + *p &= ~mask; return old & mask; } -static inline bool arch___test_and_change_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long *addr = __bitops_word(nr, ptr); + unsigned long *p = __bitops_word(nr, addr); unsigned long mask = __bitops_mask(nr); unsigned long old; - old = *addr; - *addr ^= mask; + old = *p; + *p ^= mask; return old & mask; } -static inline bool arch_test_bit(unsigned long nr, - const volatile unsigned 
long *ptr) +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { - const volatile unsigned long *addr = __bitops_word(nr, ptr); + const volatile unsigned long *p = __bitops_word(nr, addr); unsigned long mask = __bitops_mask(nr); - return *addr & mask; + return *p & mask; } static inline bool arch_test_and_set_bit_lock(unsigned long nr, diff --git a/arch/sh/include/asm/bitops-op32.h b/arch/sh/include/asm/bitops-op32.h index cfe5465acce7..565a85d8b7fb 100644 --- a/arch/sh/include/asm/bitops-op32.h +++ b/arch/sh/include/asm/bitops-op32.h @@ -2,6 +2,8 @@ #ifndef __ASM_SH_BITOPS_OP32_H #define __ASM_SH_BITOPS_OP32_H +#include + /* * The bit modifying instructions on SH-2A are only capable of working * with a 3-bit immediate, which signifies the shift position for the bit @@ -16,7 +18,8 @@ #define BYTE_OFFSET(nr) ((nr) % BITS_PER_BYTE) #endif -static inline void __set_bit(int nr, volatile unsigned long *addr) +static __always_inline void +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { __asm__ __volatile__ ( @@ -33,7 +36,8 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) } } -static inline void __clear_bit(int nr, volatile unsigned long *addr) +static __always_inline void +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { __asm__ __volatile__ ( @@ -52,7 +56,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) } /** - * __change_bit - Toggle a bit in memory + * arch___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * @@ -60,7 +64,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile unsigned long *addr) +static __always_inline void +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { __asm__ __volatile__ ( @@ -79,7 +84,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) } /** - * __test_and_set_bit - Set a bit and return its old value + * arch___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * @@ -87,7 +92,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -98,7 +104,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) } /** - * __test_and_clear_bit - Clear a bit and return its old value + * arch___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * @@ -106,7 +112,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. 
*/ -static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +static __always_inline bool +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -117,8 +124,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(int nr, - volatile unsigned long *addr) +static __always_inline bool +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -129,13 +136,16 @@ static inline int __test_and_change_bit(int nr, } /** - * test_bit - Determine whether a bit is set + * arch_test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ -static inline int test_bit(int nr, const volatile unsigned long *addr) +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } +#include + #endif /* __ASM_SH_BITOPS_OP32_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a288ecd230ab..973c6bd17f98 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -63,7 +63,7 @@ arch_set_bit(long nr, volatile unsigned long *addr) } static __always_inline void -arch___set_bit(long nr, volatile unsigned long *addr) +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } @@ -89,7 +89,7 @@ arch_clear_bit_unlock(long nr, volatile unsigned long *addr) } static __always_inline void -arch___clear_bit(long nr, volatile unsigned long *addr) +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } @@ -114,7 +114,7 @@ arch___clear_bit_unlock(long nr, volatile unsigned long *addr) } static __always_inline void -arch___change_bit(long nr, volatile unsigned long *addr) +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } @@ -145,7 +145,7 @@ arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) } static __always_inline bool -arch___test_and_set_bit(long nr, volatile unsigned long *addr) +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; @@ -171,7 +171,7 @@ arch_test_and_clear_bit(long nr, volatile unsigned long *addr) * this without also updating arch/x86/kernel/kvm.c */ static __always_inline bool -arch___test_and_clear_bit(long nr, volatile unsigned long *addr) +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; @@ -183,7 +183,7 @@ arch___test_and_clear_bit(long nr, volatile unsigned long *addr) } static __always_inline bool -arch___test_and_change_bit(long nr, volatile unsigned long *addr) +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; @@ -219,10 +219,12 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l return oldbit; } -#define arch_test_bit(nr, addr) \ - (__builtin_constant_p((nr)) \ - ? 
constant_test_bit((nr), (addr)) \ - : variable_test_bit((nr), (addr))) +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) +{ + return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) : + variable_test_bit(nr, addr); +} /** * __ffs - find first set bit in word diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h index 7226488810e5..b85b8a2ac239 100644 --- a/include/asm-generic/bitops/generic-non-atomic.h +++ b/include/asm-generic/bitops/generic-non-atomic.h @@ -24,7 +24,7 @@ * may be that only one operation succeeds. */ static __always_inline void -generic___set_bit(unsigned int nr, volatile unsigned long *addr) +generic___set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -33,7 +33,7 @@ generic___set_bit(unsigned int nr, volatile unsigned long *addr) } static __always_inline void -generic___clear_bit(unsigned int nr, volatile unsigned long *addr) +generic___clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -50,8 +50,8 @@ generic___clear_bit(unsigned int nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static __always_inline -void generic___change_bit(unsigned int nr, volatile unsigned long *addr) +static __always_inline void +generic___change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -68,8 +68,8 @@ void generic___change_bit(unsigned int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static __always_inline int -generic___test_and_set_bit(unsigned int nr, volatile unsigned long *addr) +static __always_inline bool +generic___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -88,8 +88,8 @@ generic___test_and_set_bit(unsigned int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static __always_inline int -generic___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr) +static __always_inline bool +generic___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -100,8 +100,8 @@ generic___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! 
*/ -static __always_inline int -generic___test_and_change_bit(unsigned int nr, volatile unsigned long *addr) +static __always_inline bool +generic___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -116,8 +116,8 @@ generic___test_and_change_bit(unsigned int nr, volatile unsigned long *addr) * @nr: bit number to test * @addr: Address to start counting from */ -static __always_inline int -generic_test_bit(unsigned int nr, const volatile unsigned long *addr) +static __always_inline bool +generic_test_bit(unsigned long nr, const volatile unsigned long *addr) { /* * Unlike the bitops with the '__' prefix above, this one *is* atomic, diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h index 7ab1ecc37782..b019f77ef21c 100644 --- a/include/asm-generic/bitops/instrumented-non-atomic.h +++ b/include/asm-generic/bitops/instrumented-non-atomic.h @@ -22,7 +22,8 @@ * region of memory concurrently, the effect may be that only one operation * succeeds. */ -static __always_inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void +__set_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___set_bit(nr, addr); @@ -37,7 +38,8 @@ static __always_inline void __set_bit(long nr, volatile unsigned long *addr) * region of memory concurrently, the effect may be that only one operation * succeeds. */ -static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void +__clear_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit(nr, addr); @@ -52,7 +54,8 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) * region of memory concurrently, the effect may be that only one operation * succeeds. */ -static __always_inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void +__change_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___change_bit(nr, addr); @@ -90,7 +93,8 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ -static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +__test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_set_bit(nr, addr); @@ -104,7 +108,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ -static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_clear_bit(nr, addr); @@ -118,7 +123,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. 
*/ -static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool +__test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_change_bit(nr, addr); @@ -129,7 +135,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon * @nr: bit number to test * @addr: Address to start counting from */ -static __always_inline bool test_bit(long nr, const volatile unsigned long *addr) +static __always_inline bool +test_bit(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit(nr, addr); diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h index 23d3abc1e10d..5c37ced343ae 100644 --- a/include/asm-generic/bitops/non-atomic.h +++ b/include/asm-generic/bitops/non-atomic.h @@ -5,24 +5,15 @@ #include <asm-generic/bitops/generic-non-atomic.h> #define arch___set_bit generic___set_bit -#define __set_bit arch___set_bit - #define arch___clear_bit generic___clear_bit -#define __clear_bit arch___clear_bit - #define arch___change_bit generic___change_bit -#define __change_bit arch___change_bit #define arch___test_and_set_bit generic___test_and_set_bit -#define __test_and_set_bit arch___test_and_set_bit - #define arch___test_and_clear_bit generic___test_and_clear_bit -#define __test_and_clear_bit arch___test_and_clear_bit - #define arch___test_and_change_bit generic___test_and_change_bit -#define __test_and_change_bit arch___test_and_change_bit #define arch_test_bit generic_test_bit -#define test_bit arch_test_bit + +#include <asm-generic/bitops/non-instrumented-non-atomic.h> #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h new file mode 100644 index 000000000000..e0fd7bf72a56 --- /dev/null +++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H +#define __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H + +#define __set_bit arch___set_bit +#define __clear_bit arch___clear_bit +#define __change_bit arch___change_bit + +#define __test_and_set_bit arch___test_and_set_bit +#define __test_and_clear_bit arch___test_and_clear_bit +#define __test_and_change_bit arch___test_and_change_bit + +#define test_bit arch_test_bit + +#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */ diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 7aaed501f768..87087454a288 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -26,12 +26,29 @@ extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); +#include <asm-generic/bitops/generic-non-atomic.h> + /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> +/* Check that the bitops prototypes are sane */ +#define __check_bitop_pr(name) \ + static_assert(__same_type(arch_##name, generic_##name) && \ + __same_type(name, generic_##name)) + +__check_bitop_pr(__set_bit); +__check_bitop_pr(__clear_bit); +__check_bitop_pr(__change_bit); +__check_bitop_pr(__test_and_set_bit); +__check_bitop_pr(__test_and_clear_bit); +__check_bitop_pr(__test_and_change_bit); +__check_bitop_pr(test_bit); + +#undef __check_bitop_pr + static inline int get_bitmask_order(unsigned int count) { int order; diff --git a/tools/include/asm-generic/bitops/non-atomic.h
b/tools/include/asm-generic/bitops/non-atomic.h index 7e10c4b50c5d..e5e78e42e57b 100644 --- a/tools/include/asm-generic/bitops/non-atomic.h +++ b/tools/include/asm-generic/bitops/non-atomic.h @@ -2,7 +2,7 @@ #ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ -#include <asm/types.h> +#include <linux/bits.h> /** * __set_bit - Set a bit in memory @@ -13,7 +13,8 @@ * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile unsigned long *addr) +static __always_inline void +__set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -21,7 +22,8 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) *p |= mask; } -static inline void __clear_bit(int nr, volatile unsigned long *addr) +static __always_inline void +__clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -38,7 +40,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile unsigned long *addr) +static __always_inline void +__change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -55,7 +58,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +static __always_inline bool +__test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -74,7 +78,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +static __always_inline bool +__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -85,8 +90,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered!
*/ -static inline int __test_and_change_bit(int nr, - volatile unsigned long *addr) +static __always_inline bool +__test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -101,7 +106,8 @@ static inline int __test_and_change_bit(int nr, * @nr: bit number to test * @addr: Address to start counting from */ -static inline int test_bit(int nr, const volatile unsigned long *addr) +static __always_inline bool +test_bit(unsigned long nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } -- cgit v1.2.3 From e69eb9c460f128b71c6b995d75a05244e4b6cc3e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 24 Jun 2022 14:13:09 +0200 Subject: bitops: wrap non-atomic bitops with a transparent macro In preparation for altering the non-atomic bitops with a macro, wrap them in a transparent definition. This requires prepending one more '_' to their names in order to be able to do that seamlessly. It is a simple change, given that all the non-prefixed definitions are now in asm-generic. sparc32 already has several triple-underscored functions, so I had to rename them ('___' -> 'sp32_'). Signed-off-by: Alexander Lobakin Reviewed-by: Marco Elver Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- arch/sparc/include/asm/bitops_32.h | 18 +++++++------- arch/sparc/lib/atomic32.c | 12 +++++----- .../asm-generic/bitops/instrumented-non-atomic.h | 28 +++++++++++----------- .../bitops/non-instrumented-non-atomic.h | 14 +++++------ include/linux/bitops.h | 18 +++++++++++++- tools/include/asm-generic/bitops/non-atomic.h | 24 +++++++++---------- tools/include/linux/bitops.h | 16 +++++++++++++ 7 files changed, 81 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h index 889afa9f990f..3448c191b484 100644 --- a/arch/sparc/include/asm/bitops_32.h +++ b/arch/sparc/include/asm/bitops_32.h @@ -19,9 +19,9 @@ #error only can be included directly #endif -unsigned long ___set_bit(unsigned long *addr, unsigned long mask); -unsigned long ___clear_bit(unsigned long *addr, unsigned long mask); -unsigned long ___change_bit(unsigned long *addr, unsigned long mask); +unsigned long sp32___set_bit(unsigned long *addr, unsigned long mask); +unsigned long sp32___clear_bit(unsigned long *addr, unsigned long mask); +unsigned long sp32___change_bit(unsigned long *addr, unsigned long mask); /* * Set bit 'nr' in 32-bit quantity at address 'addr' where bit '0' @@ -36,7 +36,7 @@ static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *add ADDR = ((unsigned long *) addr) + (nr >> 5); mask = 1 << (nr & 31); - return ___set_bit(ADDR, mask) != 0; + return sp32___set_bit(ADDR, mask) != 0; } static inline void set_bit(unsigned long nr, volatile unsigned long *addr) @@ -46,7 +46,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *addr) ADDR = ((unsigned long *) addr) + (nr >> 5); mask = 1 << (nr & 31); - (void) ___set_bit(ADDR, mask); + (void) sp32___set_bit(ADDR, mask); } static inline int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) @@ -56,7 +56,7 @@ static inline int test_and_clear_bit(unsigned long nr, volatile unsigned long *a ADDR = ((unsigned long *) addr) + (nr >> 5); mask = 1 << (nr & 31); - return ___clear_bit(ADDR, mask) != 0; + return sp32___clear_bit(ADDR, mask) != 0; } static inline void clear_bit(unsigned long nr, 
volatile unsigned long *addr) @@ -66,7 +66,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *addr) ADDR = ((unsigned long *) addr) + (nr >> 5); mask = 1 << (nr & 31); - (void) ___clear_bit(ADDR, mask); + (void) sp32___clear_bit(ADDR, mask); } static inline int test_and_change_bit(unsigned long nr, volatile unsigned long *addr) @@ -76,7 +76,7 @@ static inline int test_and_change_bit(unsigned long nr, volatile unsigned long * ADDR = ((unsigned long *) addr) + (nr >> 5); mask = 1 << (nr & 31); - return ___change_bit(ADDR, mask) != 0; + return sp32___change_bit(ADDR, mask) != 0; } static inline void change_bit(unsigned long nr, volatile unsigned long *addr) @@ -86,7 +86,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr) ADDR = ((unsigned long *) addr) + (nr >> 5); mask = 1 << (nr & 31); - (void) ___change_bit(ADDR, mask); + (void) sp32___change_bit(ADDR, mask); } #include diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index 8b81d0f00c97..cf80d1ae352b 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -120,7 +120,7 @@ void arch_atomic_set(atomic_t *v, int i) } EXPORT_SYMBOL(arch_atomic_set); -unsigned long ___set_bit(unsigned long *addr, unsigned long mask) +unsigned long sp32___set_bit(unsigned long *addr, unsigned long mask) { unsigned long old, flags; @@ -131,9 +131,9 @@ unsigned long ___set_bit(unsigned long *addr, unsigned long mask) return old & mask; } -EXPORT_SYMBOL(___set_bit); +EXPORT_SYMBOL(sp32___set_bit); -unsigned long ___clear_bit(unsigned long *addr, unsigned long mask) +unsigned long sp32___clear_bit(unsigned long *addr, unsigned long mask) { unsigned long old, flags; @@ -144,9 +144,9 @@ unsigned long ___clear_bit(unsigned long *addr, unsigned long mask) return old & mask; } -EXPORT_SYMBOL(___clear_bit); +EXPORT_SYMBOL(sp32___clear_bit); -unsigned long ___change_bit(unsigned long *addr, unsigned long mask) +unsigned long sp32___change_bit(unsigned long *addr, unsigned long mask) { unsigned long old, flags; @@ -157,7 +157,7 @@ unsigned long ___change_bit(unsigned long *addr, unsigned long mask) return old & mask; } -EXPORT_SYMBOL(___change_bit); +EXPORT_SYMBOL(sp32___change_bit); unsigned long __cmpxchg_u32(volatile u32 *ptr, u32 old, u32 new) { diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h index b019f77ef21c..988a3bbfba34 100644 --- a/include/asm-generic/bitops/instrumented-non-atomic.h +++ b/include/asm-generic/bitops/instrumented-non-atomic.h @@ -14,7 +14,7 @@ #include /** - * __set_bit - Set a bit in memory + * ___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @@ -23,14 +23,14 @@ * succeeds. */ static __always_inline void -__set_bit(unsigned long nr, volatile unsigned long *addr) +___set_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___set_bit(nr, addr); } /** - * __clear_bit - Clears a bit in memory + * ___clear_bit - Clears a bit in memory * @nr: the bit to clear * @addr: the address to start counting from * @@ -39,14 +39,14 @@ __set_bit(unsigned long nr, volatile unsigned long *addr) * succeeds. 
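 *
 * Note (illustrative, not part of the original patch): these instrumented
 * wrappers only add the instrument_write()/__instrument_read_write_bitop()
 * calls that KASAN/KCSAN hook into; the bit manipulation itself still
 * happens in the arch___*() implementations they forward to.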
*/ static __always_inline void -__clear_bit(unsigned long nr, volatile unsigned long *addr) +___clear_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit(nr, addr); } /** - * __change_bit - Toggle a bit in memory + * ___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * @@ -55,7 +55,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr) * succeeds. */ static __always_inline void -__change_bit(unsigned long nr, volatile unsigned long *addr) +___change_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___change_bit(nr, addr); @@ -86,7 +86,7 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi } /** - * __test_and_set_bit - Set a bit and return its old value + * ___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * @@ -94,14 +94,14 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi * can appear to succeed but actually fail. */ static __always_inline bool -__test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_set_bit(nr, addr); } /** - * __test_and_clear_bit - Clear a bit and return its old value + * ___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * @@ -109,14 +109,14 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr) * can appear to succeed but actually fail. */ static __always_inline bool -__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) +___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_clear_bit(nr, addr); } /** - * __test_and_change_bit - Change a bit and return its old value + * ___test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * @@ -124,19 +124,19 @@ __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) * can appear to succeed but actually fail. 
*/ static __always_inline bool -__test_and_change_bit(unsigned long nr, volatile unsigned long *addr) +___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_change_bit(nr, addr); } /** - * test_bit - Determine whether a bit is set + * _test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool -test_bit(unsigned long nr, const volatile unsigned long *addr) +_test_bit(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit(nr, addr); diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h index e0fd7bf72a56..bdb9b1ffaee9 100644 --- a/include/asm-generic/bitops/non-instrumented-non-atomic.h +++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h @@ -3,14 +3,14 @@ #ifndef __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H #define __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H -#define __set_bit arch___set_bit -#define __clear_bit arch___clear_bit -#define __change_bit arch___change_bit +#define ___set_bit arch___set_bit +#define ___clear_bit arch___clear_bit +#define ___change_bit arch___change_bit -#define __test_and_set_bit arch___test_and_set_bit -#define __test_and_clear_bit arch___test_and_clear_bit -#define __test_and_change_bit arch___test_and_change_bit +#define ___test_and_set_bit arch___test_and_set_bit +#define ___test_and_clear_bit arch___test_and_clear_bit +#define ___test_and_change_bit arch___test_and_change_bit -#define test_bit arch_test_bit +#define _test_bit arch_test_bit #endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */ diff --git a/include/linux/bitops.h b/include/linux/bitops.h index d393297287d5..3c3afbae1533 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -26,8 +26,24 @@ extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); +/* + * Defined here because those may be needed by architecture-specific static + * inlines. 
+ */ + #include +#define bitop(op, nr, addr) \ + op(nr, addr) + +#define __set_bit(nr, addr) bitop(___set_bit, nr, addr) +#define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) +#define __change_bit(nr, addr) bitop(___change_bit, nr, addr) +#define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) +#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) +#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) +#define test_bit(nr, addr) bitop(_test_bit, nr, addr) + /* * Include this here because some architectures need generic_ffs/fls in * scope @@ -38,7 +54,7 @@ extern unsigned long __sw_hweight64(__u64 w); #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ - __same_type(name, generic_##name)) + __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h index e5e78e42e57b..0c472a833408 100644 --- a/tools/include/asm-generic/bitops/non-atomic.h +++ b/tools/include/asm-generic/bitops/non-atomic.h @@ -5,7 +5,7 @@ #include /** - * __set_bit - Set a bit in memory + * ___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @@ -14,7 +14,7 @@ * may be that only one operation succeeds. */ static __always_inline void -__set_bit(unsigned long nr, volatile unsigned long *addr) +___set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -23,7 +23,7 @@ __set_bit(unsigned long nr, volatile unsigned long *addr) } static __always_inline void -__clear_bit(unsigned long nr, volatile unsigned long *addr) +___clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -32,7 +32,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr) } /** - * __change_bit - Toggle a bit in memory + * ___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * @@ -41,7 +41,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr) * may be that only one operation succeeds. */ static __always_inline void -__change_bit(unsigned long nr, volatile unsigned long *addr) +___change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -50,7 +50,7 @@ __change_bit(unsigned long nr, volatile unsigned long *addr) } /** - * __test_and_set_bit - Set a bit and return its old value + * ___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * @@ -59,7 +59,7 @@ __change_bit(unsigned long nr, volatile unsigned long *addr) * but actually fail. You must protect multiple accesses with a lock. 
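 *
 * Usage note (illustrative, not part of the original patch): callers keep
 * writing __test_and_set_bit(nr, addr); the bitop() wrapper macros added to
 * tools/include/linux/bitops.h below expand that name to this
 * triple-underscored implementation.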
*/ static __always_inline bool -__test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -70,7 +70,7 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr) } /** - * __test_and_clear_bit - Clear a bit and return its old value + * ___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * @@ -79,7 +79,7 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr) * but actually fail. You must protect multiple accesses with a lock. */ static __always_inline bool -__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) +___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -91,7 +91,7 @@ __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) /* WARNING: non atomic and it can be reordered! */ static __always_inline bool -__test_and_change_bit(unsigned long nr, volatile unsigned long *addr) +___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); @@ -102,12 +102,12 @@ __test_and_change_bit(unsigned long nr, volatile unsigned long *addr) } /** - * test_bit - Determine whether a bit is set + * _test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool -test_bit(unsigned long nr, const volatile unsigned long *addr) +_test_bit(unsigned long nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index 5fca38fe1ba8..f18683b95ea6 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -25,6 +25,22 @@ extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); +/* + * Defined here because those may be needed by architecture-specific static + * inlines. + */ + +#define bitop(op, nr, addr) \ + op(nr, addr) + +#define __set_bit(nr, addr) bitop(___set_bit, nr, addr) +#define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) +#define __change_bit(nr, addr) bitop(___change_bit, nr, addr) +#define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) +#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) +#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) +#define test_bit(nr, addr) bitop(_test_bit, nr, addr) + /* * Include this here because some architectures need generic_ffs/fls in * scope -- cgit v1.2.3 From 1045a06724f322ed61f1ffb994427c7bdbe64647 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jun 2022 17:01:02 +0200 Subject: remove CONFIG_ANDROID The ANDROID config symbol is only used to guard the binder config symbol and to inject completely random config changes. Remove it as it is obviously a bad idea. Acked-by: Paul E. 
McKenney Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220629150102.1582425-2-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- drivers/Makefile | 2 +- drivers/android/Kconfig | 9 --------- kernel/configs/android-base.config | 1 - kernel/rcu/Kconfig.debug | 3 +-- tools/testing/selftests/filesystems/binderfs/config | 1 - tools/testing/selftests/sync/config | 1 - 6 files changed, 2 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/drivers/Makefile b/drivers/Makefile index 9a30842b22c5..123dce286758 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -176,7 +176,7 @@ obj-$(CONFIG_USB4) += thunderbolt/ obj-$(CONFIG_CORESIGHT) += hwtracing/coresight/ obj-y += hwtracing/intel_th/ obj-$(CONFIG_STM) += hwtracing/stm/ -obj-$(CONFIG_ANDROID) += android/ +obj-y += android/ obj-$(CONFIG_NVMEM) += nvmem/ obj-$(CONFIG_FPGA) += fpga/ obj-$(CONFIG_FSI) += fsi/ diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 53b22e26266c..07aa8ae0a058 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -1,13 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 menu "Android" -config ANDROID - bool "Android Drivers" - help - Enable support for various drivers needed on the Android platform - -if ANDROID - config ANDROID_BINDER_IPC bool "Android Binder IPC Driver" depends on MMU @@ -54,6 +47,4 @@ config ANDROID_BINDER_IPC_SELFTEST exhaustively with combinations of various buffer sizes and alignments. -endif # if ANDROID - endmenu diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index eb701b2ac72f..44b0f0146a3f 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -7,7 +7,6 @@ # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set -CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_LOW_MEMORY_KILLER=y diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b64e55d4f61..e875f4f88965 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -86,8 +86,7 @@ config RCU_EXP_CPU_STALL_TIMEOUT int "Expedited RCU CPU stall timeout in milliseconds" depends on RCU_STALL_COMMON range 0 21000 - default 20 if ANDROID - default 0 if !ANDROID + default 0 help If a given expedited RCU grace period extends more than the specified number of milliseconds, a CPU stall warning is printed. diff --git a/tools/testing/selftests/filesystems/binderfs/config b/tools/testing/selftests/filesystems/binderfs/config index 02dd6cc9cf99..7b4fc6ee6205 100644 --- a/tools/testing/selftests/filesystems/binderfs/config +++ b/tools/testing/selftests/filesystems/binderfs/config @@ -1,3 +1,2 @@ -CONFIG_ANDROID=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_BINDER_IPC=y diff --git a/tools/testing/selftests/sync/config b/tools/testing/selftests/sync/config index 47ff5afc3727..64c60f38b446 100644 --- a/tools/testing/selftests/sync/config +++ b/tools/testing/selftests/sync/config @@ -1,3 +1,2 @@ CONFIG_STAGING=y -CONFIG_ANDROID=y CONFIG_SW_SYNC=y -- cgit v1.2.3 From 9c154ab47f5e5ff632d2b7af6342c027d7e04b92 Mon Sep 17 00:00:00 2001 From: Alaa Mohamed Date: Thu, 30 Jun 2022 12:24:49 +0200 Subject: selftests: net: fib_rule_tests: fix support for running individual tests parsing and usage of -t got missed in the previous patch. this patch fixes it Fixes: 816cda9ae531 ("selftests: net: fib_rule_tests: add support to select a test to run") Signed-off-by: Alaa Mohamed Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/fib_rule_tests.sh | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index bbe3b379927a..c245476fa29d 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -303,6 +303,29 @@ run_fibrule_tests() log_section "IPv6 fib rule" fib_rule6_test } +################################################################################ +# usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -t <test> Test(s) to run (default: all) + (options: $TESTS) +EOF +} + +################################################################################ +# main + +while getopts ":t:h" opt; do + case $opt in + t) TESTS=$OPTARG;; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done if [ "$(id -u)" -ne 0 ];then echo "SKIP: Need root privileges" -- cgit v1.2.3 From dbdd9a28e1406ab8218a69e60f10a168b968c81d Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Fri, 1 Jul 2022 17:13:45 +0800 Subject: net/cmsg_sender: Remove a semicolon Remove the repeated ';' from code. Signed-off-by: Li kunyu Signed-off-by: David S. Miller --- tools/testing/selftests/net/cmsg_sender.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index bc2162909a1a..75dd83e39207 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -456,7 +456,7 @@ int main(int argc, char *argv[]) buf[1] = 0; } else if (opt.sock.type == SOCK_RAW) { struct udphdr hdr = { 1, 2, htons(opt.size), 0 }; - struct sockaddr_in6 *sin6 = (void *)ai->ai_addr;; + struct sockaddr_in6 *sin6 = (void *)ai->ai_addr; memcpy(buf, &hdr, sizeof(hdr)); sin6->sin6_port = htons(opt.sock.proto); -- cgit v1.2.3 From b0d93b44641a83c28014ca38001e85bf6dc8501e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 30 Jun 2022 15:42:03 -0700 Subject: selftests/bpf: Skip lsm_cgroup when we don't have trampolines With arch_prepare_bpf_trampoline removed on x86: [...]
#98/1 lsm_cgroup/functional:SKIP #98 lsm_cgroup:SKIP Summary: 1/0 PASSED, 1 SKIPPED, 0 FAILED Fixes: dca85aac8895 ("selftests/bpf: lsm_cgroup functional test") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20220630224203.512815-1-sdf@google.com --- tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c index d40810a742fa..c542d7e80a5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -9,6 +9,10 @@ #include "cgroup_helpers.h" #include "network_helpers.h" +#ifndef ENOTSUPP +#define ENOTSUPP 524 +#endif + static struct btf *btf; static __u32 query_prog_cnt(int cgroup_fd, const char *attach_func) @@ -100,6 +104,10 @@ static void test_lsm_cgroup_functional(void) ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 0, "prog count"); ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 0, "total prog count"); err = bpf_prog_attach(alloc_prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0); + if (err == -ENOTSUPP) { + test__skip(); + goto close_cgroup; + } if (!ASSERT_OK(err, "attach alloc_prog_fd")) goto detach_cgroup; ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 1, "prog count"); -- cgit v1.2.3 From 73c4936f916de73fa3faec204a4deb37c25e18c1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 1 Jul 2022 14:47:26 +0200 Subject: bpf, selftests: Add verifier test case for imm=0,umin=0,umax=1 scalar Add a test case to trigger the constant scalar issue which leaves the register in scalar(imm=0,umin=0,umax=1,var_off=(0x0; 0x0)) state. Make use of dead code elimination, so that we can see the verifier bailing out on unfixed kernels. For the condition, we use jle given it checks on umax bound. Before: # ./test_verifier 743 #743/p jump & dead code elimination FAIL Failed to load prog 'Permission denied'! 
R4 !read_ok verification time 11 usec stack depth 0 processed 13 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1 Summary: 0 PASSED, 0 SKIPPED, 1 FAILED After: # ./test_verifier 743 #743/p jump & dead code elimination OK Summary: 1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220701124727.11153-3-daniel@iogearbox.net --- tools/testing/selftests/bpf/verifier/jump.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/jump.c b/tools/testing/selftests/bpf/verifier/jump.c index 6f951d1ff0a4..497fe17d2eaf 100644 --- a/tools/testing/selftests/bpf/verifier/jump.c +++ b/tools/testing/selftests/bpf/verifier/jump.c @@ -373,3 +373,25 @@ .result = ACCEPT, .retval = 3, }, +{ + "jump & dead code elimination", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_OR, BPF_REG_3, 32767), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_3, 0, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_3, 0x8000, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -32767), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_JMP_IMM(BPF_JLE, BPF_REG_3, 0, 1), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 2, +}, -- cgit v1.2.3 From a49b8ce7306cf8031361a6a4f7f6bc7a775a39c8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 1 Jul 2022 14:47:27 +0200 Subject: bpf, selftests: Add verifier test case for jmp32's jeq/jne Add a test case to trigger the verifier's incorrect conclusion in the case of jmp32's jeq/jne. Also here, make use of dead code elimination, so that we can see the verifier bailing out on unfixed kernels. Before: # ./test_verifier 724 #724/p jeq32/jne32: bounds checking FAIL Failed to load prog 'Permission denied'! 
R4 !read_ok verification time 8 usec stack depth 0 processed 8 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 0 Summary: 0 PASSED, 0 SKIPPED, 1 FAILED After: # ./test_verifier 724 #724/p jeq32/jne32: bounds checking OK Summary: 1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220701124727.11153-4-daniel@iogearbox.net --- tools/testing/selftests/bpf/verifier/jmp32.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index 6ddc418fdfaf..1a27a6210554 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -864,3 +864,24 @@ .result = ACCEPT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, +{ + "jeq32/jne32: bounds checking", + .insns = { + BPF_MOV64_IMM(BPF_REG_6, 563), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU32_REG(BPF_OR, BPF_REG_2, BPF_REG_6), + BPF_JMP32_IMM(BPF_JNE, BPF_REG_2, 8, 5), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_2, 500, 2), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, +}, -- cgit v1.2.3 From d28b25a62a47a8c8aa19bd543863aab6717e68c9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 30 Jun 2022 14:22:28 +0800 Subject: selftests/net: fix section name when using xdp_dummy.o Since commit 8fffa0e3451a ("selftests/bpf: Normalize XDP section names in selftests") the xdp_dummy.o's section name has changed to xdp. But some tests are still using "section xdp_dummy", which make the tests failed. Fix them by updating to the new section name. 
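For context: the section name that iproute2 matches against comes from the SEC() annotation inside the BPF program itself. A minimal sketch of such a pass-through program carrying the normalized section name (program layout assumed, following common libbpf conventions; not copied from the tree):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* After commit 8fffa0e3451a the section is plain "xdp", not "xdp_dummy". */
SEC("xdp")
int xdp_dummy_prog(struct xdp_md *ctx)
{
	/* Pass every packet through unmodified. */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";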
Fixes: 8fffa0e3451a ("selftests/bpf: Normalize XDP section names in selftests") Signed-off-by: Hangbin Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220630062228.3453016-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgro.sh | 2 +- tools/testing/selftests/net/udpgro_bench.sh | 2 +- tools/testing/selftests/net/udpgro_frglist.sh | 2 +- tools/testing/selftests/net/udpgro_fwd.sh | 2 +- tools/testing/selftests/net/veth.sh | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index f8a19f548ae9..ebbd0b282432 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -34,7 +34,7 @@ cfg_veth() { ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad ip -netns "${PEER_NS}" link set dev veth1 up - ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp_dummy + ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp } run_one() { diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh index 820bc50f6b68..fad2d1a71cac 100755 --- a/tools/testing/selftests/net/udpgro_bench.sh +++ b/tools/testing/selftests/net/udpgro_bench.sh @@ -34,7 +34,7 @@ run_one() { ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad ip -netns "${PEER_NS}" link set dev veth1 up - ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp_dummy + ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r & ip netns exec "${PEER_NS}" ./udpgso_bench_rx -t ${rx_args} -r & diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh index 807b74c8fd80..832c738cc3c2 100755 --- a/tools/testing/selftests/net/udpgro_frglist.sh +++ b/tools/testing/selftests/net/udpgro_frglist.sh @@ -36,7 +36,7 @@ run_one() { ip netns exec "${PEER_NS}" ethtool -K veth1 rx-gro-list on - ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp_dummy + ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp tc -n "${PEER_NS}" qdisc add dev veth1 clsact tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file ../bpf/nat6to4.o section schedcls/ingress6/nat_6 direct-action tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file ../bpf/nat6to4.o section schedcls/egress4/snat4 direct-action diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 6f05e06f6761..1bcd82e1f662 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -46,7 +46,7 @@ create_ns() { ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad done - ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null + ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp 2>/dev/null } create_vxlan_endpoint() { diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 19eac3e44c06..430895d1a2b6 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -289,14 +289,14 @@ if [ $CPUS -gt 1 ]; then ip netns exec $NS_SRC ethtool -L veth$SRC 
rx 1 tx 2 2>/dev/null printf "%-60s" "bad setting: XDP with RX nr less than TX" ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ - section xdp_dummy 2>/dev/null &&\ + section xdp 2>/dev/null &&\ echo "fail - set operation successful ?!?" || echo " ok " # the following tests will run with multiple channels active ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 ip netns exec $NS_DST ethtool -L veth$DST rx 2 ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ - section xdp_dummy 2>/dev/null + section xdp 2>/dev/null printf "%-60s" "bad setting: reducing RX nr below peer TX with XDP set" ip netns exec $NS_DST ethtool -L veth$DST rx 1 2>/dev/null &&\ echo "fail - set operation successful ?!?" || echo " ok " @@ -311,7 +311,7 @@ if [ $CPUS -gt 2 ]; then chk_channels "setting invalid channels nr" $DST 2 2 fi -ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp 2>/dev/null chk_gro_flag "with xdp attached - gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off chk_tso_flag " - tso flag" $SRC off -- cgit v1.2.3 From b7ecce6800eb1aa97c486c1aabf64659193d5a4c Mon Sep 17 00:00:00 2001 From: Zan Aziz Date: Fri, 1 Jul 2022 18:08:34 -0600 Subject: selftests:timers: globals don't need initialization to 0 Global variables do not need to be initialized to 0 and checkpatch flags this error in tools/testing/selftests/timers/alarmtimer-suspend.c: ERROR: do not initialise globals to 0 +int final_ret = 0; Fix this checkpatch error. Signed-off-by: Zan Aziz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/alarmtimer-suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index 54da4b088f4c..4332b494103d 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -92,7 +92,7 @@ long long timespec_sub(struct timespec a, struct timespec b) return ret; } -int final_ret = 0; +int final_ret; void sigalarm(int signo) { -- cgit v1.2.3 From d261ea23533b5da113b0779f32d39c3803dddb02 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:26 -0700 Subject: tools: add memcg_shrinker.py Add a simple tool which prints a sorted list of shrinker lists in the following format: (number of objects, shrinker name, cgroup). Example: $ ./memcg_shrinker.py -n 10 2090 sb-sysfs-26 /sys/fs/cgroup/system.slice 1809 sb-sysfs-26 /sys/fs/cgroup/system.slice/systemd-udevd.service 1044 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice/system-dbus\x2d:1.3\... 861 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice/system-dbus\x2d:1.3\... 
804 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice 643 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice/firewalld.service 616 sb-cgroup2-30 /sys/fs/cgroup/init.scope 275 sb-sysfs-26 / 238 sb-proc-25 /sys/fs/cgroup/system.slice/systemd-journald.service 225 sb-proc-25 /sys/fs/cgroup/system.slice/abrtd.service Link: https://lkml.kernel.org/r/20220601032227.4076670-6-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Cc: Muchun Song Signed-off-by: Andrew Morton --- tools/cgroup/memcg_shrinker.py | 71 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tools/cgroup/memcg_shrinker.py (limited to 'tools') diff --git a/tools/cgroup/memcg_shrinker.py b/tools/cgroup/memcg_shrinker.py new file mode 100644 index 000000000000..706ab27666a4 --- /dev/null +++ b/tools/cgroup/memcg_shrinker.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2022 Roman Gushchin +# Copyright (C) 2022 Meta + +import os +import argparse +import sys + + +def scan_cgroups(cgroup_root): + cgroups = {} + + for root, subdirs, _ in os.walk(cgroup_root): + for cgroup in subdirs: + path = os.path.join(root, cgroup) + ino = os.stat(path).st_ino + cgroups[ino] = path + + # (memcg ino, path) + return cgroups + + +def scan_shrinkers(shrinker_debugfs): + shrinkers = [] + + for root, subdirs, _ in os.walk(shrinker_debugfs): + for shrinker in subdirs: + count_path = os.path.join(root, shrinker, "count") + with open(count_path) as f: + for line in f.readlines(): + items = line.split(' ') + ino = int(items[0]) + # (count, shrinker, memcg ino) + shrinkers.append((int(items[1]), shrinker, ino)) + return shrinkers + + +def main(): + parser = argparse.ArgumentParser(description='Display biggest shrinkers') + parser.add_argument('-n', '--lines', type=int, help='Number of lines to print') + + args = parser.parse_args() + + cgroups = scan_cgroups("/sys/fs/cgroup/") + shrinkers = scan_shrinkers("/sys/kernel/debug/shrinker/") + shrinkers = sorted(shrinkers, reverse = True, key = lambda x: x[0]) + + n = 0 + for s in shrinkers: + count, name, ino = (s[0], s[1], s[2]) + if count == 0: + break + + if ino == 0 or ino == 1: + cg = "/" + else: + try: + cg = cgroups[ino] + except KeyError: + cg = "unknown (%d)" % ino + + print("%-8s %-20s %s" % (count, name, cg)) + + n += 1 + if args.lines and n >= args.lines: + break + + +if __name__ == '__main__': + main() -- cgit v1.2.3 From 50b0f797cab6cb7dff418777e1acf82dfd3568f8 Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Mon, 13 Jun 2022 23:33:21 +0000 Subject: userfaultfd: selftests: infinite loop in faulting_process On Android this test is getting stuck in an infinite loop due to indeterminate behavior: The local variables steps and signalled were being reset to 1 and 0 respectively after every jump back to sigsetjmp by siglongjmp in the signal handler. The test was incrementing them and expecting them to retain their incremented values. The documentation for siglongjmp says: All accessible objects have values as of the time sigsetjmp() was called, except that the values of objects of automatic storage duration which are local to the function containing the invocation of the corresponding sigsetjmp() which do not have volatile-qualified type and which are changed between the sigsetjmp() invocation and siglongjmp() call are indeterminate. Tagging steps and signalled with volatile enabled the test to pass. 
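The pitfall generalizes beyond this test. A minimal standalone sketch (not taken from the selftest) showing why the volatile qualifier matters for locals modified between sigsetjmp() and siglongjmp():

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf env;

static void handler(int signo)
{
	(void)signo;
	siglongjmp(env, 1);	/* jump back to the sigsetjmp() call site */
}

int main(void)
{
	/* Without volatile, the value of 'steps' after the jump would be
	 * indeterminate, exactly as in the faulting_process() bug above. */
	volatile int steps = 1;

	signal(SIGUSR1, handler);
	if (sigsetjmp(env, 1) == 0) {
		steps++;		/* modified between setjmp and longjmp */
		raise(SIGUSR1);		/* triggers the jump */
	}
	printf("steps == %d\n", steps);	/* reliably 2 only due to volatile */
	return 0;
}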
Link: https://lkml.kernel.org/r/20220613233321.431282-1-edliaw@google.com Signed-off-by: Edward Liaw Reviewed-by: Axel Rasmussen Cc: Shuah Khan Cc: Peter Xu Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 4bc24581760d..7c3f1b0ab468 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -931,7 +931,7 @@ static int faulting_process(int signal_test) unsigned long split_nr_pages; unsigned long lastnr; struct sigaction act; - unsigned long signalled = 0; + volatile unsigned long signalled = 0; split_nr_pages = (nr_pages + 1) / 2; @@ -946,7 +946,7 @@ static int faulting_process(int signal_test) } for (nr = 0; nr < split_nr_pages; nr++) { - int steps = 1; + volatile int steps = 1; unsigned long offset = nr * page_size; if (signal_test) { -- cgit v1.2.3 From 4f2930c6718afbb6c5904cda6f6781a70c52a042 Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Mon, 27 Jun 2022 18:39:12 +0200 Subject: selftests/vm: only run 128TBswitch with 5-level paging The test va_128TBswitch.c expects to be able to pass mmap an address hint and length that cross the address 1<<47. On x86_64, this is not possible without 5-level page tables, so the test fails. The test is already only run on 64-bit powerpc and x86_64 archs, but this patch adds an additional check on x86_64 that skips the test if PG_TABLE_LEVELS < 5. There is precedent for checking /proc/config.gz in selftests, e.g. in selftests/firmware. Running the tests produces the desired output: sudo make -C tools/testing/selftests TARGETS=vm run_tests --------------------------- running ./va_128TBswitch.sh --------------------------- ./va_128TBswitch.sh: PG_TABLE_LEVELS=4, must be >= 5 to run this test [SKIP] ------------------------------- [adam@wowsignal.io: restrict the check to x86_64] Link: https://lkml.kernel.org/r/20220628163654.337600-1-adam@wowsignal.io [adam@wowsignal.io: fix formatting issues, rename "die" to "fail"] Link: https://lkml.kernel.org/r/20220701163030.415735-1-adam@wowsignal.io Link: https://lkml.kernel.org/r/20220627163912.5581-1-adam@wowsignal.io Signed-off-by: Adam Sindelar Cc: Adam Sindelar Cc: David Vernet Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/run_vmtests.sh | 2 +- tools/testing/selftests/vm/va_128TBswitch.sh | 54 ++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/vm/va_128TBswitch.sh (limited to 'tools') diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 44f25acfbeca..6a34209379a4 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -93,6 +93,7 @@ TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh TEST_FILES += test_hmm.sh +TEST_FILES += va_128TBswitch.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 41fce8bea929..27c01c35c7a9 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -151,7 +151,7 @@ if [ $VADDR64 -ne 0 ]; then run_test ./virtual_address_range # virtual address 128TB switch test - run_test ./va_128TBswitch + run_test ./va_128TBswitch.sh fi # VADDR64 # vmalloc stability smoke test diff 
--git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/vm/va_128TBswitch.sh new file mode 100644 index 000000000000..41580751dc51 --- /dev/null +++ b/tools/testing/selftests/vm/va_128TBswitch.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022 Adam Sindelar (Meta) +# +# This is a test for mmap behavior with 5-level paging. This script wraps the +# real test to check that the kernel is configured to support at least 5 +# pagetable levels. + +# 1 means the test failed +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +fail() +{ + echo "$1" + exit $exitcode +} + +check_supported_x86_64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + # gzip -dcfq automatically handles both compressed and plaintext input. + # See man 1 gzip under '-f'. + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + fi +} + +check_test_requirements() +{ + # The test supports x86_64 and powerpc64. We currently have no useful + # eligibility check for powerpc64, and the test itself will reject other + # architectures. + case `uname -m` in + "x86_64") + check_supported_x86_64 + ;; + *) + return 0 + ;; + esac +} + +check_test_requirements +./va_128TBswitch -- cgit v1.2.3 From 0d153dd208d46c2096151ac1c484e776754dfe51 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Fri, 1 Jul 2022 16:43:50 +0200 Subject: selftest: net: bridge mdb add/del entry to port that is down Tests that permanent mdb entries can be added/deleted on ports with state down. Signed-off-by: Casper Andersson Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/Makefile | 1 + .../net/forwarding/bridge_mdb_port_down.sh | 118 +++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/bridge_mdb_port_down.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 8f481218a492..669ffd6f2a68 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -3,6 +3,7 @@ TEST_PROGS = bridge_igmp.sh \ bridge_locked_port.sh \ bridge_mdb.sh \ + bridge_mdb_port_down.sh \ bridge_mld.sh \ bridge_port_isolation.sh \ bridge_sticky_fdb.sh \ diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_port_down.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_port_down.sh new file mode 100755 index 000000000000..1a0480e71d83 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/bridge_mdb_port_down.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Verify that permanent mdb entries can be added to and deleted from bridge +# interfaces that are down, and works correctly when done so. + +ALL_TESTS="add_del_to_port_down" +NUM_NETIFS=4 + +TEST_GROUP="239.10.10.10" +TEST_GROUP_MAC="01:00:5e:0a:0a:0a" + +source lib.sh + + +add_del_to_port_down() { + RET=0 + + ip link set dev $swp2 down + bridge mdb add dev br0 port "$swp2" grp $TEST_GROUP permanent 2>/dev/null + check_err $? "Failed adding mdb entry" + + ip link set dev $swp2 up + setup_wait_dev $swp2 + mcast_packet_test $TEST_GROUP_MAC 192.0.2.1 $TEST_GROUP $h1 $h2 + check_fail $? 
"Traffic to $TEST_GROUP wasn't forwarded" + + ip link set dev $swp2 down + bridge mdb show dev br0 | grep -q "$TEST_GROUP permanent" 2>/dev/null + check_err $? "MDB entry did not persist after link up/down" + + bridge mdb del dev br0 port "$swp2" grp $TEST_GROUP 2>/dev/null + check_err $? "Failed deleting mdb entry" + + ip link set dev $swp2 up + setup_wait_dev $swp2 + mcast_packet_test $TEST_GROUP_MAC 192.0.2.1 $TEST_GROUP $h1 $h2 + check_err $? "Traffic to $TEST_GROUP was forwarded after entry removed" + + log_test "MDB add/del entry to port with state down " +} + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/24 2001:db8:1::2/64 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/24 2001:db8:1::2/64 +} + +switch_create() +{ + # Enable multicast filtering + ip link add dev br0 type bridge mcast_snooping 1 mcast_querier 1 + + ip link set dev $swp1 master br0 + ip link set dev $swp2 master br0 + + ip link set dev br0 up + ip link set dev $swp1 up + + bridge link set dev $swp2 mcast_flood off + # Bridge currently has a "grace time" at creation time before it + # forwards multicast according to the mdb. Since we disable the + # mcast_flood setting per port + sleep 10 +} + +switch_destroy() +{ + ip link set dev $swp1 down + ip link set dev $swp2 down + ip link del dev br0 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h1_destroy + h2_destroy + + vrf_cleanup +} + +trap cleanup EXIT + +setup_prepare +tests_run +exit $EXIT_STATUS -- cgit v1.2.3 From 0c7e0d699ef1430d7f4cf12b4b1d097af58b5515 Mon Sep 17 00:00:00 2001 From: Stéphane Graber Date: Thu, 23 Jun 2022 16:45:52 -0500 Subject: tools/vm/slabinfo: Handle files in debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 64dd68497be76 relocated and renamed the alloc_calls and free_calls files from /sys/kernel/slab/NAME/*_calls over to /sys/kernel/debug/slab/NAME/*_calls but didn't update the slabinfo tool with the new location. This change will now have slabinfo look at the new location (and filenames) with a fallback to the prior files. 
Fixes: 64dd68497be76 ("mm: slub: move sysfs slab alloc/free interfaces to debugfs") Cc: stable@vger.kernel.org Signed-off-by: Stéphane Graber Tested-by: Stéphane Graber Signed-off-by: Vlastimil Babka --- tools/vm/slabinfo.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 9b68658b6bb8..5b98f3ee58a5 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -233,6 +233,24 @@ static unsigned long read_slab_obj(struct slabinfo *s, const char *name) return l; } +static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name) +{ + char x[128]; + FILE *f; + size_t l; + + snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name); + f = fopen(x, "r"); + if (!f) { + buffer[0] = 0; + l = 0; + } else { + l = fread(buffer, 1, sizeof(buffer) - 1, f); + buffer[l] = 0; + fclose(f); + } + return l; +} /* * Put a size string together @@ -409,14 +427,18 @@ static void show_tracking(struct slabinfo *s) { printf("\n%s: Kernel object allocation\n", s->name); printf("-----------------------------------------------------------------------\n"); - if (read_slab_obj(s, "alloc_calls")) + if (read_debug_slab_obj(s, "alloc_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "alloc_calls")) printf("%s", buffer); else printf("No Data\n"); printf("\n%s: Kernel object freeing\n", s->name); printf("------------------------------------------------------------------------\n"); - if (read_slab_obj(s, "free_calls")) + if (read_debug_slab_obj(s, "free_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "free_calls")) printf("%s", buffer); else printf("No Data\n"); -- cgit v1.2.3 From 946dccb35d74079436677afcdc767d0406e99a88 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sun, 3 Jul 2022 23:06:55 -0500 Subject: memblock tests: Makefile: add arguments to control verbosity Add VERBOSE and MEMBLOCK_DEBUG user-provided arguments. VERBOSE will enable verbose output from the Memblock simulator. MEMBLOCK_DEBUG will enable memblock_dbg() messages. Update the help message to include VERBOSE and MEMBLOCK_DEBUG. Update the README to include VERBOSE. The README does not include all available options and refers to the help message for the remaining options. Therefore, omit MEMBLOCK_DEBUG from README.
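A minimal sketch of how a test binary typically consumes these flags (an assumption for illustration; the real helpers are added to tools/testing/memblock/tests/common.h by the following patch): passing VERBOSE=1 makes the Makefile append -D VERBOSE to CFLAGS, so a guard like the one below compiles the verbose helpers in, while a default build compiles them away:

#include <stdio.h>

/* Built with -D VERBOSE (via 'make VERBOSE=1') this prints; without
 * the define it expands to a no-op and the binary stays silent. */
#ifdef VERBOSE
#define test_print(...) printf(__VA_ARGS__)
#else
#define test_print(...) ((void)0)
#endif

int main(void)
{
	test_print("Running %s tests...\n", "memblock_add");
	return 0;
}

MEMBLOCK_DEBUG works the same way, gating memblock_dbg() output behind the -D MEMBLOCK_DEBUG define added in scripts/Makefile.include below.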
Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/5503f3efe82ecef5c99961a1d53003c8ad06cf27.1656907314.git.remckee0@gmail.com --- tools/testing/memblock/Makefile | 4 ++++ tools/testing/memblock/README | 10 +++++++++- tools/testing/memblock/scripts/Makefile.include | 10 ++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/memblock/Makefile b/tools/testing/memblock/Makefile index a698e24b35e7..9fde49ad73bd 100644 --- a/tools/testing/memblock/Makefile +++ b/tools/testing/memblock/Makefile @@ -45,6 +45,10 @@ help: @echo ' clean - Remove generated files and symlinks in the directory' @echo '' @echo 'Configuration:' + @echo ' make VERBOSE=1 - enable verbose output, which includes the' + @echo ' names of functions being tested and the' + @echo ' number of test cases passing' + @echo ' make MEMBLOCK_DEBUG=1 - enable memblock_dbg() messages' @echo ' make NUMA=1 - simulate enabled NUMA' @echo ' make MOVABLE_NODE=1 - override `movable_node_is_enabled`' @echo ' definition to simulate movable NUMA nodes' diff --git a/tools/testing/memblock/README b/tools/testing/memblock/README index ca6afcff013a..058146b528a5 100644 --- a/tools/testing/memblock/README +++ b/tools/testing/memblock/README @@ -34,7 +34,15 @@ To run the tests, build the main target and run it: $ make && ./main A successful run produces no output. It is also possible to override different -configuration parameters. For example, to simulate enabled NUMA, use: +configuration parameters. For example, to include verbose output, specify the +VERBOSE flag when building the main target: + +$ make VERBOSE=1 + +This will print information about which functions are being tested and the +number of test cases that passed. + +To simulate enabled NUMA, use: $ make NUMA=1 diff --git a/tools/testing/memblock/scripts/Makefile.include b/tools/testing/memblock/scripts/Makefile.include index 641569ccb7b0..4401f79bed4c 100644 --- a/tools/testing/memblock/scripts/Makefile.include +++ b/tools/testing/memblock/scripts/Makefile.include @@ -17,3 +17,13 @@ ifeq ($(32BIT_PHYS_ADDR_T), 1) CFLAGS += -m32 -U CONFIG_PHYS_ADDR_T_64BIT LDFLAGS += -m32 endif + +# Enable verbose testing output +ifeq ($(VERBOSE), 1) + CFLAGS += -D VERBOSE +endif + +# Enable memblock_dbg() messages +ifeq ($(MEMBLOCK_DEBUG), 1) + CFLAGS += -D MEMBLOCK_DEBUG +endif -- cgit v1.2.3 From 76586c00e74d189964d2fde26345a4a15f2ea02e Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sun, 3 Jul 2022 23:06:56 -0500 Subject: memblock tests: add verbose output to memblock tests Add and use functions and macros for printing verbose testing output. If the Memblock simulator was compiled with VERBOSE=1: - prefix_push(): appends the given string to a prefix string that will be printed in test_fail() and test_pass*(). - prefix_pop(): removes the last prefix from the prefix string. - prefix_reset(): clears the prefix string. - test_fail(): prints a message after a test fails containing the test number of the failing test and the prefix. - test_pass(): prints a message after a test passes containing its test number and the prefix. - test_print(): prints the given formatted output string. - test_pass_pop(): runs test_pass() followed by prefix_pop(). - PREFIX_PUSH(): runs prefix_push(__func__). If the Memblock simulator was not compiled with VERBOSE=1, these functions/macros do nothing. Add the assert wrapper macros ASSERT_EQ(), ASSERT_NE(), and ASSERT_LT(). 
If the assert condition fails, these macros call test_fail() before executing assert(). Acked-by: David Hildenbrand Reviewed-by: Shaoqin Huang Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/f234d443fe154d5ae8d8aa07284aff69edfb6f61.1656907314.git.remckee0@gmail.com --- tools/testing/memblock/tests/alloc_api.c | 225 ++++++++++----- tools/testing/memblock/tests/alloc_helpers_api.c | 129 ++++++--- tools/testing/memblock/tests/alloc_nid_api.c | 351 +++++++++++++++-------- tools/testing/memblock/tests/basic_api.c | 337 +++++++++++++++------- tools/testing/memblock/tests/common.c | 57 ++++ tools/testing/memblock/tests/common.h | 62 ++++ 6 files changed, 817 insertions(+), 344 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/tests/alloc_api.c b/tools/testing/memblock/tests/alloc_api.c index d1aa7e15c18d..a14f38eb8a89 100644 --- a/tools/testing/memblock/tests/alloc_api.c +++ b/tools/testing/memblock/tests/alloc_api.c @@ -10,6 +10,8 @@ static int alloc_top_down_simple_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t size = SZ_2; phys_addr_t expected_start; @@ -19,12 +21,14 @@ static int alloc_top_down_simple_check(void) allocated_ptr = memblock_alloc(size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == size); - assert(rgn->base == expected_start); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, expected_start); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -55,6 +59,8 @@ static int alloc_top_down_disjoint_check(void) struct region r1; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r2_size = SZ_16; /* Use custom alignment */ phys_addr_t alignment = SMP_CACHE_BYTES * 2; @@ -73,15 +79,17 @@ static int alloc_top_down_disjoint_check(void) allocated_ptr = memblock_alloc(r2_size, alignment); - assert(allocated_ptr); - assert(rgn1->size == r1.size); - assert(rgn1->base == r1.base); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn1->size, r1.size); + ASSERT_EQ(rgn1->base, r1.base); - assert(rgn2->size == r2_size); - assert(rgn2->base == expected_start); + ASSERT_EQ(rgn2->size, r2_size); + ASSERT_EQ(rgn2->base, expected_start); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -101,6 +109,8 @@ static int alloc_top_down_before_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + /* * The first region ends at the aligned address to test region merging */ @@ -114,12 +124,14 @@ static int alloc_top_down_before_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == total_size); - assert(rgn->base == memblock_end_of_DRAM() - total_size); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -141,6 +153,8 @@ static int alloc_top_down_after_check(void) 
struct region r1; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r2_size = SZ_512; phys_addr_t total_size; @@ -158,12 +172,14 @@ static int alloc_top_down_after_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == total_size); - assert(rgn->base == r1.base - r2_size); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, r1.base - r2_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -186,6 +202,8 @@ static int alloc_top_down_second_fit_check(void) struct region r1, r2; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_1K; phys_addr_t total_size; @@ -204,12 +222,14 @@ static int alloc_top_down_second_fit_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == r2.size + r3_size); - assert(rgn->base == r2.base - r3_size); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, r2.size + r3_size); + ASSERT_EQ(rgn->base, r2.base - r3_size); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -231,6 +251,8 @@ static int alloc_in_between_generic_check(void) struct region r1, r2; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t r3_size = SZ_64; /* @@ -254,12 +276,14 @@ static int alloc_in_between_generic_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == total_size); - assert(rgn->base == r1.base - r2.size - r3_size); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, r1.base - r2.size - r3_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -281,6 +305,8 @@ static int alloc_small_gaps_generic_check(void) { void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t region_size = SZ_1K; phys_addr_t gap_size = SZ_256; phys_addr_t region_end; @@ -296,7 +322,9 @@ static int alloc_small_gaps_generic_check(void) allocated_ptr = memblock_alloc(region_size, SMP_CACHE_BYTES); - assert(!allocated_ptr); + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); return 0; } @@ -309,6 +337,8 @@ static int alloc_all_reserved_generic_check(void) { void *allocated_ptr = NULL; + PREFIX_PUSH(); + setup_memblock(); /* Simulate full memory */ @@ -316,7 +346,9 @@ static int alloc_all_reserved_generic_check(void) allocated_ptr = memblock_alloc(SZ_256, SMP_CACHE_BYTES); - assert(!allocated_ptr); + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); return 0; } @@ -338,6 +370,8 @@ static int alloc_no_space_generic_check(void) { void *allocated_ptr = NULL; + PREFIX_PUSH(); + setup_memblock(); phys_addr_t available_size = SZ_256; @@ -348,7 +382,9 @@ static int alloc_no_space_generic_check(void) allocated_ptr = memblock_alloc(SZ_1K, SMP_CACHE_BYTES); - assert(!allocated_ptr); + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); return 0; } @@ -369,6 +405,8 @@ static int alloc_limited_space_generic_check(void) struct memblock_region *rgn = 
&memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t available_size = SZ_256; phys_addr_t reserved_size = MEM_SIZE - available_size; @@ -379,12 +417,14 @@ static int alloc_limited_space_generic_check(void) allocated_ptr = memblock_alloc(available_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == MEM_SIZE); - assert(rgn->base == memblock_start_of_DRAM()); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, MEM_SIZE); + ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, MEM_SIZE); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == MEM_SIZE); + test_pass_pop(); return 0; } @@ -399,14 +439,18 @@ static int alloc_no_memory_generic_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + reset_memblock_regions(); allocated_ptr = memblock_alloc(SZ_1K, SMP_CACHE_BYTES); - assert(!allocated_ptr); - assert(rgn->size == 0); - assert(rgn->base == 0); - assert(memblock.reserved.total_size == 0); + ASSERT_EQ(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, 0); + ASSERT_EQ(rgn->base, 0); + ASSERT_EQ(memblock.reserved.total_size, 0); + + test_pass_pop(); return 0; } @@ -421,16 +465,20 @@ static int alloc_bottom_up_simple_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + setup_memblock(); allocated_ptr = memblock_alloc(SZ_2, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == SZ_2); - assert(rgn->base == memblock_start_of_DRAM()); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, SZ_2); + ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == SZ_2); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, SZ_2); + + test_pass_pop(); return 0; } @@ -459,6 +507,8 @@ static int alloc_bottom_up_disjoint_check(void) struct region r1; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r2_size = SZ_16; /* Use custom alignment */ phys_addr_t alignment = SMP_CACHE_BYTES * 2; @@ -477,16 +527,18 @@ static int alloc_bottom_up_disjoint_check(void) allocated_ptr = memblock_alloc(r2_size, alignment); - assert(allocated_ptr); + ASSERT_NE(allocated_ptr, NULL); - assert(rgn1->size == r1.size); - assert(rgn1->base == r1.base); + ASSERT_EQ(rgn1->size, r1.size); + ASSERT_EQ(rgn1->base, r1.base); - assert(rgn2->size == r2_size); - assert(rgn2->base == expected_start); + ASSERT_EQ(rgn2->size, r2_size); + ASSERT_EQ(rgn2->base, expected_start); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -506,6 +558,8 @@ static int alloc_bottom_up_before_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r1_size = SZ_512; phys_addr_t r2_size = SZ_128; phys_addr_t total_size = r1_size + r2_size; @@ -516,12 +570,14 @@ static int alloc_bottom_up_before_check(void) allocated_ptr = memblock_alloc(r1_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == total_size); - assert(rgn->base == memblock_start_of_DRAM()); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); - assert(memblock.reserved.cnt == 
1); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -542,6 +598,8 @@ static int alloc_bottom_up_after_check(void) struct region r1; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r2_size = SZ_512; phys_addr_t total_size; @@ -559,12 +617,14 @@ static int alloc_bottom_up_after_check(void) allocated_ptr = memblock_alloc(r2_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == total_size); - assert(rgn->base == r1.base); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, r1.base); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -588,6 +648,8 @@ static int alloc_bottom_up_second_fit_check(void) struct region r1, r2; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_1K; phys_addr_t total_size; @@ -606,12 +668,14 @@ static int alloc_bottom_up_second_fit_check(void) allocated_ptr = memblock_alloc(r3_size, SMP_CACHE_BYTES); - assert(allocated_ptr); - assert(rgn->size == r2.size + r3_size); - assert(rgn->base == r2.base); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, r2.size + r3_size); + ASSERT_EQ(rgn->base, r2.base); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -619,6 +683,7 @@ static int alloc_bottom_up_second_fit_check(void) /* Test case wrappers */ static int alloc_simple_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_top_down_simple_check(); memblock_set_bottom_up(true); @@ -629,6 +694,7 @@ static int alloc_simple_check(void) static int alloc_disjoint_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_top_down_disjoint_check(); memblock_set_bottom_up(true); @@ -639,6 +705,7 @@ static int alloc_disjoint_check(void) static int alloc_before_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_top_down_before_check(); memblock_set_bottom_up(true); @@ -649,6 +716,7 @@ static int alloc_before_check(void) static int alloc_after_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_top_down_after_check(); memblock_set_bottom_up(true); @@ -659,6 +727,7 @@ static int alloc_after_check(void) static int alloc_in_between_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_in_between_generic_check(); memblock_set_bottom_up(true); @@ -669,6 +738,7 @@ static int alloc_in_between_check(void) static int alloc_second_fit_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_top_down_second_fit_check(); memblock_set_bottom_up(true); @@ -679,6 +749,7 @@ static int alloc_second_fit_check(void) static int alloc_small_gaps_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_small_gaps_generic_check(); memblock_set_bottom_up(true); @@ -689,6 +760,7 @@ static int alloc_small_gaps_check(void) static int alloc_all_reserved_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); 
alloc_all_reserved_generic_check(); memblock_set_bottom_up(true); @@ -699,6 +771,7 @@ static int alloc_all_reserved_check(void) static int alloc_no_space_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_no_space_generic_check(); memblock_set_bottom_up(true); @@ -709,6 +782,7 @@ static int alloc_no_space_check(void) static int alloc_limited_space_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_limited_space_generic_check(); memblock_set_bottom_up(true); @@ -719,6 +793,7 @@ static int alloc_limited_space_check(void) static int alloc_no_memory_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_no_memory_generic_check(); memblock_set_bottom_up(true); @@ -729,6 +804,12 @@ static int alloc_no_memory_check(void) int memblock_alloc_checks(void) { + const char *func_testing = "memblock_alloc"; + + prefix_reset(); + prefix_push(func_testing); + test_print("Running %s tests...\n", func_testing); + reset_memblock_attributes(); dummy_physical_memory_init(); @@ -746,5 +827,7 @@ int memblock_alloc_checks(void) dummy_physical_memory_cleanup(); + prefix_pop(); + return 0; } diff --git a/tools/testing/memblock/tests/alloc_helpers_api.c b/tools/testing/memblock/tests/alloc_helpers_api.c index 963a966db461..1069b4bdd5fd 100644 --- a/tools/testing/memblock/tests/alloc_helpers_api.c +++ b/tools/testing/memblock/tests/alloc_helpers_api.c @@ -21,6 +21,8 @@ static int alloc_from_simple_generic_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_16; phys_addr_t min_addr; @@ -31,14 +33,16 @@ static int alloc_from_simple_generic_check(void) allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, min_addr); - assert(rgn->size == size); - assert(rgn->base == min_addr); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -64,6 +68,8 @@ static int alloc_from_misaligned_generic_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_32; phys_addr_t min_addr; @@ -75,14 +81,16 @@ static int alloc_from_misaligned_generic_check(void) allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); - assert(rgn->size == size); - assert(rgn->base == memblock_end_of_DRAM() - SMP_CACHE_BYTES); + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - SMP_CACHE_BYTES); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); return 0; } @@ -110,6 +118,8 @@ static int alloc_from_top_down_high_addr_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t size = SZ_32; phys_addr_t min_addr; @@ -120,12 +130,14 @@ static int alloc_from_top_down_high_addr_check(void) allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); - assert(allocated_ptr); - assert(rgn->size == size); - assert(rgn->base == 
memblock_end_of_DRAM() - SMP_CACHE_BYTES); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - SMP_CACHE_BYTES); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); return 0; } @@ -151,6 +163,8 @@ static int alloc_from_top_down_no_space_above_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r1_size = SZ_64; phys_addr_t r2_size = SZ_2; phys_addr_t total_size = r1_size + r2_size; @@ -165,12 +179,14 @@ static int alloc_from_top_down_no_space_above_check(void) allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); - assert(allocated_ptr); - assert(rgn->base == min_addr - r1_size); - assert(rgn->size == total_size); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->base, min_addr - r1_size); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -186,6 +202,8 @@ static int alloc_from_top_down_min_addr_cap_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r1_size = SZ_64; phys_addr_t min_addr; phys_addr_t start_addr; @@ -199,12 +217,14 @@ static int alloc_from_top_down_min_addr_cap_check(void) allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); - assert(allocated_ptr); - assert(rgn->base == start_addr); - assert(rgn->size == MEM_SIZE); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->base, start_addr); + ASSERT_EQ(rgn->size, MEM_SIZE); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, MEM_SIZE); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == MEM_SIZE); + test_pass_pop(); return 0; } @@ -230,6 +250,8 @@ static int alloc_from_bottom_up_high_addr_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t size = SZ_32; phys_addr_t min_addr; @@ -240,12 +262,14 @@ static int alloc_from_bottom_up_high_addr_check(void) allocated_ptr = memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr); - assert(allocated_ptr); - assert(rgn->size == size); - assert(rgn->base == memblock_start_of_DRAM()); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -270,6 +294,8 @@ static int alloc_from_bottom_up_no_space_above_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r1_size = SZ_64; phys_addr_t min_addr; phys_addr_t r2_size; @@ -284,12 +310,14 @@ static int alloc_from_bottom_up_no_space_above_check(void) allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); - assert(allocated_ptr); - assert(rgn->base == memblock_start_of_DRAM()); - assert(rgn->size == r1_size); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); + ASSERT_EQ(rgn->size, r1_size); - assert(memblock.reserved.cnt == 2); - 
assert(memblock.reserved.total_size == r1_size + r2_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, r1_size + r2_size); + + test_pass_pop(); return 0; } @@ -304,6 +332,8 @@ static int alloc_from_bottom_up_min_addr_cap_check(void) struct memblock_region *rgn = &memblock.reserved.regions[0]; void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t r1_size = SZ_64; phys_addr_t min_addr; phys_addr_t start_addr; @@ -315,12 +345,14 @@ static int alloc_from_bottom_up_min_addr_cap_check(void) allocated_ptr = memblock_alloc_from(r1_size, SMP_CACHE_BYTES, min_addr); - assert(allocated_ptr); - assert(rgn->base == start_addr); - assert(rgn->size == r1_size); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(rgn->base, start_addr); + ASSERT_EQ(rgn->size, r1_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == r1_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, r1_size); + + test_pass_pop(); return 0; } @@ -328,6 +360,7 @@ static int alloc_from_bottom_up_min_addr_cap_check(void) /* Test case wrappers */ static int alloc_from_simple_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_from_simple_generic_check(); memblock_set_bottom_up(true); @@ -338,6 +371,7 @@ static int alloc_from_simple_check(void) static int alloc_from_misaligned_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_from_misaligned_generic_check(); memblock_set_bottom_up(true); @@ -348,6 +382,7 @@ static int alloc_from_misaligned_check(void) static int alloc_from_high_addr_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_from_top_down_high_addr_check(); memblock_set_bottom_up(true); @@ -358,6 +393,7 @@ static int alloc_from_high_addr_check(void) static int alloc_from_no_space_above_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_from_top_down_no_space_above_check(); memblock_set_bottom_up(true); @@ -368,6 +404,7 @@ static int alloc_from_no_space_above_check(void) static int alloc_from_min_addr_cap_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_from_top_down_min_addr_cap_check(); memblock_set_bottom_up(true); @@ -378,6 +415,12 @@ static int alloc_from_min_addr_cap_check(void) int memblock_alloc_helpers_checks(void) { + const char *func_testing = "memblock_alloc_from"; + + prefix_reset(); + prefix_push(func_testing); + test_print("Running %s tests...\n", func_testing); + reset_memblock_attributes(); dummy_physical_memory_init(); @@ -389,5 +432,7 @@ int memblock_alloc_helpers_checks(void) dummy_physical_memory_cleanup(); + prefix_pop(); + return 0; } diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 6390206e50e1..255fd514e9f5 100644 --- a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -21,6 +21,8 @@ static int alloc_try_nid_top_down_simple_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_128; phys_addr_t min_addr; phys_addr_t max_addr; @@ -36,15 +38,17 @@ static int alloc_try_nid_top_down_simple_check(void) b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, max_addr - size); + 
ASSERT_EQ(rgn_end, max_addr); - assert(rgn->size == size); - assert(rgn->base == max_addr - size); - assert(rgn_end == max_addr); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -72,6 +76,8 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_128; phys_addr_t misalign = SZ_2; phys_addr_t min_addr; @@ -88,15 +94,17 @@ static int alloc_try_nid_top_down_end_misaligned_check(void) b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); - assert(rgn->size == size); - assert(rgn->base == max_addr - size - misalign); - assert(rgn_end < max_addr); + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, max_addr - size - misalign); + ASSERT_LT(rgn_end, max_addr); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); return 0; } @@ -122,6 +130,8 @@ static int alloc_try_nid_exact_address_generic_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; @@ -137,15 +147,17 @@ static int alloc_try_nid_exact_address_generic_check(void) b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, min_addr); + ASSERT_EQ(rgn_end, max_addr); - assert(rgn->size == size); - assert(rgn->base == min_addr); - assert(rgn_end == max_addr); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -173,6 +185,8 @@ static int alloc_try_nid_top_down_narrow_range_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_256; phys_addr_t min_addr; phys_addr_t max_addr; @@ -186,14 +200,16 @@ static int alloc_try_nid_top_down_narrow_range_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, max_addr - size); - assert(rgn->size == size); - assert(rgn->base == max_addr - size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -222,6 +238,8 @@ static int alloc_try_nid_low_max_generic_check(void) { void *allocated_ptr = NULL; + PREFIX_PUSH(); + phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; @@ -234,7 +252,9 @@ static int alloc_try_nid_low_max_generic_check(void) allocated_ptr = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - assert(!allocated_ptr); + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); return 0; } @@ -259,6 +279,8 @@ static int alloc_try_nid_min_reserved_generic_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t r1_size = SZ_128; phys_addr_t r2_size = SZ_64; phys_addr_t total_size = r1_size + r2_size; @@ -278,14 
+300,16 @@ static int alloc_try_nid_min_reserved_generic_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); - assert(rgn->size == total_size); - assert(rgn->base == reserved_base); + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, reserved_base); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -310,6 +334,8 @@ static int alloc_try_nid_max_reserved_generic_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t r1_size = SZ_64; phys_addr_t r2_size = SZ_128; phys_addr_t total_size = r1_size + r2_size; @@ -327,14 +353,16 @@ static int alloc_try_nid_max_reserved_generic_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, min_addr); - assert(rgn->size == total_size); - assert(rgn->base == min_addr); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -364,6 +392,8 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) char *b; struct region r1, r2; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_64; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; @@ -389,17 +419,19 @@ static int alloc_try_nid_top_down_reserved_with_space_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn1->size, r1.size + r3_size); + ASSERT_EQ(rgn1->base, max_addr - r3_size); - assert(rgn1->size == r1.size + r3_size); - assert(rgn1->base == max_addr - r3_size); + ASSERT_EQ(rgn2->size, r2.size); + ASSERT_EQ(rgn2->base, r2.base); - assert(rgn2->size == r2.size); - assert(rgn2->base == r2.base); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -427,6 +459,8 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) char *b; struct region r1, r2; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_64; phys_addr_t total_size; phys_addr_t max_addr; @@ -451,14 +485,16 @@ static int alloc_try_nid_reserved_full_merge_generic_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); - assert(rgn->size == total_size); - assert(rgn->base == r2.base); + ASSERT_EQ(rgn->size, total_size); + ASSERT_EQ(rgn->base, r2.base); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -489,6 +525,8 @@ static int alloc_try_nid_top_down_reserved_no_space_check(void) char *b; struct region r1, r2; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_256; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; @@ -514,17 +552,19 @@ static int 
alloc_try_nid_top_down_reserved_no_space_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn1->size, r1.size); + ASSERT_EQ(rgn1->base, r1.base); - assert(rgn1->size == r1.size); - assert(rgn1->base == r1.base); + ASSERT_EQ(rgn2->size, r2.size + r3_size); + ASSERT_EQ(rgn2->base, r2.base - r3_size); - assert(rgn2->size == r2.size + r3_size); - assert(rgn2->base == r2.base - r3_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -554,6 +594,8 @@ static int alloc_try_nid_reserved_all_generic_check(void) void *allocated_ptr = NULL; struct region r1, r2; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_256; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t max_addr; @@ -576,7 +618,9 @@ static int alloc_try_nid_reserved_all_generic_check(void) allocated_ptr = memblock_alloc_try_nid(r3_size, SMP_CACHE_BYTES, min_addr, max_addr, NUMA_NO_NODE); - assert(!allocated_ptr); + ASSERT_EQ(allocated_ptr, NULL); + + test_pass_pop(); return 0; } @@ -592,6 +636,8 @@ static int alloc_try_nid_top_down_cap_max_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_256; phys_addr_t min_addr; phys_addr_t max_addr; @@ -605,14 +651,16 @@ static int alloc_try_nid_top_down_cap_max_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); - assert(rgn->size == size); - assert(rgn->base == memblock_end_of_DRAM() - size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -628,6 +676,8 @@ static int alloc_try_nid_top_down_cap_min_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; @@ -641,14 +691,16 @@ static int alloc_try_nid_top_down_cap_min_check(void) min_addr, max_addr, NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); - assert(rgn->size == size); - assert(rgn->base == memblock_end_of_DRAM() - size); + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, memblock_end_of_DRAM() - size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); return 0; } @@ -673,6 +725,8 @@ static int alloc_try_nid_bottom_up_simple_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_128; phys_addr_t min_addr; phys_addr_t max_addr; @@ -689,15 +743,17 @@ static int alloc_try_nid_bottom_up_simple_check(void) b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, min_addr); + ASSERT_LT(rgn_end, max_addr); - assert(rgn->size == size); - assert(rgn->base == min_addr); - assert(rgn_end < max_addr); + ASSERT_EQ(memblock.reserved.cnt, 1); + 
ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -725,6 +781,8 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_128; phys_addr_t misalign = SZ_2; phys_addr_t min_addr; @@ -742,15 +800,17 @@ static int alloc_try_nid_bottom_up_start_misaligned_check(void) b = (char *)allocated_ptr; rgn_end = rgn->base + rgn->size; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, min_addr + (SMP_CACHE_BYTES - misalign)); + ASSERT_LT(rgn_end, max_addr); - assert(rgn->size == size); - assert(rgn->base == min_addr + (SMP_CACHE_BYTES - misalign)); - assert(rgn_end < max_addr); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -778,6 +838,8 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_256; phys_addr_t min_addr; phys_addr_t max_addr; @@ -792,14 +854,16 @@ static int alloc_try_nid_bottom_up_narrow_range_check(void) NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); - assert(rgn->size == size); - assert(rgn->base == memblock_start_of_DRAM()); + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); + + test_pass_pop(); return 0; } @@ -829,6 +893,8 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) char *b; struct region r1, r2; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_64; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; @@ -855,17 +921,19 @@ static int alloc_try_nid_bottom_up_reserved_with_space_check(void) NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); - assert(rgn1->size == r1.size); - assert(rgn1->base == max_addr); + ASSERT_EQ(rgn1->size, r1.size); + ASSERT_EQ(rgn1->base, max_addr); - assert(rgn2->size == r2.size + r3_size); - assert(rgn2->base == r2.base); + ASSERT_EQ(rgn2->size, r2.size + r3_size); + ASSERT_EQ(rgn2->base, r2.base); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -899,6 +967,8 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) char *b; struct region r1, r2; + PREFIX_PUSH(); + phys_addr_t r3_size = SZ_256; phys_addr_t gap_size = SMP_CACHE_BYTES; phys_addr_t total_size; @@ -925,20 +995,22 @@ static int alloc_try_nid_bottom_up_reserved_no_space_check(void) NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn3->size, r3_size); + ASSERT_EQ(rgn3->base, memblock_start_of_DRAM()); - assert(rgn3->size == r3_size); - assert(rgn3->base == memblock_start_of_DRAM()); + ASSERT_EQ(rgn2->size, r2.size); + ASSERT_EQ(rgn2->base, r2.base); - 
assert(rgn2->size == r2.size); - assert(rgn2->base == r2.base); + ASSERT_EQ(rgn1->size, r1.size); + ASSERT_EQ(rgn1->base, r1.base); - assert(rgn1->size == r1.size); - assert(rgn1->base == r1.base); + ASSERT_EQ(memblock.reserved.cnt, 3); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 3); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -954,6 +1026,8 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_256; phys_addr_t min_addr; phys_addr_t max_addr; @@ -968,14 +1042,16 @@ static int alloc_try_nid_bottom_up_cap_max_check(void) NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, min_addr); - assert(rgn->size == size); - assert(rgn->base == min_addr); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -991,6 +1067,8 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) void *allocated_ptr = NULL; char *b; + PREFIX_PUSH(); + phys_addr_t size = SZ_1K; phys_addr_t min_addr; phys_addr_t max_addr; @@ -1005,14 +1083,16 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) NUMA_NO_NODE); b = (char *)allocated_ptr; - assert(allocated_ptr); - assert(*b == 0); + ASSERT_NE(allocated_ptr, NULL); + ASSERT_EQ(*b, 0); + + ASSERT_EQ(rgn->size, size); + ASSERT_EQ(rgn->base, memblock_start_of_DRAM()); - assert(rgn->size == size); - assert(rgn->base == memblock_start_of_DRAM()); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == size); + test_pass_pop(); return 0; } @@ -1020,6 +1100,7 @@ static int alloc_try_nid_bottom_up_cap_min_check(void) /* Test case wrappers */ static int alloc_try_nid_simple_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_simple_check(); memblock_set_bottom_up(true); @@ -1030,6 +1111,7 @@ static int alloc_try_nid_simple_check(void) static int alloc_try_nid_misaligned_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_end_misaligned_check(); memblock_set_bottom_up(true); @@ -1040,6 +1122,7 @@ static int alloc_try_nid_misaligned_check(void) static int alloc_try_nid_narrow_range_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_narrow_range_check(); memblock_set_bottom_up(true); @@ -1050,6 +1133,7 @@ static int alloc_try_nid_narrow_range_check(void) static int alloc_try_nid_reserved_with_space_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_reserved_with_space_check(); memblock_set_bottom_up(true); @@ -1060,6 +1144,7 @@ static int alloc_try_nid_reserved_with_space_check(void) static int alloc_try_nid_reserved_no_space_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_reserved_no_space_check(); memblock_set_bottom_up(true); @@ -1070,6 +1155,7 @@ static int alloc_try_nid_reserved_no_space_check(void) static int alloc_try_nid_cap_max_check(void) { + test_print("\tRunning %s...\n", __func__); 
memblock_set_bottom_up(false); alloc_try_nid_top_down_cap_max_check(); memblock_set_bottom_up(true); @@ -1080,6 +1166,7 @@ static int alloc_try_nid_cap_max_check(void) static int alloc_try_nid_cap_min_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_top_down_cap_min_check(); memblock_set_bottom_up(true); @@ -1090,6 +1177,7 @@ static int alloc_try_nid_cap_min_check(void) static int alloc_try_nid_min_reserved_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_min_reserved_generic_check(); memblock_set_bottom_up(true); @@ -1100,6 +1188,7 @@ static int alloc_try_nid_min_reserved_check(void) static int alloc_try_nid_max_reserved_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_max_reserved_generic_check(); memblock_set_bottom_up(true); @@ -1110,6 +1199,7 @@ static int alloc_try_nid_max_reserved_check(void) static int alloc_try_nid_exact_address_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_exact_address_generic_check(); memblock_set_bottom_up(true); @@ -1120,6 +1210,7 @@ static int alloc_try_nid_exact_address_check(void) static int alloc_try_nid_reserved_full_merge_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_reserved_full_merge_generic_check(); memblock_set_bottom_up(true); @@ -1130,6 +1221,7 @@ static int alloc_try_nid_reserved_full_merge_check(void) static int alloc_try_nid_reserved_all_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_reserved_all_generic_check(); memblock_set_bottom_up(true); @@ -1140,6 +1232,7 @@ static int alloc_try_nid_reserved_all_check(void) static int alloc_try_nid_low_max_check(void) { + test_print("\tRunning %s...\n", __func__); memblock_set_bottom_up(false); alloc_try_nid_low_max_generic_check(); memblock_set_bottom_up(true); @@ -1150,6 +1243,12 @@ static int alloc_try_nid_low_max_check(void) int memblock_alloc_nid_checks(void) { + const char *func_testing = "memblock_alloc_try_nid"; + + prefix_reset(); + prefix_push(func_testing); + test_print("Running %s tests...\n", func_testing); + reset_memblock_attributes(); dummy_physical_memory_init(); @@ -1170,5 +1269,7 @@ int memblock_alloc_nid_checks(void) dummy_physical_memory_cleanup(); + prefix_pop(); + return 0; } diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index a7bc180316d6..66f46f261e66 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -4,21 +4,29 @@ #include "basic_api.h" #define EXPECTED_MEMBLOCK_REGIONS 128 +#define FUNC_ADD "memblock_add" +#define FUNC_RESERVE "memblock_reserve" +#define FUNC_REMOVE "memblock_remove" +#define FUNC_FREE "memblock_free" static int memblock_initialization_check(void) { - assert(memblock.memory.regions); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.max == EXPECTED_MEMBLOCK_REGIONS); - assert(strcmp(memblock.memory.name, "memory") == 0); + PREFIX_PUSH(); - assert(memblock.reserved.regions); - assert(memblock.reserved.cnt == 1); - assert(memblock.memory.max == EXPECTED_MEMBLOCK_REGIONS); - assert(strcmp(memblock.reserved.name, "reserved") == 0); + ASSERT_NE(memblock.memory.regions, NULL); + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.max, EXPECTED_MEMBLOCK_REGIONS); + ASSERT_EQ(strcmp(memblock.memory.name, 
"memory"), 0); - assert(!memblock.bottom_up); - assert(memblock.current_limit == MEMBLOCK_ALLOC_ANYWHERE); + ASSERT_NE(memblock.reserved.regions, NULL); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.memory.max, EXPECTED_MEMBLOCK_REGIONS); + ASSERT_EQ(strcmp(memblock.reserved.name, "reserved"), 0); + + ASSERT_EQ(memblock.bottom_up, false); + ASSERT_EQ(memblock.current_limit, MEMBLOCK_ALLOC_ANYWHERE); + + test_pass_pop(); return 0; } @@ -40,14 +48,18 @@ static int memblock_add_simple_check(void) .size = SZ_4M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_add(r.base, r.size); - assert(rgn->base == r.base); - assert(rgn->size == r.size); + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, r.size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, r.size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == r.size); + test_pass_pop(); return 0; } @@ -69,18 +81,22 @@ static int memblock_add_node_simple_check(void) .size = SZ_16M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_add_node(r.base, r.size, 1, MEMBLOCK_HOTPLUG); - assert(rgn->base == r.base); - assert(rgn->size == r.size); + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, r.size); #ifdef CONFIG_NUMA - assert(rgn->nid == 1); + ASSERT_EQ(rgn->nid, 1); #endif - assert(rgn->flags == MEMBLOCK_HOTPLUG); + ASSERT_EQ(rgn->flags, MEMBLOCK_HOTPLUG); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, r.size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == r.size); + test_pass_pop(); return 0; } @@ -113,18 +129,22 @@ static int memblock_add_disjoint_check(void) .size = SZ_8K }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_add(r1.base, r1.size); memblock_add(r2.base, r2.size); - assert(rgn1->base == r1.base); - assert(rgn1->size == r1.size); + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1.size); + + ASSERT_EQ(rgn2->base, r2.base); + ASSERT_EQ(rgn2->size, r2.size); - assert(rgn2->base == r2.base); - assert(rgn2->size == r2.size); + ASSERT_EQ(memblock.memory.cnt, 2); + ASSERT_EQ(memblock.memory.total_size, r1.size + r2.size); - assert(memblock.memory.cnt == 2); - assert(memblock.memory.total_size == r1.size + r2.size); + test_pass_pop(); return 0; } @@ -162,17 +182,21 @@ static int memblock_add_overlap_top_check(void) .size = SZ_512M }; + PREFIX_PUSH(); + total_size = (r1.base - r2.base) + r1.size; reset_memblock_regions(); memblock_add(r1.base, r1.size); memblock_add(r2.base, r2.size); - assert(rgn->base == r2.base); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r2.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == total_size); + test_pass_pop(); return 0; } @@ -210,17 +234,21 @@ static int memblock_add_overlap_bottom_check(void) .size = SZ_1G }; + PREFIX_PUSH(); + total_size = (r2.base - r1.base) + r2.size; reset_memblock_regions(); memblock_add(r1.base, r1.size); memblock_add(r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == total_size); + test_pass_pop(); return 0; } @@ -255,15 +283,19 @@ static int memblock_add_within_check(void) .size = 
SZ_1M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_add(r1.base, r1.size); memblock_add(r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == r1.size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, r1.size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, r1.size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == r1.size); + test_pass_pop(); return 0; } @@ -279,19 +311,27 @@ static int memblock_add_twice_check(void) .size = SZ_2M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_add(r.base, r.size); memblock_add(r.base, r.size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == r.size); + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, r.size); + + test_pass_pop(); return 0; } static int memblock_add_checks(void) { + prefix_reset(); + prefix_push(FUNC_ADD); + test_print("Running %s tests...\n", FUNC_ADD); + memblock_add_simple_check(); memblock_add_node_simple_check(); memblock_add_disjoint_check(); @@ -300,6 +340,8 @@ static int memblock_add_checks(void) memblock_add_within_check(); memblock_add_twice_check(); + prefix_pop(); + return 0; } @@ -320,11 +362,15 @@ static int memblock_reserve_simple_check(void) .size = SZ_128M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_reserve(r.base, r.size); - assert(rgn->base == r.base); - assert(rgn->size == r.size); + ASSERT_EQ(rgn->base, r.base); + ASSERT_EQ(rgn->size, r.size); + + test_pass_pop(); return 0; } @@ -356,18 +402,22 @@ static int memblock_reserve_disjoint_check(void) .size = SZ_512M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - assert(rgn1->base == r1.base); - assert(rgn1->size == r1.size); + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1.size); + + ASSERT_EQ(rgn2->base, r2.base); + ASSERT_EQ(rgn2->size, r2.size); - assert(rgn2->base == r2.base); - assert(rgn2->size == r2.size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, r1.size + r2.size); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == r1.size + r2.size); + test_pass_pop(); return 0; } @@ -406,17 +456,21 @@ static int memblock_reserve_overlap_top_check(void) .size = SZ_1G }; + PREFIX_PUSH(); + total_size = (r1.base - r2.base) + r1.size; reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - assert(rgn->base == r2.base); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r2.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -455,17 +509,21 @@ static int memblock_reserve_overlap_bottom_check(void) .size = SZ_128K }; + PREFIX_PUSH(); + total_size = (r2.base - r1.base) + r2.size; reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + test_pass_pop(); return 0; } @@ -502,15 +560,19 @@ static int memblock_reserve_within_check(void) .size = SZ_64K }; + 
PREFIX_PUSH(); + reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == r1.size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, r1.size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, r1.size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == r1.size); + test_pass_pop(); return 0; } @@ -527,19 +589,27 @@ static int memblock_reserve_twice_check(void) .size = SZ_2M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_reserve(r.base, r.size); memblock_reserve(r.base, r.size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == r.size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, r.size); + + test_pass_pop(); return 0; } static int memblock_reserve_checks(void) { + prefix_reset(); + prefix_push(FUNC_RESERVE); + test_print("Running %s tests...\n", FUNC_RESERVE); + memblock_reserve_simple_check(); memblock_reserve_disjoint_check(); memblock_reserve_overlap_top_check(); @@ -547,6 +617,8 @@ static int memblock_reserve_checks(void) memblock_reserve_within_check(); memblock_reserve_twice_check(); + prefix_pop(); + return 0; } @@ -581,16 +653,20 @@ static int memblock_remove_simple_check(void) .size = SZ_4M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_add(r1.base, r1.size); memblock_add(r2.base, r2.size); memblock_remove(r1.base, r1.size); - assert(rgn->base == r2.base); - assert(rgn->size == r2.size); + ASSERT_EQ(rgn->base, r2.base); + ASSERT_EQ(rgn->size, r2.size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == r2.size); + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, r2.size); + + test_pass_pop(); return 0; } @@ -626,15 +702,19 @@ static int memblock_remove_absent_check(void) .size = SZ_1G }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_add(r1.base, r1.size); memblock_remove(r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == r1.size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, r1.size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, r1.size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == r1.size); + test_pass_pop(); return 0; } @@ -674,6 +754,8 @@ static int memblock_remove_overlap_top_check(void) .size = SZ_32M }; + PREFIX_PUSH(); + r1_end = r1.base + r1.size; r2_end = r2.base + r2.size; total_size = r1_end - r2_end; @@ -682,11 +764,13 @@ static int memblock_remove_overlap_top_check(void) memblock_add(r1.base, r1.size); memblock_remove(r2.base, r2.size); - assert(rgn->base == r1.base + r2.base); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r1.base + r2.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == total_size); + test_pass_pop(); return 0; } @@ -724,17 +808,22 @@ static int memblock_remove_overlap_bottom_check(void) .size = SZ_256M }; + PREFIX_PUSH(); + total_size = r2.base - r1.base; reset_memblock_regions(); memblock_add(r1.base, r1.size); memblock_remove(r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); + + ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.total_size, total_size); + + 
test_pass_pop(); - assert(memblock.memory.cnt == 1); - assert(memblock.memory.total_size == total_size); return 0; } @@ -774,6 +863,8 @@ static int memblock_remove_within_check(void) .size = SZ_1M }; + PREFIX_PUSH(); + r1_size = r2.base - r1.base; r2_size = (r1.base + r1.size) - (r2.base + r2.size); total_size = r1_size + r2_size; @@ -782,26 +873,34 @@ static int memblock_remove_within_check(void) memblock_add(r1.base, r1.size); memblock_remove(r2.base, r2.size); - assert(rgn1->base == r1.base); - assert(rgn1->size == r1_size); + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1_size); + + ASSERT_EQ(rgn2->base, r2.base + r2.size); + ASSERT_EQ(rgn2->size, r2_size); - assert(rgn2->base == r2.base + r2.size); - assert(rgn2->size == r2_size); + ASSERT_EQ(memblock.memory.cnt, 2); + ASSERT_EQ(memblock.memory.total_size, total_size); - assert(memblock.memory.cnt == 2); - assert(memblock.memory.total_size == total_size); + test_pass_pop(); return 0; } static int memblock_remove_checks(void) { + prefix_reset(); + prefix_push(FUNC_REMOVE); + test_print("Running %s tests...\n", FUNC_REMOVE); + memblock_remove_simple_check(); memblock_remove_absent_check(); memblock_remove_overlap_top_check(); memblock_remove_overlap_bottom_check(); memblock_remove_within_check(); + prefix_pop(); + return 0; } @@ -835,16 +934,20 @@ static int memblock_free_simple_check(void) .size = SZ_1M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_reserve(r2.base, r2.size); memblock_free((void *)r1.base, r1.size); - assert(rgn->base == r2.base); - assert(rgn->size == r2.size); + ASSERT_EQ(rgn->base, r2.base); + ASSERT_EQ(rgn->size, r2.size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, r2.size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == r2.size); + test_pass_pop(); return 0; } @@ -880,15 +983,19 @@ static int memblock_free_absent_check(void) .size = SZ_128M }; + PREFIX_PUSH(); + reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_free((void *)r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == r1.size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, r1.size); + + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, r1.size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == r1.size); + test_pass_pop(); return 0; } @@ -928,17 +1035,21 @@ static int memblock_free_overlap_top_check(void) .size = SZ_8M }; + PREFIX_PUSH(); + total_size = (r1.size + r1.base) - (r2.base + r2.size); reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_free((void *)r2.base, r2.size); - assert(rgn->base == r2.base + r2.size); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r2.base + r2.size); + ASSERT_EQ(rgn->size, total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -973,17 +1084,21 @@ static int memblock_free_overlap_bottom_check(void) .size = SZ_32M }; + PREFIX_PUSH(); + total_size = r2.base - r1.base; reset_memblock_regions(); memblock_reserve(r1.base, r1.size); memblock_free((void *)r2.base, r2.size); - assert(rgn->base == r1.base); - assert(rgn->size == total_size); + ASSERT_EQ(rgn->base, r1.base); + ASSERT_EQ(rgn->size, total_size); - assert(memblock.reserved.cnt == 1); - assert(memblock.reserved.total_size == 
total_size); + ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } @@ -1024,6 +1139,8 @@ static int memblock_free_within_check(void) .size = SZ_1M }; + PREFIX_PUSH(); + r1_size = r2.base - r1.base; r2_size = (r1.base + r1.size) - (r2.base + r2.size); total_size = r1_size + r2_size; @@ -1032,26 +1149,34 @@ static int memblock_free_within_check(void) memblock_reserve(r1.base, r1.size); memblock_free((void *)r2.base, r2.size); - assert(rgn1->base == r1.base); - assert(rgn1->size == r1_size); + ASSERT_EQ(rgn1->base, r1.base); + ASSERT_EQ(rgn1->size, r1_size); - assert(rgn2->base == r2.base + r2.size); - assert(rgn2->size == r2_size); + ASSERT_EQ(rgn2->base, r2.base + r2.size); + ASSERT_EQ(rgn2->size, r2_size); - assert(memblock.reserved.cnt == 2); - assert(memblock.reserved.total_size == total_size); + ASSERT_EQ(memblock.reserved.cnt, 2); + ASSERT_EQ(memblock.reserved.total_size, total_size); + + test_pass_pop(); return 0; } static int memblock_free_checks(void) { + prefix_reset(); + prefix_push(FUNC_FREE); + test_print("Running %s tests...\n", FUNC_FREE); + memblock_free_simple_check(); memblock_free_absent_check(); memblock_free_overlap_top_check(); memblock_free_overlap_bottom_check(); memblock_free_within_check(); + prefix_pop(); + return 0; } diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index 62d3191f7c9a..ebc06b4c3255 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -4,8 +4,12 @@ #define INIT_MEMBLOCK_REGIONS 128 #define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS +#define PREFIXES_MAX 15 +#define DELIM ": " static struct test_memory memory_block; +static const char __maybe_unused *prefixes[PREFIXES_MAX]; +static int __maybe_unused nr_prefixes; void reset_memblock_regions(void) { @@ -46,3 +50,56 @@ void dummy_physical_memory_cleanup(void) { free(memory_block.base); } + +#ifdef VERBOSE +void print_prefixes(const char *postfix) +{ + for (int i = 0; i < nr_prefixes; i++) + test_print("%s%s", prefixes[i], DELIM); + test_print(postfix); +} + +void test_fail(void) +{ + ksft_test_result_fail(": "); + print_prefixes("failed\n"); +} + +void test_pass(void) +{ + ksft_test_result_pass(": "); + print_prefixes("passed\n"); +} + +void test_print(const char *fmt, ...) 
+{ + int saved_errno = errno; + va_list args; + + va_start(args, fmt); + errno = saved_errno; + vprintf(fmt, args); + va_end(args); +} + +void prefix_reset(void) +{ + memset(prefixes, 0, PREFIXES_MAX * sizeof(char *)); + nr_prefixes = 0; +} + +void prefix_push(const char *prefix) +{ + assert(nr_prefixes < PREFIXES_MAX); + prefixes[nr_prefixes] = prefix; + nr_prefixes++; +} + +void prefix_pop(void) +{ + if (nr_prefixes > 0) { + prefixes[nr_prefixes - 1] = 0; + nr_prefixes--; + } +} +#endif /* VERBOSE */ diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 619054d03219..46de86a755f3 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -7,9 +7,49 @@ #include #include #include +#include +#include <../selftests/kselftest.h> #define MEM_SIZE SZ_16K +/** + * ASSERT_EQ(): + * Check the condition + * @_expected == @_seen + * If false, print failed test message (if in VERBOSE mode) and then assert + */ +#define ASSERT_EQ(_expected, _seen) do { \ + if ((_expected) != (_seen)) \ + test_fail(); \ + assert((_expected) == (_seen)); \ +} while (0) + +/** + * ASSERT_NE(): + * Check the condition + * @_expected != @_seen + * If false, print failed test message (if in VERBOSE mode) and then assert + */ +#define ASSERT_NE(_expected, _seen) do { \ + if ((_expected) == (_seen)) \ + test_fail(); \ + assert((_expected) != (_seen)); \ +} while (0) + +/** + * ASSERT_LT(): + * Check the condition + * @_expected < @_seen + * If false, print failed test message (if in VERBOSE mode) and then assert + */ +#define ASSERT_LT(_expected, _seen) do { \ + if ((_expected) >= (_seen)) \ + test_fail(); \ + assert((_expected) < (_seen)); \ +} while (0) + +#define PREFIX_PUSH() prefix_push(__func__) + /* * Available memory registered with memblock needs to be valid for allocs * test to run. This is a convenience wrapper for memory allocated in @@ -31,4 +71,26 @@ void setup_memblock(void); void dummy_physical_memory_init(void); void dummy_physical_memory_cleanup(void); +#ifdef VERBOSE +void test_fail(void); +void test_pass(void); +void test_print(const char *fmt, ...); +void prefix_reset(void); +void prefix_push(const char *prefix); +void prefix_pop(void); +#else +static inline void test_fail(void) {} +static inline void test_pass(void) {} +static inline void test_print(const char *fmt, ...) {} +static inline void prefix_reset(void) {} +static inline void prefix_push(const char *prefix) {} +static inline void prefix_pop(void) {} +#endif /* VERBOSE */ + +static inline void test_pass_pop(void) +{ + test_pass(); + prefix_pop(); +} + #endif -- cgit v1.2.3 From c55b31a124a68171e5915b1036ca42d8a683eca2 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sun, 3 Jul 2022 23:06:57 -0500 Subject: memblock tests: set memblock_debug to enable memblock_dbg() messages If Memblock simulator was compiled with MEMBLOCK_DEBUG=1, set memblock_debug to 1 so that memblock_dbg() will print debug information when memblock functions are tested in Memblock simulator. 
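For reference, the compile-time gating pattern this enables can be sketched in plain user-space C as follows. The macro and variable names mirror the patch, but the printf() fallback and the demo main() are illustrative assumptions, not code from the tree:

  #include <stdio.h>

  /* Compile with -DMEMBLOCK_DEBUG to turn the debug output on. */
  #ifdef MEMBLOCK_DEBUG
  static int memblock_debug = 1;
  #else
  static int memblock_debug;
  #endif

  #define memblock_dbg(fmt, ...) \
  	do { \
  		if (memblock_debug) \
  			printf(fmt, ##__VA_ARGS__); \
  	} while (0)

  int main(void)
  {
  	/* printed only in a MEMBLOCK_DEBUG=1 build */
  	memblock_dbg("memblock_add: [%#lx-%#lx]\n", 0x1000UL, 0x1fffUL);
  	return 0;
  }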
Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/aee4200cce1c09992ed055006a81fde1b6b5b567.1656907314.git.remckee0@gmail.com --- tools/testing/memblock/internal.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index 94b52a8718b5..c2a492c05e0c 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -2,6 +2,13 @@ #ifndef _MM_INTERNAL_H #define _MM_INTERNAL_H +/* + * Enable memblock_dbg() messages + */ +#ifdef MEMBLOCK_DEBUG +static int memblock_debug = 1; +#endif + struct page {}; void memblock_free_pages(struct page *page, unsigned long pfn, -- cgit v1.2.3 From fe833b4edc594a4a6f1ce2d68975733e17b8f8ed Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sun, 3 Jul 2022 23:06:58 -0500 Subject: memblock tests: remove completed TODO items Remove completed items from TODO list. Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/6a3e74fcb51a07e8d9fbbcbe84bdb8aa8b00e843.1656907314.git.remckee0@gmail.com --- tools/testing/memblock/TODO | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/TODO b/tools/testing/memblock/TODO index cd1a30d5acc9..33044c634ea7 100644 --- a/tools/testing/memblock/TODO +++ b/tools/testing/memblock/TODO @@ -1,25 +1,17 @@ TODO ===== -1. Add verbose output (e.g., what is being tested and how many tests cases are - passing) - -2. Add flags to Makefile: - + verbosity level - + enable memblock_dbg() messages (i.e. pass "-D CONFIG_DEBUG_MEMORY_INIT" - flag) - -3. Add tests trying to memblock_add() or memblock_reserve() 129th region. +1. Add tests trying to memblock_add() or memblock_reserve() 129th region. This will trigger memblock_double_array(), make sure it succeeds. *Important:* These tests require valid memory ranges, use dummy physical memory block from common.c to implement them. It is also very likely that the current MEM_SIZE won't be enough for these test cases. Use realloc to adjust the size accordingly. -4. Add test cases using this functions (implement them for both directions): +2. Add test cases using this functions (implement them for both directions): + memblock_alloc_raw() + memblock_alloc_exact_nid_raw() + memblock_alloc_try_nid_raw() -5. Add tests for memblock_alloc_node() to check if the correct NUMA node is set +3. Add tests for memblock_alloc_node() to check if the correct NUMA node is set for the new region -- cgit v1.2.3 From e95ab1d852897a0b697cd0fb609d496ce97fff3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 2 Jul 2022 08:48:18 -0700 Subject: selftests: net: af_unix: Test connect() with different netns. 
This patch add a test that checks connect()ivity between two sockets: unnamed socket -> bound socket * SOCK_STREAM or SOCK_DGRAM * pathname or abstract * same or different netns Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/af_unix/Makefile | 3 +- tools/testing/selftests/net/af_unix/unix_connect.c | 149 +++++++++++++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/af_unix/unix_connect.c (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index a29f79618934..1257baa79286 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -37,3 +37,4 @@ gro ioam6_parser toeplitz cmsg_sender +unix_connect \ No newline at end of file diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index df341648f818..969620ae9928 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -1,2 +1,3 @@ -TEST_GEN_PROGS := test_unix_oob +TEST_GEN_PROGS := test_unix_oob unix_connect + include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/unix_connect.c b/tools/testing/selftests/net/af_unix/unix_connect.c new file mode 100644 index 000000000000..157e44ef7f37 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/unix_connect.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include + +#include +#include + +#include +#include + +#include "../../kselftest_harness.h" + +FIXTURE(unix_connect) +{ + int server, client; + int family; +}; + +FIXTURE_VARIANT(unix_connect) +{ + int type; + char sun_path[8]; + int len; + int flags; + int err; +}; + +FIXTURE_VARIANT_ADD(unix_connect, stream_pathname) +{ + .type = SOCK_STREAM, + .sun_path = "test", + .len = 4 + 1, + .flags = 0, + .err = 0, +}; + +FIXTURE_VARIANT_ADD(unix_connect, stream_abstract) +{ + .type = SOCK_STREAM, + .sun_path = "\0test", + .len = 5, + .flags = 0, + .err = 0, +}; + +FIXTURE_VARIANT_ADD(unix_connect, stream_pathname_netns) +{ + .type = SOCK_STREAM, + .sun_path = "test", + .len = 4 + 1, + .flags = CLONE_NEWNET, + .err = 0, +}; + +FIXTURE_VARIANT_ADD(unix_connect, stream_abstract_netns) +{ + .type = SOCK_STREAM, + .sun_path = "\0test", + .len = 5, + .flags = CLONE_NEWNET, + .err = ECONNREFUSED, +}; + +FIXTURE_VARIANT_ADD(unix_connect, dgram_pathname) +{ + .type = SOCK_DGRAM, + .sun_path = "test", + .len = 4 + 1, + .flags = 0, + .err = 0, +}; + +FIXTURE_VARIANT_ADD(unix_connect, dgram_abstract) +{ + .type = SOCK_DGRAM, + .sun_path = "\0test", + .len = 5, + .flags = 0, + .err = 0, +}; + +FIXTURE_VARIANT_ADD(unix_connect, dgram_pathname_netns) +{ + .type = SOCK_DGRAM, + .sun_path = "test", + .len = 4 + 1, + .flags = CLONE_NEWNET, + .err = 0, +}; + +FIXTURE_VARIANT_ADD(unix_connect, dgram_abstract_netns) +{ + .type = SOCK_DGRAM, + .sun_path = "\0test", + .len = 5, + .flags = CLONE_NEWNET, + .err = ECONNREFUSED, +}; + +FIXTURE_SETUP(unix_connect) +{ + self->family = AF_UNIX; +} + +FIXTURE_TEARDOWN(unix_connect) +{ + close(self->server); + close(self->client); + + if (variant->sun_path[0]) + remove("test"); +} + +#define offsetof(type, member) ((size_t)&((type *)0)->(member)) + +TEST_F(unix_connect, test) +{ + socklen_t addrlen; + struct sockaddr_un addr = { + .sun_family = self->family, + }; + int err; + + self->server = socket(self->family, variant->type, 0); + 
ASSERT_NE(-1, self->server); + + addrlen = offsetof(struct sockaddr_un, sun_path) + variant->len; + memcpy(&addr.sun_path, variant->sun_path, variant->len); + + err = bind(self->server, (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, err); + + if (variant->type == SOCK_STREAM) { + err = listen(self->server, 32); + ASSERT_EQ(0, err); + } + + err = unshare(variant->flags); + ASSERT_EQ(0, err); + + self->client = socket(self->family, variant->type, 0); + ASSERT_LT(0, self->client); + + err = connect(self->client, (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(variant->err, err == -1 ? errno : 0); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From b8e629b05f5d23f9649c901bef09fab8b0c2e4b9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 3 Jul 2022 10:36:24 +0300 Subject: selftests: forwarding: fix flood_unicast_test when h2 supports IFF_UNICAST_FLT As mentioned in the blamed commit, flood_unicast_test() works by checking the match count on a tc filter placed on the receiving interface. But the second host interface (host2_if) has no interest in receiving a packet with MAC DA de:ad:be:ef:13:37, so its RX filter drops it even before the ingress tc filter gets to be executed. So we will incorrectly get the message "Packet was not flooded when should", when in fact, the packet was flooded as expected but dropped due to an unrelated reason, at some other layer on the receiving side. Force h2 to accept this packet by temporarily placing it in promiscuous mode. Alternatively we could either deliver to its MAC address or use tcpdump_start, but this has the fewest complications. This fixes the "flooding" test from bridge_vlan_aware.sh and bridge_vlan_unaware.sh, which calls flood_test from the lib. Fixes: 236dd50bf67a ("selftests: forwarding: Add a test for flooded traffic") Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 37ae49d47853..4e69497f021f 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1306,6 +1306,7 @@ flood_test_do() # Add an ACL on `host2_if` which will tell us whether the packet # was flooded to it or not. + ip link set $host2_if promisc on tc qdisc add dev $host2_if ingress tc filter add dev $host2_if ingress protocol ip pref 1 handle 101 \ flower dst_mac $mac action drop @@ -1323,6 +1324,7 @@ flood_test_do() tc filter del dev $host2_if ingress protocol ip pref 1 handle 101 flower tc qdisc del dev $host2_if ingress + ip link set $host2_if promisc off return $err } -- cgit v1.2.3 From 1a635d3e1c80626237fdae47a5545b6655d8d81c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 3 Jul 2022 10:36:25 +0300 Subject: selftests: forwarding: fix learning_test when h1 supports IFF_UNICAST_FLT The first host interface has by default no interest in receiving packets MAC DA de:ad:be:ef:13:37, so it might drop them before they hit the tc filter and this might confuse the selftest. Enable promiscuous mode such that the filter properly counts received packets. 
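For context, the same knob the test flips with "ip link set ... promisc on" can also be toggled from C via the classic SIOCGIFFLAGS/SIOCSIFFLAGS ioctls. A minimal sketch follows; the helper name set_promisc() is ours, not something from the tree, and it assumes CAP_NET_ADMIN:

  #include <net/if.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <sys/socket.h>
  #include <unistd.h>

  /* C equivalent of "ip link set $dev promisc on|off" */
  static int set_promisc(const char *dev, int on)
  {
  	struct ifreq ifr = {0};
  	int fd, err;

  	fd = socket(AF_INET, SOCK_DGRAM, 0);
  	if (fd < 0)
  		return -1;

  	strncpy(ifr.ifr_name, dev, IFNAMSIZ - 1);
  	err = ioctl(fd, SIOCGIFFLAGS, &ifr);
  	if (!err) {
  		if (on)
  			ifr.ifr_flags |= IFF_PROMISC;
  		else
  			ifr.ifr_flags &= ~IFF_PROMISC;
  		err = ioctl(fd, SIOCSIFFLAGS, &ifr);
  	}
  	close(fd);
  	return err;
  }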
Fixes: d4deb01467ec ("selftests: forwarding: Add a test for FDB learning") Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 4e69497f021f..26ba8b5d264a 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1240,6 +1240,7 @@ learning_test() # FDB entry was installed. bridge link set dev $br_port1 flood off + ip link set $host1_if promisc on tc qdisc add dev $host1_if ingress tc filter add dev $host1_if ingress protocol ip pref 1 handle 101 \ flower dst_mac $mac action drop @@ -1289,6 +1290,7 @@ learning_test() tc filter del dev $host1_if ingress protocol ip pref 1 handle 101 flower tc qdisc del dev $host1_if ingress + ip link set $host1_if promisc off bridge link set dev $br_port1 flood on -- cgit v1.2.3 From 83844aacab2015da1dba1df0cc61fc4b4c4e8076 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 3 Jul 2022 10:36:26 +0300 Subject: selftests: forwarding: fix error message in learning_test When packets are not received, they aren't received on $host1_if, so the message talking about the second host not receiving them is incorrect. Fix it. Fixes: d4deb01467ec ("selftests: forwarding: Add a test for FDB learning") Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 26ba8b5d264a..3ffb9d6c0950 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1251,7 +1251,7 @@ learning_test() tc -j -s filter show dev $host1_if ingress \ | jq -e ".[] | select(.options.handle == 101) \ | select(.options.actions[0].stats.packets == 1)" &> /dev/null - check_fail $? "Packet reached second host when should not" + check_fail $? "Packet reached first host when should not" $MZ $host1_if -c 1 -p 64 -a $mac -t ip -q sleep 1 -- cgit v1.2.3 From 990a6194f7e16cc23334892287f32899e241b1a9 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 1 Jul 2022 10:38:05 +0100 Subject: bpftool: Rename "bpftool feature list" into "... feature list_builtins" To make it more explicit that the features listed with "bpftool feature list" are known to bpftool, but not necessary available on the system (as opposed to the probed features), rename the "feature list" command into "feature list_builtins". Note that "bpftool feature list" still works as before given that we recognise arguments from their prefixes; but the real name of the subcommand, in particular as displayed in the man page or the interactive help, will now include "_builtins". Since we update the bash completion accordingly, let's also take this chance to redirect error output to /dev/null in the completion script, to avoid displaying unexpected error messages when users attempt to tab-complete. 
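The prefix matching mentioned above can be illustrated with a short stand-alone sketch; this is a simplified stand-in for bpftool's actual matching helper, not its real code:

  #include <stddef.h>
  #include <stdio.h>
  #include <string.h>

  /* Resolve a possibly abbreviated command name; NULL if unknown or ambiguous. */
  static const char *match_cmd(const char * const *cmds, size_t n, const char *arg)
  {
  	size_t i, len = strlen(arg);
  	const char *found = NULL;

  	for (i = 0; i < n; i++) {
  		if (!strncmp(cmds[i], arg, len)) {
  			if (found)
  				return NULL; /* ambiguous prefix */
  			found = cmds[i];
  		}
  	}
  	return found;
  }

  int main(void)
  {
  	const char * const cmds[] = { "probe", "list_builtins", "help" };

  	/* "list" still resolves to the renamed subcommand */
  	printf("%s\n", match_cmd(cmds, 3, "list"));
  	return 0;
  }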
Suggested-by: Daniel Borkmann Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220701093805.16920-1-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool-feature.rst | 4 ++-- tools/bpf/bpftool/bash-completion/bpftool | 8 ++++---- tools/bpf/bpftool/feature.c | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index c08064628d39..e44039f89be7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -24,7 +24,7 @@ FEATURE COMMANDS ================ | **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] -| **bpftool** **feature list** *GROUP* +| **bpftool** **feature list_builtins** *GROUP* | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } @@ -72,7 +72,7 @@ DESCRIPTION The keywords **full**, **macros** and **prefix** have the same role as when probing the kernel. - **bpftool feature list** *GROUP* + **bpftool feature list_builtins** *GROUP* List items known to bpftool. These can be BPF program types (**prog_types**), BPF map types (**map_types**), attach types (**attach_types**), link types (**link_types**), or BPF helper diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index ee177f83b179..dc1641e3670e 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -703,7 +703,7 @@ _bpftool() return 0 ;; type) - local BPFTOOL_MAP_CREATE_TYPES="$(bpftool feature list map_types | \ + local BPFTOOL_MAP_CREATE_TYPES="$(bpftool feature list_builtins map_types 2>/dev/null | \ grep -v '^unspec$')" COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) return 0 @@ -1032,7 +1032,7 @@ _bpftool() return 0 ;; attach|detach) - local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list attach_types | \ + local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ grep '^cgroup_')" local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' @@ -1162,14 +1162,14 @@ _bpftool() _bpftool_once_attr 'full unprivileged' return 0 ;; - list) + list_builtins) [[ $prev != "$command" ]] && return 0 COMPREPLY=( $( compgen -W 'prog_types map_types \ attach_types link_types helpers' -- "$cur" ) ) ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help list probe' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help list_builtins probe' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 576cc6b90c6a..7ecabf7947fb 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1266,7 +1266,7 @@ static const char *get_helper_name(unsigned int id) return helper_name[id]; } -static int do_list(int argc, char **argv) +static int do_list_builtins(int argc, char **argv) { const char *(*get_name)(unsigned int id); unsigned int id = 0; @@ -1319,7 +1319,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" - " %1$s %2$s list GROUP\n" + " %1$s %2$s list_builtins GROUP\n" " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" @@ -1332,9 +1332,9 @@ static int do_help(int argc, char **argv) } static const struct cmd cmds[] = 
{ - { "probe", do_probe }, - { "list", do_list }, - { "help", do_help }, + { "probe", do_probe }, + { "list_builtins", do_list_builtins }, + { "help", do_help }, { 0 } }; -- cgit v1.2.3 From 5352ebf735064373dc2dd96b39c0fbf347db0095 Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Mon, 27 Jun 2022 13:21:48 +0530 Subject: tools/testing/crypto: Use vzalloc instead of vmalloc+memset This fixes the corresponding coccinelle warning. Signed-off-by: Siddh Raman Pant Signed-off-by: Harald Freudenberger Link: https://lore.kernel.org/r/20220627075148.140705-1-code@siddh.me [agordeev@linux.ibm.com added Link] Signed-off-by: Alexander Gordeev --- tools/testing/crypto/chacha20-s390/test-cipher.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/crypto/chacha20-s390/test-cipher.c b/tools/testing/crypto/chacha20-s390/test-cipher.c index 34e8b855266f..8141d45df51a 100644 --- a/tools/testing/crypto/chacha20-s390/test-cipher.c +++ b/tools/testing/crypto/chacha20-s390/test-cipher.c @@ -252,29 +252,26 @@ static int __init chacha_s390_test_init(void) memset(plain, 'a', data_size); get_random_bytes(plain, (data_size > 256 ? 256 : data_size)); - cipher_generic = vmalloc(data_size); + cipher_generic = vzalloc(data_size); if (!cipher_generic) { pr_info("could not allocate cipher_generic buffer\n"); ret = -2; goto out; } - memset(cipher_generic, 0, data_size); - cipher_s390 = vmalloc(data_size); + cipher_s390 = vzalloc(data_size); if (!cipher_s390) { pr_info("could not allocate cipher_s390 buffer\n"); ret = -2; goto out; } - memset(cipher_s390, 0, data_size); - revert = vmalloc(data_size); + revert = vzalloc(data_size); if (!revert) { pr_info("could not allocate revert buffer\n"); ret = -2; goto out; } - memset(revert, 0, data_size); if (debug) print_hex_dump(KERN_INFO, "src: ", DUMP_PREFIX_OFFSET, -- cgit v1.2.3 From 3c660a5d86f4c01cf641bfea004a49f5860a5bed Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:18 +0000 Subject: bpf: Introduce TYPE_MATCH related constants/macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to provide type match support we require a new type of relocation which, in turn, requires toolchain support. Recent LLVM/Clang versions support a new value for the last argument to the __builtin_preserve_type_info builtin, for example. With this change we introduce the necessary constants into relevant header files, mirroring what the compiler may support. 
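To make the intended use concrete, here is a minimal sketch of a BPF program consuming the new value through the builtin. It anticipates the bpf_core_type_matches() convenience macro added later in this series, assumes a clang with the support from https://reviews.llvm.org/D126838, and the SEC name and the choice of task_struct are arbitrary:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_core_read.h>

  SEC("raw_tracepoint/sys_enter")
  int probe_type_match(void *ctx)
  {
  	bool match;

  	/* emits a BPF_CORE_TYPE_MATCHES relocation against task_struct */
  	match = __builtin_preserve_type_info(*(struct task_struct *)0,
  					     BPF_TYPE_MATCHES);
  	bpf_printk("task_struct matches: %d", match);
  	return 0;
  }

  char LICENSE[] SEC("license") = "GPL";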
Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-2-deso@posteo.net --- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/bpf_core_read.h | 1 + 3 files changed, 3 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ad9e7311c4cf..379e68fb866f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind { BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ }; /* diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ad9e7311c4cf..379e68fb866f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind { BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ }; /* diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index fd48b1ff59ca..2308f4990e96 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -29,6 +29,7 @@ enum bpf_type_id_kind { enum bpf_type_info_kind { BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ BPF_TYPE_SIZE = 1, /* type size in target kernel */ + BPF_TYPE_MATCHES = 2, /* type match in target kernel */ }; /* second argument to __builtin_preserve_enum_value() built-in */ -- cgit v1.2.3 From 633e7ceb2cbbae9b2f5ca69106b0de65728c5988 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:19 +0000 Subject: bpftool: Honor BPF_CORE_TYPE_MATCHES relocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpftool needs to know about the newly introduced BPF_CORE_TYPE_MATCHES relocation for its 'gen min_core_btf' command to work properly in the present of this relocation. Specifically, we need to make sure to mark types and fields so that they are present in the minimized BTF for "type match" checks to work out. However, contrary to the existing btfgen_record_field_relo, we need to rely on the BTF -- and not the spec -- to find fields. With this change we handle this new variant correctly. The functionality will be tested with follow on changes to BPF selftests, which already run against a minimized BTF created with bpftool. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220628160127.607834-3-deso@posteo.net --- tools/bpf/bpftool/gen.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 480cbd859359..3d35fbc5fe16 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1856,6 +1856,112 @@ static int btfgen_record_field_relo(struct btfgen_info *info, struct bpf_core_sp return 0; } +/* Mark types, members, and member types. Compared to btfgen_record_field_relo, + * this function does not rely on the target spec for inferring members, but + * uses the associated BTF. + * + * The `behind_ptr` argument is used to stop marking of composite types reached + * through a pointer. 
This way, we can keep BTF size in check while providing + * reasonable match semantics. + */ +static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool behind_ptr) +{ + const struct btf_type *btf_type; + struct btf *btf = info->src_btf; + struct btf_type *cloned_type; + int i, err; + + if (type_id == 0) + return 0; + + btf_type = btf__type_by_id(btf, type_id); + /* mark type on cloned BTF as used */ + cloned_type = (struct btf_type *)btf__type_by_id(info->marked_btf, type_id); + cloned_type->name_off = MARKED; + + switch (btf_kind(btf_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(btf_type); + __u16 vlen = btf_vlen(btf_type); + + if (behind_ptr) + break; + + for (i = 0; i < vlen; i++, m++) { + /* mark member */ + btfgen_mark_member(info, type_id, i); + + /* mark member's type */ + err = btfgen_mark_type_match(info, m->type, false); + if (err) + return err; + } + break; + } + case BTF_KIND_CONST: + case BTF_KIND_FWD: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + return btfgen_mark_type_match(info, btf_type->type, behind_ptr); + case BTF_KIND_PTR: + return btfgen_mark_type_match(info, btf_type->type, true); + case BTF_KIND_ARRAY: { + struct btf_array *array; + + array = btf_array(btf_type); + /* mark array type */ + err = btfgen_mark_type_match(info, array->type, false); + /* mark array's index type */ + err = err ? : btfgen_mark_type_match(info, array->index_type, false); + if (err) + return err; + break; + } + case BTF_KIND_FUNC_PROTO: { + __u16 vlen = btf_vlen(btf_type); + struct btf_param *param; + + /* mark ret type */ + err = btfgen_mark_type_match(info, btf_type->type, false); + if (err) + return err; + + /* mark parameters types */ + param = btf_params(btf_type); + for (i = 0; i < vlen; i++) { + err = btfgen_mark_type_match(info, param->type, false); + if (err) + return err; + param++; + } + break; + } + /* tells if some other type needs to be handled */ + default: + p_err("unsupported kind: %s (%d)", btf_kind_str(btf_type), type_id); + return -EINVAL; + } + + return 0; +} + +/* Mark types, members, and member types. Compared to btfgen_record_field_relo, + * this function does not rely on the target spec for inferring members, but + * uses the associated BTF. 
+ */ +static int btfgen_record_type_match_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) +{ + return btfgen_mark_type_match(info, targ_spec->root_type_id, false); +} + static int btfgen_record_type_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) { return btfgen_mark_type(info, targ_spec->root_type_id, true); @@ -1882,6 +1988,8 @@ static int btfgen_record_reloc(struct btfgen_info *info, struct bpf_core_spec *r case BPF_CORE_TYPE_EXISTS: case BPF_CORE_TYPE_SIZE: return btfgen_record_type_relo(info, res); + case BPF_CORE_TYPE_MATCHES: + return btfgen_record_type_match_relo(info, res); case BPF_CORE_ENUMVAL_EXISTS: case BPF_CORE_ENUMVAL_VALUE: return btfgen_record_enumval_relo(info, res); -- cgit v1.2.3 From ec6209c8d42f815bc3bef10934637ca92114cd1b Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:21 +0000 Subject: bpf, libbpf: Add type match support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for the proposed type match relation to relo_core where it is shared between userspace and kernel. It plumbs through both kernel-side and libbpf-side support. The matching relation is defined as follows (copy from source): - modifiers and typedefs are stripped (and, hence, effectively ignored) - generally speaking types need to be of same kind (struct vs. struct, union vs. union, etc.) - exceptions are struct/union behind a pointer which could also match a forward declaration of a struct or union, respectively, and enum vs. enum64 (see below) Then, depending on type: - integers: - match if size and signedness match - arrays & pointers: - target types are recursively matched - structs & unions: - local members need to exist in target with the same name - for each member we recursively check match unless it is already behind a pointer, in which case we only check matching names and compatible kind - enums: - local variants have to have a match in target by symbolic name (but not numeric value) - size has to match (but enum may match enum64 and vice versa) - function pointers: - number and position of arguments in local type has to match target - for each argument and the return value we recursively check match Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-5-deso@posteo.net --- kernel/bpf/btf.c | 9 ++ tools/lib/bpf/libbpf.c | 6 + tools/lib/bpf/relo_core.c | 279 +++++++++++++++++++++++++++++++++++++++++++++- tools/lib/bpf/relo_core.h | 4 + 4 files changed, 294 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d3c7ab8af46..4f2408a4df08 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7443,6 +7443,15 @@ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, MAX_TYPES_ARE_COMPAT_DEPTH); } +#define MAX_TYPES_MATCH_DEPTH 2 + +int bpf_core_types_match(const struct btf *local_btf, u32 local_id, + const struct btf *targ_btf, u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, + MAX_TYPES_MATCH_DEPTH); +} + static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8a45a84eb9b2..64c4cc6140d3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5470,6 +5470,12 @@ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, return 
__bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); } +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, 32); +} + static size_t bpf_core_hash_fn(const void *key, void *ctx) { return (size_t)key; diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index e070123332cd..fe2533022aa9 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -95,6 +95,7 @@ static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; case BPF_CORE_TYPE_EXISTS: return "type_exists"; + case BPF_CORE_TYPE_MATCHES: return "type_matches"; case BPF_CORE_TYPE_SIZE: return "type_size"; case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; @@ -123,6 +124,7 @@ static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) case BPF_CORE_TYPE_ID_LOCAL: case BPF_CORE_TYPE_ID_TARGET: case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: case BPF_CORE_TYPE_SIZE: return true; default: @@ -251,7 +253,7 @@ recur: * - field 'a' access (corresponds to '2' in low-level spec); * - array element #3 access (corresponds to '3' in low-level spec). * - * Type-based relocations (TYPE_EXISTS/TYPE_SIZE, + * Type-based relocations (TYPE_EXISTS/TYPE_MATCHES/TYPE_SIZE, * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their * spec and raw_spec are kept empty. * @@ -568,9 +570,14 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, targ_spec->relo_kind = local_spec->relo_kind; if (core_relo_is_type_based(local_spec->relo_kind)) { - return bpf_core_types_are_compat(local_spec->btf, - local_spec->root_type_id, - targ_btf, targ_id); + if (local_spec->relo_kind == BPF_CORE_TYPE_MATCHES) + return bpf_core_types_match(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + else + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); } local_acc = &local_spec->spec[0]; @@ -819,6 +826,7 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, *validate = false; break; case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: *val = 1; break; case BPF_CORE_TYPE_SIZE: @@ -1410,3 +1418,266 @@ int bpf_core_calc_relo_insn(const char *prog_name, return 0; } + +static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_off, + const struct btf *targ_btf, size_t targ_name_off) +{ + const char *local_n, *targ_n; + size_t local_len, targ_len; + + local_n = btf__name_by_offset(local_btf, local_name_off); + targ_n = btf__name_by_offset(targ_btf, targ_name_off); + + if (str_is_empty(targ_n)) + return str_is_empty(local_n); + + targ_len = bpf_core_essential_name_len(targ_n); + local_len = bpf_core_essential_name_len(local_n); + + return targ_len == local_len && strncmp(local_n, targ_n, local_len) == 0; +} + +static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t) +{ + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j; + + if (local_t->size != targ_t->size) + return 0; + + if (local_vlen > targ_vlen) + return 0; + + /* iterate over the local enum's variants and make sure each has + * a symbolic name correspondent in the target + */ + for (i = 
0; i < local_vlen; i++) { + bool matched = false; + __u32 local_n_off, targ_n_off; + + local_n_off = btf_is_enum(local_t) ? btf_enum(local_t)[i].name_off : + btf_enum64(local_t)[i].name_off; + + for (j = 0; j < targ_vlen; j++) { + targ_n_off = btf_is_enum(targ_t) ? btf_enum(targ_t)[j].name_off : + btf_enum64(targ_t)[j].name_off; + + if (bpf_core_names_match(local_btf, local_n_off, targ_btf, targ_n_off)) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +static int bpf_core_composites_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t, + bool behind_ptr, int level) +{ + const struct btf_member *local_m = btf_members(local_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j, err; + + if (local_vlen > targ_vlen) + return 0; + + /* check that all local members have a match in the target */ + for (i = 0; i < local_vlen; i++, local_m++) { + const struct btf_member *targ_m = btf_members(targ_t); + bool matched = false; + + for (j = 0; j < targ_vlen; j++, targ_m++) { + if (!bpf_core_names_match(local_btf, local_m->name_off, + targ_btf, targ_m->name_off)) + continue; + + err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, + targ_m->type, behind_ptr, level - 1); + if (err > 0) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +/* Check that two types "match". + * + * The matching relation is defined as follows: + * - modifiers and typedefs are stripped (and, hence, effectively ignored) + * - generally speaking types need to be of same kind (struct vs. struct, union + * vs. union, etc.) + * - exceptions are struct/union behind a pointer which could also match a + * forward declaration of a struct or union, respectively, and enum vs. 
+ * enum64 (see below) + * Then, depending on type: + * - integers: + * - match if size and signedness match + * - arrays & pointers: + * - target types are recursively matched + * - structs & unions: + * - local members need to exist in target with the same name + * - for each member we recursively check match unless it is already behind a + * pointer, in which case we only check matching names and compatible kind + * - enums: + * - local variants have to have a match in target by symbolic name (but not + * numeric value) + * - size has to match (but enum may match enum64 and vice versa) + * - function pointers: + * - number and position of arguments in local type has to match target + * - for each argument and the return value we recursively check match + */ +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level) +{ + const struct btf_type *local_t, *targ_t; + int depth = 32; /* max recursion depth */ + __u16 local_k, targ_k; + + if (level <= 0) + return -EINVAL; + + local_t = btf_type_by_id(local_btf, local_id); + targ_t = btf_type_by_id(targ_btf, targ_id); + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_t = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_t = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_t || !targ_t) + return -EINVAL; + + if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) + return 0; + + local_k = btf_kind(local_t); + targ_k = btf_kind(targ_t); + + switch (local_k) { + case BTF_KIND_UNKN: + return local_k == targ_k; + case BTF_KIND_FWD: { + bool local_f = BTF_INFO_KFLAG(local_t->info); + + if (behind_ptr) { + if (local_k == targ_k) + return local_f == BTF_INFO_KFLAG(targ_t->info); + + /* for forward declarations kflag dictates whether the + * target is a struct (0) or union (1) + */ + return (targ_k == BTF_KIND_STRUCT && !local_f) || + (targ_k == BTF_KIND_UNION && local_f); + } else { + if (local_k != targ_k) + return 0; + + /* match if the forward declaration is for the same kind */ + return local_f == BTF_INFO_KFLAG(targ_t->info); + } + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (!btf_is_any_enum(targ_t)) + return 0; + + return bpf_core_enums_match(local_btf, local_t, targ_btf, targ_t); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (behind_ptr) { + bool targ_f = BTF_INFO_KFLAG(targ_t->info); + + if (local_k == targ_k) + return 1; + + if (targ_k != BTF_KIND_FWD) + return 0; + + return (local_k == BTF_KIND_UNION) == targ_f; + } else { + if (local_k != targ_k) + return 0; + + return bpf_core_composites_match(local_btf, local_t, targ_btf, targ_t, + behind_ptr, level); + } + case BTF_KIND_INT: { + __u8 local_sgn; + __u8 targ_sgn; + + if (local_k != targ_k) + return 0; + + local_sgn = btf_int_encoding(local_t) & BTF_INT_SIGNED; + targ_sgn = btf_int_encoding(targ_t) & BTF_INT_SIGNED; + + return local_t->size == targ_t->size && local_sgn == targ_sgn; + } + case BTF_KIND_PTR: + if (local_k != targ_k) + return 0; + + behind_ptr = true; + + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + case BTF_KIND_ARRAY: { + const struct btf_array *local_array = btf_array(local_t); + const struct btf_array *targ_array = btf_array(targ_t); + + if (local_k != targ_k) + return 0; + + if (local_array->nelems != targ_array->nelems) + return 0; + + local_id = local_array->type; + targ_id = targ_array->type; + goto recur; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = 
btf_params(local_t); + struct btf_param *targ_p = btf_params(targ_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, err; + + if (local_k != targ_k) + return 0; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + err = __bpf_core_types_match(local_btf, local_p->type, targ_btf, + targ_p->type, behind_ptr, level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_t), local_id, targ_id); + return 0; + } +} diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 3fd3842d4230..1c0566daf8e8 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -72,6 +72,10 @@ int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id, int level); int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level); +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id); size_t bpf_core_essential_name_len(const char *name); -- cgit v1.2.3 From b8a195dc299391bc171c47971974ec6d5ea6fb14 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 20:56:48 -0700 Subject: libbpf: add bpf_core_type_matches() helper macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch finalizes support for the proposed type match relation in libbpf by adding bpf_core_type_matches() macro which emits TYPE_MATCH relocation. Clang support for this relocation was added in [0]. [0] https://reviews.llvm.org/D126838 Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-7-deso@posteo.net --- tools/lib/bpf/bpf_core_read.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 2308f4990e96..496e6a8ee0dc 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -184,6 +184,16 @@ enum bpf_enum_value_kind { #define bpf_core_type_exists(type) \ __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) "matches" that in a target kernel. + * Returns: + * 1, if the type matches in the target kernel's BTF; + * 0, if the type does not match any in the target kernel + */ +#define bpf_core_type_matches(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_MATCHES) + /* * Convenience macro to get the byte size of a provided named type * (struct/union/enum/typedef) in a target kernel. -- cgit v1.2.3 From 67d8ed4295258cb17e2bed7ed5ada92526a643f5 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:24 +0000 Subject: selftests/bpf: Add type-match checks to type-based tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have type-match logic in both libbpf and the kernel, this change adjusts the existing BPF self tests to check this functionality.
Specifically, we extend the existing type-based tests to check the previously introduced bpf_core_type_matches macro. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-8-deso@posteo.net --- .../testing/selftests/bpf/prog_tests/core_reloc.c | 31 +++++++++++++++++++-- .../testing/selftests/bpf/progs/core_reloc_types.h | 14 +++++++++- .../bpf/progs/test_core_reloc_type_based.c | 32 +++++++++++++++++++++- 3 files changed, 73 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 2f92feb809be..328dd744e5bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -543,7 +543,6 @@ static int __trigger_module_test_read(const struct core_reloc_test_case *test) return 0; } - static const struct core_reloc_test_case test_cases[] = { /* validate we can find kernel image and use its BTF for relocs */ { @@ -752,7 +751,7 @@ static const struct core_reloc_test_case test_cases[] = { SIZE_CASE(size___diff_offs), SIZE_ERR_CASE(size___err_ambiguous), - /* validate type existence and size relocations */ + /* validate type existence, match, and size relocations */ TYPE_BASED_CASE(type_based, { .struct_exists = 1, .union_exists = 1, @@ -765,6 +764,19 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_void_ptr_exists = 1, .typedef_func_proto_exists = 1, .typedef_arr_exists = 1, + + .struct_matches = 1, + .union_matches = 1, + .enum_matches = 1, + .typedef_named_struct_matches = 1, + .typedef_anon_struct_matches = 1, + .typedef_struct_ptr_matches = 1, + .typedef_int_matches = 1, + .typedef_enum_matches = 1, + .typedef_void_ptr_matches = 1, + .typedef_func_proto_matches = 1, + .typedef_arr_matches = 1, + .struct_sz = sizeof(struct a_struct), .union_sz = sizeof(union a_union), .enum_sz = sizeof(enum an_enum), @@ -792,6 +804,19 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_void_ptr_exists = 1, .typedef_func_proto_exists = 1, .typedef_arr_exists = 1, + + .struct_matches = 0, + .union_matches = 0, + .enum_matches = 0, + .typedef_named_struct_matches = 0, + .typedef_anon_struct_matches = 0, + .typedef_struct_ptr_matches = 1, + .typedef_int_matches = 0, + .typedef_enum_matches = 0, + .typedef_void_ptr_matches = 1, + .typedef_func_proto_matches = 0, + .typedef_arr_matches = 0, + .struct_sz = sizeof(struct a_struct___diff_sz), .union_sz = sizeof(union a_union___diff_sz), .enum_sz = sizeof(enum an_enum___diff_sz), @@ -806,10 +831,12 @@ static const struct core_reloc_test_case test_cases[] = { }), TYPE_BASED_CASE(type_based___incompat, { .enum_exists = 1, + .enum_matches = 1, .enum_sz = sizeof(enum an_enum), }), TYPE_BASED_CASE(type_based___fn_wrong_args, { .struct_exists = 1, + .struct_matches = 1, .struct_sz = sizeof(struct a_struct), }), diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 26e103302c05..6a44f3e63ae5 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -860,7 +860,7 @@ struct core_reloc_size___err_ambiguous2 { }; /* - * TYPE EXISTENCE & SIZE + * TYPE EXISTENCE, MATCH & SIZE */ struct core_reloc_type_based_output { bool struct_exists; @@ -875,6 +875,18 @@ struct core_reloc_type_based_output { bool typedef_func_proto_exists; bool typedef_arr_exists; + bool 
struct_matches; + bool union_matches; + bool enum_matches; + bool typedef_named_struct_matches; + bool typedef_anon_struct_matches; + bool typedef_struct_ptr_matches; + bool typedef_int_matches; + bool typedef_enum_matches; + bool typedef_void_ptr_matches; + bool typedef_func_proto_matches; + bool typedef_arr_matches; + int struct_sz; int union_sz; int enum_sz; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c index fb60f8195c53..325ead666130 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c @@ -61,6 +61,18 @@ struct core_reloc_type_based_output { bool typedef_func_proto_exists; bool typedef_arr_exists; + bool struct_matches; + bool union_matches; + bool enum_matches; + bool typedef_named_struct_matches; + bool typedef_anon_struct_matches; + bool typedef_struct_ptr_matches; + bool typedef_int_matches; + bool typedef_enum_matches; + bool typedef_void_ptr_matches; + bool typedef_func_proto_matches; + bool typedef_arr_matches; + int struct_sz; int union_sz; int enum_sz; @@ -77,7 +89,13 @@ struct core_reloc_type_based_output { SEC("raw_tracepoint/sys_enter") int test_core_type_based(void *ctx) { -#if __has_builtin(__builtin_preserve_type_info) + /* Support for the BPF_TYPE_MATCHES argument to the + * __builtin_preserve_type_info builtin was added at some point during + * development of clang 15 and it's what we require for this test. Part of it + * could run with merely __builtin_preserve_type_info (which could be checked + * separately), but we have to find an upper bound. + */ +#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15 struct core_reloc_type_based_output *out = (void *)&data.out; out->struct_exists = bpf_core_type_exists(struct a_struct); @@ -92,6 +110,18 @@ int test_core_type_based(void *ctx) out->typedef_func_proto_exists = bpf_core_type_exists(func_proto_typedef); out->typedef_arr_exists = bpf_core_type_exists(arr_typedef); + out->struct_matches = bpf_core_type_matches(struct a_struct); + out->union_matches = bpf_core_type_matches(union a_union); + out->enum_matches = bpf_core_type_matches(enum an_enum); + out->typedef_named_struct_matches = bpf_core_type_matches(named_struct_typedef); + out->typedef_anon_struct_matches = bpf_core_type_matches(anon_struct_typedef); + out->typedef_struct_ptr_matches = bpf_core_type_matches(struct_ptr_typedef); + out->typedef_int_matches = bpf_core_type_matches(int_typedef); + out->typedef_enum_matches = bpf_core_type_matches(enum_typedef); + out->typedef_void_ptr_matches = bpf_core_type_matches(void_ptr_typedef); + out->typedef_func_proto_matches = bpf_core_type_matches(func_proto_typedef); + out->typedef_arr_matches = bpf_core_type_matches(arr_typedef); + out->struct_sz = bpf_core_type_size(struct a_struct); out->union_sz = bpf_core_type_size(union a_union); out->enum_sz = bpf_core_type_size(enum an_enum); -- cgit v1.2.3 From bed56a6dd4cbf76ff757679a9333f9d3a72ed7ec Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:25 +0000 Subject: selftests/bpf: Add test checking more characteristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds another type-based self-test that specifically aims to test some more characteristics of the TYPE_MATCH logic. 
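To make the matching rules concrete before the details (an editorial sketch distilled from the test types below, not additional test code), here are two local/target pairs and their expected TYPE_MATCH outcome:

/* local definition */           /* target BTF definition */
struct a_struct { int x; };      /* struct a_struct { int a; int x; };
                                  * matches: member order is irrelevant and
                                  * extra target members are allowed, as long
                                  * as every local member has a same-named
                                  * counterpart in the target */

typedef int int_typedef;         /* typedef unsigned int int_typedef;
                                  * does not match: integers must agree in
                                  * both size and signedness */
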
Specifically, it covers a few more potential differences between types, such as different orders, enum variant values, and integer signedness. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-9-deso@posteo.net --- .../testing/selftests/bpf/prog_tests/core_reloc.c | 37 ++++++++++++++++ .../bpf/progs/btf__core_reloc_type_based___diff.c | 3 ++ .../testing/selftests/bpf/progs/core_reloc_types.h | 51 ++++++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 328dd744e5bd..eb47bfde459c 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -792,6 +792,43 @@ static const struct core_reloc_test_case test_cases[] = { TYPE_BASED_CASE(type_based___all_missing, { /* all zeros */ }), + TYPE_BASED_CASE(type_based___diff, { + .struct_exists = 1, + .union_exists = 1, + .enum_exists = 1, + .typedef_named_struct_exists = 1, + .typedef_anon_struct_exists = 1, + .typedef_struct_ptr_exists = 1, + .typedef_int_exists = 1, + .typedef_enum_exists = 1, + .typedef_void_ptr_exists = 1, + .typedef_func_proto_exists = 1, + .typedef_arr_exists = 1, + + .struct_matches = 1, + .union_matches = 1, + .enum_matches = 1, + .typedef_named_struct_matches = 1, + .typedef_anon_struct_matches = 1, + .typedef_struct_ptr_matches = 1, + .typedef_int_matches = 0, + .typedef_enum_matches = 1, + .typedef_void_ptr_matches = 1, + .typedef_func_proto_matches = 0, + .typedef_arr_matches = 0, + + .struct_sz = sizeof(struct a_struct___diff), + .union_sz = sizeof(union a_union___diff), + .enum_sz = sizeof(enum an_enum___diff), + .typedef_named_struct_sz = sizeof(named_struct_typedef___diff), + .typedef_anon_struct_sz = sizeof(anon_struct_typedef___diff), + .typedef_struct_ptr_sz = sizeof(struct_ptr_typedef___diff), + .typedef_int_sz = sizeof(int_typedef___diff), + .typedef_enum_sz = sizeof(enum_typedef___diff), + .typedef_void_ptr_sz = sizeof(void_ptr_typedef___diff), + .typedef_func_proto_sz = sizeof(func_proto_typedef___diff), + .typedef_arr_sz = sizeof(arr_typedef___diff), + }), TYPE_BASED_CASE(type_based___diff_sz, { .struct_exists = 1, .union_exists = 1, diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c new file mode 100644 index 000000000000..57ae2c258928 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_type_based___diff x) {} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 6a44f3e63ae5..e326b6a781e5 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -951,6 +951,57 @@ struct core_reloc_type_based { struct core_reloc_type_based___all_missing { }; +/* different member orders, enum variant values, signedness, etc */ +struct a_struct___diff { + int x; + int a; +}; + +union a_union___diff { + int z; + int y; +}; + +typedef struct a_struct___diff named_struct_typedef___diff; + +typedef struct { int z, x, y; } anon_struct_typedef___diff; + +typedef struct { + int c; + int b; + int a; +} 
*struct_ptr_typedef___diff; + +enum an_enum___diff { + AN_ENUM_VAL2___diff = 0, + AN_ENUM_VAL1___diff = 42, + AN_ENUM_VAL3___diff = 1, +}; + +typedef unsigned int int_typedef___diff; + +typedef enum { TYPEDEF_ENUM_VAL2___diff, TYPEDEF_ENUM_VAL1___diff = 50 } enum_typedef___diff; + +typedef const void *void_ptr_typedef___diff; + +typedef int_typedef___diff (*func_proto_typedef___diff)(long); + +typedef char arr_typedef___diff[3]; + +struct core_reloc_type_based___diff { + struct a_struct___diff f1; + union a_union___diff f2; + enum an_enum___diff f3; + named_struct_typedef___diff f4; + anon_struct_typedef___diff f5; + struct_ptr_typedef___diff f6; + int_typedef___diff f7; + enum_typedef___diff f8; + void_ptr_typedef___diff f9; + func_proto_typedef___diff f10; + arr_typedef___diff f11; +}; + /* different type sizes, extra modifiers, anon vs named enums, etc */ struct a_struct___diff_sz { long x; -- cgit v1.2.3 From 537905c4b68fe3a32c4925122fe792eb2f594b73 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:26 +0000 Subject: selftests/bpf: Add nested type to type based tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change extends the type based tests with another struct type (in addition to a_struct) to check relocations against: a_complex_struct. This type is nested more deeply to provide additional coverage of certain paths in the type match logic. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-10-deso@posteo.net --- .../testing/selftests/bpf/prog_tests/core_reloc.c | 4 ++ .../testing/selftests/bpf/progs/core_reloc_types.h | 62 +++++++++++++++------- .../bpf/progs/test_core_reloc_type_based.c | 12 +++++ 3 files changed, 58 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index eb47bfde459c..8882c9c1519b 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -754,6 +754,7 @@ static const struct core_reloc_test_case test_cases[] = { /* validate type existence, match, and size relocations */ TYPE_BASED_CASE(type_based, { .struct_exists = 1, + .complex_struct_exists = 1, .union_exists = 1, .enum_exists = 1, .typedef_named_struct_exists = 1, @@ -766,6 +767,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_arr_exists = 1, .struct_matches = 1, + .complex_struct_matches = 1, .union_matches = 1, .enum_matches = 1, .typedef_named_struct_matches = 1, @@ -794,6 +796,7 @@ static const struct core_reloc_test_case test_cases[] = { }), TYPE_BASED_CASE(type_based___diff, { .struct_exists = 1, + .complex_struct_exists = 1, .union_exists = 1, .enum_exists = 1, .typedef_named_struct_exists = 1, @@ -806,6 +809,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_arr_exists = 1, .struct_matches = 1, + .complex_struct_matches = 1, .union_matches = 1, .enum_matches = 1, .typedef_named_struct_matches = 1, diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index e326b6a781e5..474411c4b908 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -864,6 +864,7 @@ struct core_reloc_size___err_ambiguous2 { */ struct core_reloc_type_based_output { bool struct_exists; + bool complex_struct_exists; bool 
union_exists; bool enum_exists; bool typedef_named_struct_exists; @@ -876,6 +877,7 @@ struct core_reloc_type_based_output { bool typedef_arr_exists; bool struct_matches; + bool complex_struct_matches; bool union_matches; bool enum_matches; bool typedef_named_struct_matches; @@ -904,6 +906,14 @@ struct a_struct { int x; }; +struct a_complex_struct { + union { + struct a_struct * restrict a; + void *b; + } x; + volatile long y; +}; + union a_union { int y; int z; @@ -935,16 +945,17 @@ typedef char arr_typedef[20]; struct core_reloc_type_based { struct a_struct f1; - union a_union f2; - enum an_enum f3; - named_struct_typedef f4; - anon_struct_typedef f5; - struct_ptr_typedef f6; - int_typedef f7; - enum_typedef f8; - void_ptr_typedef f9; - func_proto_typedef f10; - arr_typedef f11; + struct a_complex_struct f2; + union a_union f3; + enum an_enum f4; + named_struct_typedef f5; + anon_struct_typedef f6; + struct_ptr_typedef f7; + int_typedef f8; + enum_typedef f9; + void_ptr_typedef f10; + func_proto_typedef f11; + arr_typedef f12; }; /* no types in target */ @@ -957,6 +968,16 @@ struct a_struct___diff { int a; }; +struct a_struct___forward; + +struct a_complex_struct___diff { + union { + struct a_struct___forward *a; + void *b; + } x; + volatile long y; +}; + union a_union___diff { int z; int y; @@ -990,16 +1011,17 @@ typedef char arr_typedef___diff[3]; struct core_reloc_type_based___diff { struct a_struct___diff f1; - union a_union___diff f2; - enum an_enum___diff f3; - named_struct_typedef___diff f4; - anon_struct_typedef___diff f5; - struct_ptr_typedef___diff f6; - int_typedef___diff f7; - enum_typedef___diff f8; - void_ptr_typedef___diff f9; - func_proto_typedef___diff f10; - arr_typedef___diff f11; + struct a_complex_struct___diff f2; + union a_union___diff f3; + enum an_enum___diff f4; + named_struct_typedef___diff f5; + anon_struct_typedef___diff f6; + struct_ptr_typedef___diff f7; + int_typedef___diff f8; + enum_typedef___diff f9; + void_ptr_typedef___diff f10; + func_proto_typedef___diff f11; + arr_typedef___diff f12; }; /* different type sizes, extra modifiers, anon vs named enums, etc */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c index 325ead666130..d95bc08b75c1 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c @@ -19,6 +19,14 @@ struct a_struct { int x; }; +struct a_complex_struct { + union { + struct a_struct *a; + void *b; + } x; + volatile long y; +}; + union a_union { int y; int z; @@ -50,6 +58,7 @@ typedef char arr_typedef[20]; struct core_reloc_type_based_output { bool struct_exists; + bool complex_struct_exists; bool union_exists; bool enum_exists; bool typedef_named_struct_exists; @@ -62,6 +71,7 @@ struct core_reloc_type_based_output { bool typedef_arr_exists; bool struct_matches; + bool complex_struct_matches; bool union_matches; bool enum_matches; bool typedef_named_struct_matches; @@ -99,6 +109,7 @@ int test_core_type_based(void *ctx) struct core_reloc_type_based_output *out = (void *)&data.out; out->struct_exists = bpf_core_type_exists(struct a_struct); + out->complex_struct_exists = bpf_core_type_exists(struct a_complex_struct); out->union_exists = bpf_core_type_exists(union a_union); out->enum_exists = bpf_core_type_exists(enum an_enum); out->typedef_named_struct_exists = bpf_core_type_exists(named_struct_typedef); @@ -111,6 +122,7 @@ int test_core_type_based(void *ctx) 
out->typedef_arr_exists = bpf_core_type_exists(arr_typedef); out->struct_matches = bpf_core_type_matches(struct a_struct); + out->complex_struct_matches = bpf_core_type_matches(struct a_complex_struct); out->union_matches = bpf_core_type_matches(union a_union); out->enum_matches = bpf_core_type_matches(enum an_enum); out->typedef_named_struct_matches = bpf_core_type_matches(named_struct_typedef); -- cgit v1.2.3 From 950b347787224e62f59c099e3e3f3f6ecc720d61 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:27 +0000 Subject: selftests/bpf: Add type match test against kernel's task_struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change extends the existing core_reloc/kernel test to include a type match check of a local task_struct against the kernel's definition -- which we assume to succeed. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-11-deso@posteo.net --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 1 + tools/testing/selftests/bpf/progs/core_reloc_types.h | 1 + .../selftests/bpf/progs/test_core_reloc_kernel.c | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 8882c9c1519b..a6f65e2236f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -555,6 +555,7 @@ static const struct core_reloc_test_case test_cases[] = { .valid = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }, .comm = "test_progs", .comm_len = sizeof("test_progs"), + .local_task_struct_matches = true, }, .output_len = sizeof(struct core_reloc_kernel_output), .raw_tp_name = "sys_enter", diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 474411c4b908..7ef91d19c66e 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -13,6 +13,7 @@ struct core_reloc_kernel_output { int valid[10]; char comm[sizeof("test_progs")]; int comm_len; + bool local_task_struct_matches; }; /* diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c index 145028b52ad8..a17dd83eae67 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c @@ -21,6 +21,7 @@ struct core_reloc_kernel_output { /* we have test_progs[-flavor], so cut flavor part */ char comm[sizeof("test_progs")]; int comm_len; + bool local_task_struct_matches; }; struct task_struct { @@ -30,11 +31,25 @@ struct task_struct { struct task_struct *group_leader; }; +struct mm_struct___wrong { + int abc_whatever_should_not_exist; +}; + +struct task_struct___local { + int pid; + struct mm_struct___wrong *mm; +}; + #define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src) SEC("raw_tracepoint/sys_enter") int test_core_kernel(void *ctx) { + /* Support for the BPF_TYPE_MATCHES argument to the + * __builtin_preserve_type_info builtin was added at some point during + * development of clang 15 and it's what we require for this test. 
+ */ +#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15 struct task_struct *task = (void *)bpf_get_current_task(); struct core_reloc_kernel_output *out = (void *)&data.out; uint64_t pid_tgid = bpf_get_current_pid_tgid(); @@ -93,6 +108,10 @@ int test_core_kernel(void *ctx) group_leader, group_leader, group_leader, group_leader, comm); + out->local_task_struct_matches = bpf_core_type_matches(struct task_struct___local); +#else + data.skip = true; +#endif return 0; } -- cgit v1.2.3 From 8094029330a2f03fb406ecff80671cf27ce28d42 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Wed, 29 Jun 2022 23:18:45 +0800 Subject: libbpf: Cleanup the legacy kprobe_event on failed add/attach_event() Before the 0bc11ed5ab60 commit ("kprobes: Allow kprobes coexist with livepatch"), in a scenario where livepatch and kprobe coexist on the same function entry, the creation of kprobe_event using add_kprobe_event_legacy() will succeed, and at the same time a trace event (e.g. /debugfs/tracing/events/kprobe/XXX) will exist, but perf_event_open() will return an error because both livepatch and kprobe use FTRACE_OPS_FL_IPMODIFY. As follows: 1) add a livepatch $ insmod livepatch-XXX.ko 2) add a kprobe using tracefs API (i.e. add_kprobe_event_legacy) $ echo 'p:mykprobe XXX' > /sys/kernel/debug/tracing/kprobe_events 3) enable this kprobe (i.e. sys_perf_event_open) This will return an error, -EBUSY. As Andrii Nakryiko commented, a few error paths in bpf_program__attach_kprobe_opts() need to call remove_kprobe_event_legacy(). With this patch, whenever an error is returned after add_kprobe_event_legacy() or bpf_program__attach_perf_event_opts(), the created kprobe_event is cleaned up. Signed-off-by: Chuang Wang Signed-off-by: Jingren Zhou Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220629151848.65587-2-nashuiliang@gmail.com --- tools/lib/bpf/libbpf.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 64c4cc6140d3..8a441d304924 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9877,10 +9877,11 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, } type = determine_kprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { + err = type; pr_warn("failed to determine legacy kprobe event id for '%s+0x%zx': %s\n", kfunc_name, offset, - libbpf_strerror_r(type, errmsg, sizeof(errmsg))); - return type; + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; } attr.size = sizeof(attr); attr.config = type; @@ -9894,9 +9895,14 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, err = -errno; pr_warn("legacy kprobe perf_event_open() failed: %s\n", libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - return err; + goto err_clean_legacy; } return pfd; + +err_clean_legacy: + /* Clear the newly added legacy kprobe_event */ + remove_kprobe_event_legacy(probe_name, retprobe); + return err; } struct bpf_link * @@ -9953,7 +9959,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, prog->name, retprobe ? "kretprobe" : "kprobe", func_name, offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - goto err_out; + goto err_clean_legacy; } if (legacy) { struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); @@ -9964,6 +9970,10 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, } return link; + +err_clean_legacy: + if (legacy) + remove_kprobe_event_legacy(legacy_probe, retprobe); err_out: free(legacy_probe); return libbpf_err_ptr(err); -- cgit v1.2.3 From 5666fc997ccb93859ea1d4437936c64c6d75c060 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Wed, 29 Jun 2022 23:18:46 +0800 Subject: libbpf: Fix wrong variable used in perf_event_uprobe_open_legacy() Use "type" as opposed to "err" in pr_warn() after determine_uprobe_perf_type_legacy() returns an error. Signed-off-by: Chuang Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220629151848.65587-3-nashuiliang@gmail.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8a441d304924..571acd614dce 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10249,7 +10249,7 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, type = determine_uprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", - binary_path, offset, err); + binary_path, offset, type); return type; } -- cgit v1.2.3 From 2655144fb49bae26eae038c6d056f824a7db2726 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Wed, 29 Jun 2022 23:18:47 +0800 Subject: libbpf: Cleanup the legacy uprobe_event on failed add/attach_event() When an error is returned after add_uprobe_event_legacy() in perf_event_uprobe_open_legacy(), or when bpf_program__attach_perf_event_opts() in bpf_program__attach_uprobe_opts() returns an error, the previously created uprobe_event is not cleaned up. Fix this by calling remove_uprobe_event_legacy() on those error paths. Signed-off-by: Chuang Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220629151848.65587-4-nashuiliang@gmail.com --- tools/lib/bpf/libbpf.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 571acd614dce..cb49408eb298 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10248,9 +10248,10 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, } type = determine_uprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { + err = type; pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", - binary_path, offset, type); - return type; + binary_path, offset, err); + goto err_clean_legacy; } memset(&attr, 0, sizeof(attr)); @@ -10265,9 +10266,14 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, if (pfd < 0) { err = -errno; pr_warn("legacy uprobe perf_event_open() failed: %d\n", err); - return err; + goto err_clean_legacy; } return pfd; + +err_clean_legacy: + /* Clear the newly added legacy uprobe_event */ + remove_uprobe_event_legacy(probe_name, retprobe); + return err; } /* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ @@ -10601,7 +10607,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, prog->name, retprobe ? 
"uretprobe" : "uprobe", binary_path, func_offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - goto err_out; + goto err_clean_legacy; } if (legacy) { struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); @@ -10611,10 +10617,13 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, perf_link->legacy_is_retprobe = retprobe; } return link; + +err_clean_legacy: + if (legacy) + remove_uprobe_event_legacy(legacy_probe, retprobe); err_out: free(legacy_probe); return libbpf_err_ptr(err); - } /* Format of u[ret]probe section definition supporting auto-attach: -- cgit v1.2.3 From 450a8dcb8c7f819431b09e5c1debbf0b6c2e824e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 5 Jul 2022 21:04:56 +0100 Subject: bpftool: Remove zlib feature test from Makefile The feature test to detect the availability of zlib in bpftool's Makefile does not bring much. The library is not optional: it may or may not be required along libbfd for disassembling instructions, but in any case it is necessary to build feature.o or even libbpf, on which bpftool depends. If we remove the feature test, we lose the nicely formatted error message, but we get a compiler error about "zlib.h: No such file or directory", which is equally informative. Let's get rid of the test. Suggested-by: Andrii Nakryiko Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220705200456.285943-1-quentin@isovalent.com --- tools/bpf/bpftool/Makefile | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index c19e0e4c41bd..6b5b3a99f79d 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -93,10 +93,8 @@ INSTALL ?= install RM ?= rm -f FEATURE_USER = .bpftool -FEATURE_TESTS = libbfd disassembler-four-args zlib libcap \ - clang-bpf-co-re -FEATURE_DISPLAY = libbfd disassembler-four-args zlib libcap \ - clang-bpf-co-re +FEATURE_TESTS = libbfd disassembler-four-args libcap clang-bpf-co-re +FEATURE_DISPLAY = libbfd disassembler-four-args libcap clang-bpf-co-re check_feat := 1 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall @@ -204,11 +202,6 @@ $(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ -$(OUTPUT)feature.o: -ifneq ($(feature-zlib), 1) - $(error "No zlib found") -endif - $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ -- cgit v1.2.3 From ca188a25d43f85f9c6f1e0a303edad47c9d24989 Mon Sep 17 00:00:00 2001 From: Kishen Maloor Date: Tue, 5 Jul 2022 14:32:15 -0700 Subject: selftests: mptcp: userspace PM support for MP_PRIO signals This change updates the testing sample (pm_nl_ctl) to exercise the updated MPTCP_PM_CMD_SET_FLAGS command for userspace PMs to issue MP_PRIO signals over the selected subflow. E.g. ./pm_nl_ctl set 10.0.1.2 port 47234 flags backup token 823274047 rip 10.0.1.1 rport 50003 userspace_pm.sh has a new selftest that invokes this command. Fixes: 259a834fadda ("selftests: mptcp: functional tests for the userspace PM type") Acked-by: Paolo Abeni Signed-off-by: Kishen Maloor Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 73 ++++++++++++++++++++++- tools/testing/selftests/net/mptcp/userspace_pm.sh | 32 ++++++++++ 2 files changed, 103 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 6a2f4b981e1d..cb79f0719e3b 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -39,7 +39,7 @@ static void syntax(char *argv[]) fprintf(stderr, "\tdsf lip lport rip rport token \n"); fprintf(stderr, "\tdel []\n"); fprintf(stderr, "\tget \n"); - fprintf(stderr, "\tset [] [id ] flags [no]backup|[no]fullmesh [port ]\n"); + fprintf(stderr, "\tset [] [id ] flags [no]backup|[no]fullmesh [port ] [token ] [rip ] [rport ]\n"); fprintf(stderr, "\tflush\n"); fprintf(stderr, "\tdump\n"); fprintf(stderr, "\tlimits [ ]\n"); @@ -1279,7 +1279,10 @@ int set_flags(int fd, int pm_family, int argc, char *argv[]) struct rtattr *rta, *nest; struct nlmsghdr *nh; u_int32_t flags = 0; + u_int32_t token = 0; + u_int16_t rport = 0; u_int16_t family; + void *rip = NULL; int nest_start; int use_id = 0; u_int8_t id; @@ -1339,7 +1342,13 @@ int set_flags(int fd, int pm_family, int argc, char *argv[]) error(1, 0, " missing flags keyword"); for (; arg < argc; arg++) { - if (!strcmp(argv[arg], "flags")) { + if (!strcmp(argv[arg], "token")) { + if (++arg >= argc) + error(1, 0, " missing token value"); + + /* token */ + token = atoi(argv[arg]); + } else if (!strcmp(argv[arg], "flags")) { char *tok, *str; /* flags */ @@ -1378,12 +1387,72 @@ int set_flags(int fd, int pm_family, int argc, char *argv[]) rta->rta_len = RTA_LENGTH(2); memcpy(RTA_DATA(rta), &port, 2); off += NLMSG_ALIGN(rta->rta_len); + } else if (!strcmp(argv[arg], "rport")) { + if (++arg >= argc) + error(1, 0, " missing remote port"); + + rport = atoi(argv[arg]); + } else if (!strcmp(argv[arg], "rip")) { + if (++arg >= argc) + error(1, 0, " missing remote ip"); + + rip = argv[arg]; } else { error(1, 0, "unknown keyword %s", argv[arg]); } } nest->rta_len = off - nest_start; + /* token */ + if (token) { + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ATTR_TOKEN; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &token, 4); + off += NLMSG_ALIGN(rta->rta_len); + } + + /* remote addr/port */ + if (rip) { + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR_REMOTE; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* addr data */ + rta = (void *)(data + off); + if (inet_pton(AF_INET, rip, RTA_DATA(rta))) { + family = AF_INET; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4; + rta->rta_len = RTA_LENGTH(4); + } else if (inet_pton(AF_INET6, rip, RTA_DATA(rta))) { + family = AF_INET6; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6; + rta->rta_len = RTA_LENGTH(16); + } else { + error(1, errno, "can't parse ip %s", (char *)rip); + } + off += NLMSG_ALIGN(rta->rta_len); + + /* family */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &family, 2); + off += NLMSG_ALIGN(rta->rta_len); + + if (rport) { + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_PORT; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &rport, 2); + off += NLMSG_ALIGN(rta->rta_len); + } + + nest->rta_len = off - nest_start; + } + do_nl_req(fd, nh, off, 0); return 0; } diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh 
b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 78d0bb640b11..abe3d4ebe554 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -770,10 +770,42 @@ test_subflows() rm -f "$evts" } +test_prio() +{ + local count + + # Send MP_PRIO signal from client to server machine + ip netns exec "$ns2" ./pm_nl_ctl set 10.0.1.2 port "$client4_port" flags backup token "$client4_token" rip 10.0.1.1 rport "$server4_port" + sleep 0.5 + + # Check TX + stdbuf -o0 -e0 printf "MP_PRIO TX \t" + count=$(ip netns exec "$ns2" nstat -as | grep MPTcpExtMPPrioTx | awk '{print $2}') + [ -z "$count" ] && count=0 + if [ $count != 1 ]; then + stdbuf -o0 -e0 printf "[FAIL]\n" + exit 1 + else + stdbuf -o0 -e0 printf "[OK]\n" + fi + + # Check RX + stdbuf -o0 -e0 printf "MP_PRIO RX \t" + count=$(ip netns exec "$ns1" nstat -as | grep MPTcpExtMPPrioRx | awk '{print $2}') + [ -z "$count" ] && count=0 + if [ $count != 1 ]; then + stdbuf -o0 -e0 printf "[FAIL]\n" + exit 1 + else + stdbuf -o0 -e0 printf "[OK]\n" + fi +} + make_connection make_connection "v6" test_announce test_remove test_subflows +test_prio exit 0 -- cgit v1.2.3 From f36068a20256bad993d60e49602f02e3af336506 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Jul 2022 16:59:25 -0700 Subject: selftests: tls: add selftest variant for pad Add a self-test variant with TLS 1.3 nopad set. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 5d70b04c482c..e71ec5846be9 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -235,6 +235,7 @@ FIXTURE_VARIANT(tls) { uint16_t tls_version; uint16_t cipher_type; + bool nopad; }; FIXTURE_VARIANT_ADD(tls, 12_aes_gcm) @@ -297,9 +298,17 @@ FIXTURE_VARIANT_ADD(tls, 13_aes_gcm_256) .cipher_type = TLS_CIPHER_AES_GCM_256, }; +FIXTURE_VARIANT_ADD(tls, 13_nopad) +{ + .tls_version = TLS_1_3_VERSION, + .cipher_type = TLS_CIPHER_AES_GCM_128, + .nopad = true, +}; + FIXTURE_SETUP(tls) { struct tls_crypto_info_keys tls12; + int one = 1; int ret; tls_crypto_info_init(variant->tls_version, variant->cipher_type, @@ -315,6 +324,12 @@ FIXTURE_SETUP(tls) ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len); ASSERT_EQ(ret, 0); + + if (variant->nopad) { + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, + (void *)&one, sizeof(one)); + ASSERT_EQ(ret, 0); + } } FIXTURE_TEARDOWN(tls) -- cgit v1.2.3 From 645d5d3bc001a7d77459cb8360c36a60954d6008 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 15:48:16 -0700 Subject: selftests/bpf: Fix bogus uninitialized variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling selftests/bpf in optimized mode (-O2), GCC erroneously complains about uninitialized token variable: In file included from network_helpers.c:22: network_helpers.c: In function ‘open_netns’: test_progs.h:355:22: error: ‘token’ may be used uninitialized [-Werror=maybe-uninitialized] 355 | int ___err = libbpf_get_error(___res); \ | ^~~~~~~~~~~~~~~~~~~~~~~~ network_helpers.c:440:14: note: in expansion of macro ‘ASSERT_OK_PTR’ 440 | if (!ASSERT_OK_PTR(token, "malloc token")) | ^~~~~~~~~~~~~ In file included from /data/users/andriin/linux/tools/testing/selftests/bpf/tools/include/bpf/libbpf.h:21, from bpf_util.h:9, from 
network_helpers.c:20: /data/users/andriin/linux/tools/testing/selftests/bpf/tools/include/bpf/libbpf_legacy.h:113:17: note: by argument 1 of type ‘const void *’ to ‘libbpf_get_error’ declared here 113 | LIBBPF_API long libbpf_get_error(const void *ptr); | ^~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors make: *** [Makefile:522: /data/users/andriin/linux/tools/testing/selftests/bpf/network_helpers.o] Error 1 This is completely bogus because libbpf_get_error() doesn't dereference the pointer, but the only easy way to silence this is to allocate initialized memory with calloc(). Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220705224818.4026623-2-andrii@kernel.org --- tools/testing/selftests/bpf/network_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 59cf81ec55af..bec15558fd93 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -436,7 +436,7 @@ struct nstoken *open_netns(const char *name) int err; struct nstoken *token; - token = malloc(sizeof(struct nstoken)); + token = calloc(1, sizeof(struct nstoken)); if (!ASSERT_OK_PTR(token, "malloc token")) return NULL; -- cgit v1.2.3 From c46a12200114ae5ad7b0922e6c8969a22c3afbd7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 15:48:17 -0700 Subject: selftests/bpf: Fix few more compiler warnings When compiling with -O2, GCC detects a few problems with selftests/bpf, so fix all of them. Two are real issues (uninitialized err and nums out-of-bounds access), but the two other uninitialized-variable warnings are due to GCC not being able to prove that variables are indeed initialized under conditions under which they are used. Fix all 4 cases, though. 
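The false-positive class fixed here looks roughly like this (a distilled example, not code from the tree):

int foo(int cond)
{
	int n;	/* GCC: 'n' may be used uninitialized */

	if (cond)
		n = 41;
	/* ... unrelated work ... */
	if (cond)
		return n + 1;	/* only reached when 'n' was assigned */
	return 0;
}

GCC's -Wmaybe-uninitialized cannot always prove that the two conditions are equivalent, so initializing such variables at declaration (e.g. int n = 0;) is the cheap way to keep -O2 builds warning-free.
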
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220705224818.4026623-3-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/usdt.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 586dc52d6fb9..a8cb8a96ddaf 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -329,7 +329,7 @@ static int get_syms(char ***symsp, size_t *cntp) struct hashmap *map; char buf[256]; FILE *f; - int err; + int err = 0; /* * The available_filter_functions contains many duplicates, @@ -404,7 +404,7 @@ static void test_bench_attach(void) double attach_delta, detach_delta; struct bpf_link *link = NULL; char **syms = NULL; - size_t cnt, i; + size_t cnt = 0, i; if (!ASSERT_OK(get_syms(&syms, &cnt), "get_syms")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 5f733d50b0d7..9ad9da0f215e 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -12,7 +12,7 @@ int lets_test_this(int); static volatile int idx = 2; static volatile __u64 bla = 0xFEDCBA9876543210ULL; -static volatile short nums[] = {-1, -2, -3, }; +static volatile short nums[] = {-1, -2, -3, -4}; static volatile struct { int x; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c index fb77a123fe89..874a846e298c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -63,7 +63,7 @@ static bool expect_str(char *buf, size_t size, const char *str, const char *name static void test_synproxy(bool xdp) { int server_fd = -1, client_fd = -1, accept_fd = -1; - char *prog_id, *prog_id_end; struct nstoken *ns = NULL; FILE *ctrl_file = NULL; char buf[CMD_OUT_BUF_SIZE]; + char *prog_id = NULL, *prog_id_end; struct nstoken *ns = NULL; FILE *ctrl_file = NULL; char buf[CMD_OUT_BUF_SIZE]; -- cgit v1.2.3 From 7c8121af1bfe29feedfa4fcb3154886660ecbe3a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 15:48:18 -0700 Subject: libbpf: Remove unnecessary usdt_rel_ip assignments Coverity detected that usdt_rel_ip is unconditionally overwritten anyway, so there is no need to initialize it with an unused value. Clean this up. 
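For reference, the computation that survives only rebases the recorded probe address when the ELF is loaded at a base different from the one recorded in the SDT note; a worked example with made-up addresses:

/* note.loc_addr  = 0x401234  (probe IP recorded at link time)
 * note.base_addr = 0x400000  (base address recorded in the note)
 * base_addr      = 0x500000  (actual base of the mapped ELF)
 */
usdt_abs_ip = 0x401234 + (0x500000 - 0x400000);	/* = 0x501234 */

usdt_rel_ip needs no such adjustment at this point because, as noted, it is assigned unconditionally further down.
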
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220705224818.4026623-4-andrii@kernel.org --- tools/lib/bpf/usdt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 5159207cbfd9..d18e37982344 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -652,11 +652,9 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * * * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation */ - usdt_rel_ip = usdt_abs_ip = note.loc_addr; - if (base_addr) { + usdt_abs_ip = note.loc_addr; + if (base_addr) usdt_abs_ip += base_addr - note.base_addr; - usdt_rel_ip += base_addr - note.base_addr; - } /* When attaching uprobes (which is what USDTs basically are) * kernel expects file offset to be specified, not a relative -- cgit v1.2.3 From 829be057dbc1e71383b8d7de8edb31dcf07b4aa0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 7 Jul 2022 02:31:52 +0200 Subject: wireguard: selftests: set fake real time in init Not all platforms have an RTC, and rather than trying to force one into each, it's much easier to just set a fixed time. This is necessary because WireGuard's latest handshakes parameter is returned in wallclock time, and if the system time isn't set, and the system is really fast, then this returns 0, which trips the test. Turning this on requires setting CONFIG_COMPAT_32BIT_TIME=y, as musl doesn't support settimeofday without it. Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/arch/arm.config | 1 + tools/testing/selftests/wireguard/qemu/arch/armeb.config | 1 + tools/testing/selftests/wireguard/qemu/arch/i686.config | 1 + tools/testing/selftests/wireguard/qemu/arch/m68k.config | 1 + tools/testing/selftests/wireguard/qemu/arch/mips.config | 1 + tools/testing/selftests/wireguard/qemu/arch/mipsel.config | 1 + tools/testing/selftests/wireguard/qemu/arch/powerpc.config | 1 + tools/testing/selftests/wireguard/qemu/init.c | 11 +++++++++++ 8 files changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/wireguard/qemu/arch/arm.config b/tools/testing/selftests/wireguard/qemu/arch/arm.config index fc7959bef9c2..0579c66be83e 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/arm.config +++ b/tools/testing/selftests/wireguard/qemu/arch/arm.config @@ -7,6 +7,7 @@ CONFIG_SERIAL_AMBA_PL011_CONSOLE=y CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_MMIO=y CONFIG_VIRTIO_CONSOLE=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyAMA0 wg.success=vport0p1 panic_on_warn=1" CONFIG_FRAME_WARN=1024 diff --git a/tools/testing/selftests/wireguard/qemu/arch/armeb.config b/tools/testing/selftests/wireguard/qemu/arch/armeb.config index f3066be81c19..2a3307bbe534 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/armeb.config +++ b/tools/testing/selftests/wireguard/qemu/arch/armeb.config @@ -7,6 +7,7 @@ CONFIG_SERIAL_AMBA_PL011_CONSOLE=y CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_MMIO=y CONFIG_VIRTIO_CONSOLE=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyAMA0 wg.success=vport0p1 panic_on_warn=1" CONFIG_CPU_BIG_ENDIAN=y diff --git a/tools/testing/selftests/wireguard/qemu/arch/i686.config b/tools/testing/selftests/wireguard/qemu/arch/i686.config index 6d90892a85a2..cd864b9be6fb 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/i686.config +++ 
b/tools/testing/selftests/wireguard/qemu/arch/i686.config @@ -1,6 +1,7 @@ CONFIG_ACPI=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" CONFIG_FRAME_WARN=1024 diff --git a/tools/testing/selftests/wireguard/qemu/arch/m68k.config b/tools/testing/selftests/wireguard/qemu/arch/m68k.config index 82c925e49beb..9639bfe06074 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/m68k.config +++ b/tools/testing/selftests/wireguard/qemu/arch/m68k.config @@ -5,5 +5,6 @@ CONFIG_MAC=y CONFIG_SERIAL_PMACZILOG=y CONFIG_SERIAL_PMACZILOG_TTYS=y CONFIG_SERIAL_PMACZILOG_CONSOLE=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" CONFIG_FRAME_WARN=1024 diff --git a/tools/testing/selftests/wireguard/qemu/arch/mips.config b/tools/testing/selftests/wireguard/qemu/arch/mips.config index d7ec63c17b30..2a84402353ab 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/mips.config +++ b/tools/testing/selftests/wireguard/qemu/arch/mips.config @@ -6,6 +6,7 @@ CONFIG_POWER_RESET=y CONFIG_POWER_RESET_SYSCON=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" CONFIG_FRAME_WARN=1024 diff --git a/tools/testing/selftests/wireguard/qemu/arch/mipsel.config b/tools/testing/selftests/wireguard/qemu/arch/mipsel.config index 18a498293737..56146a101e7e 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/mipsel.config +++ b/tools/testing/selftests/wireguard/qemu/arch/mipsel.config @@ -7,6 +7,7 @@ CONFIG_POWER_RESET=y CONFIG_POWER_RESET_SYSCON=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" CONFIG_FRAME_WARN=1024 diff --git a/tools/testing/selftests/wireguard/qemu/arch/powerpc.config b/tools/testing/selftests/wireguard/qemu/arch/powerpc.config index 5e04882e8e35..174a9ffe2a36 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/powerpc.config +++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc.config @@ -4,6 +4,7 @@ CONFIG_PPC_85xx=y CONFIG_PHYS_64BIT=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_COMPAT_32BIT_TIME=y CONFIG_MATH_EMULATION=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" diff --git a/tools/testing/selftests/wireguard/qemu/init.c b/tools/testing/selftests/wireguard/qemu/init.c index c9e128436546..3e49924dd77e 100644 --- a/tools/testing/selftests/wireguard/qemu/init.c +++ b/tools/testing/selftests/wireguard/qemu/init.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,15 @@ static void seed_rng(void) close(fd); } +static void set_time(void) +{ + if (time(NULL)) + return; + pretty_message("[+] Setting fake time..."); + if (stime(&(time_t){1433512680}) < 0) + panic("settimeofday()"); +} + static void mount_filesystems(void) { pretty_message("[+] Mounting filesystems..."); @@ -259,6 +269,7 @@ int main(int argc, char *argv[]) print_banner(); mount_filesystems(); seed_rng(); + set_time(); kmod_selftests(); enable_logging(); clear_leaks(); -- cgit v1.2.3 From 1f2f341a62639c7066ee4c76b7d9ebe867e0a1d5 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 7 Jul 2022 02:31:53 +0200 Subject: wireguard: selftests: use virt machine on m68k This should be a bit more stable hopefully. Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/Makefile | 5 +++-- tools/testing/selftests/wireguard/qemu/arch/m68k.config | 9 +++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index 7d1b80988d8a..ed07fdc4589c 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -208,10 +208,11 @@ QEMU_ARCH := m68k KERNEL_ARCH := m68k KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux KERNEL_CMDLINE := $(shell sed -n 's/CONFIG_CMDLINE=\(.*\)/\1/p' arch/m68k.config) +QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) -QEMU_MACHINE := -cpu host,accel=kvm -machine q800 -append $(KERNEL_CMDLINE) +QEMU_MACHINE := -cpu host,accel=kvm -machine virt -append $(KERNEL_CMDLINE) else -QEMU_MACHINE := -machine q800 -smp 1 -append $(KERNEL_CMDLINE) +QEMU_MACHINE := -machine virt -smp 1 -append $(KERNEL_CMDLINE) endif else ifeq ($(ARCH),riscv64) CHOST := riscv64-linux-musl diff --git a/tools/testing/selftests/wireguard/qemu/arch/m68k.config b/tools/testing/selftests/wireguard/qemu/arch/m68k.config index 9639bfe06074..39c48cba56b7 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/m68k.config +++ b/tools/testing/selftests/wireguard/qemu/arch/m68k.config @@ -1,10 +1,7 @@ CONFIG_MMU=y +CONFIG_VIRT=y CONFIG_M68KCLASSIC=y -CONFIG_M68040=y -CONFIG_MAC=y -CONFIG_SERIAL_PMACZILOG=y -CONFIG_SERIAL_PMACZILOG_TTYS=y -CONFIG_SERIAL_PMACZILOG_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y CONFIG_COMPAT_32BIT_TIME=y -CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" +CONFIG_CMDLINE="console=ttyGF0 wg.success=vport0p1 panic_on_warn=1" CONFIG_FRAME_WARN=1024 -- cgit v1.2.3 From 1a087eec257154e26a81a7a0a15380d7a2431765 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 7 Jul 2022 02:31:54 +0200 Subject: wireguard: selftests: always call kernel makefile These selftests are used for much more extensive changes than just the wireguard source files. So always call the kernel's build file, which will do something or nothing after checking the whole tree, per usual. Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index ed07fdc4589c..ce6176928a7d 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -19,8 +19,6 @@ endif MIRROR := https://download.wireguard.com/qemu-test/distfiles/ KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug) -rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) -WIREGUARD_SOURCES := $(call rwildcard,$(KERNEL_PATH)/drivers/net/wireguard/,*) default: qemu @@ -323,8 +321,9 @@ $(KERNEL_BUILD_PATH)/.config: $(TOOLCHAIN_PATH)/.installed kernel.config arch/$( cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config $(KERNEL_BUILD_PATH)/minimal.config $(if $(findstring yes,$(DEBUG_KERNEL)),cp debug.config $(KERNEL_BUILD_PATH) && cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config debug.config,) -$(KERNEL_BZIMAGE): $(TOOLCHAIN_PATH)/.installed $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-legacy-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES) +$(KERNEL_BZIMAGE): $(TOOLCHAIN_PATH)/.installed $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-legacy-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/wg $(BUILD_PATH)/init $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) +.PHONY: $(KERNEL_BZIMAGE) $(TOOLCHAIN_PATH)/$(CHOST)/include/linux/.installed: | $(KERNEL_BUILD_PATH)/.config $(TOOLCHAIN_PATH)/.installed rm -rf $(TOOLCHAIN_PATH)/$(CHOST)/include/linux -- cgit v1.2.3 From b83fdcd9fb8ad7e59f4188ba9ec221917f463a17 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 7 Jul 2022 02:31:55 +0200 Subject: wireguard: selftests: use microvm on x86 This makes for faster tests, faster compile time, and allows us to ditch ACPI finally. Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/Makefile | 10 ++++++---- tools/testing/selftests/wireguard/qemu/arch/i686.config | 7 +++++-- tools/testing/selftests/wireguard/qemu/arch/x86_64.config | 7 +++++-- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index ce6176928a7d..9700358e4337 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -107,20 +107,22 @@ CHOST := x86_64-linux-musl QEMU_ARCH := x86_64 KERNEL_ARCH := x86_64 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage +QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) -QEMU_MACHINE := -cpu host -machine q35,accel=kvm +QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi else -QEMU_MACHINE := -cpu max -machine q35 +QEMU_MACHINE := -cpu max -machine microvm -no-acpi endif else ifeq ($(ARCH),i686) CHOST := i686-linux-musl QEMU_ARCH := i386 KERNEL_ARCH := x86 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage +QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH)) -QEMU_MACHINE := -cpu host -machine q35,accel=kvm +QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi else -QEMU_MACHINE := -cpu max -machine q35 +QEMU_MACHINE := -cpu coreduo -machine microvm -no-acpi endif else ifeq ($(ARCH),mips64) CHOST := mips64-linux-musl diff --git a/tools/testing/selftests/wireguard/qemu/arch/i686.config b/tools/testing/selftests/wireguard/qemu/arch/i686.config index cd864b9be6fb..35b06502606f 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/i686.config +++ b/tools/testing/selftests/wireguard/qemu/arch/i686.config @@ -1,7 +1,10 @@ -CONFIG_ACPI=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_MMIO=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y CONFIG_COMPAT_32BIT_TIME=y CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" +CONFIG_CMDLINE="console=ttyS0 wg.success=vport0p1 panic_on_warn=1 reboot=t" CONFIG_FRAME_WARN=1024 diff --git a/tools/testing/selftests/wireguard/qemu/arch/x86_64.config b/tools/testing/selftests/wireguard/qemu/arch/x86_64.config index efa00693e08b..cf2d1376d121 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/x86_64.config +++ b/tools/testing/selftests/wireguard/qemu/arch/x86_64.config @@ -1,6 +1,9 @@ -CONFIG_ACPI=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_MMIO=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1 panic_on_warn=1" +CONFIG_CMDLINE="console=ttyS0 wg.success=vport0p1 panic_on_warn=1 reboot=t" CONFIG_FRAME_WARN=1280 -- cgit v1.2.3 From 38e0e4d04d4187c63d6b511396faae7db6a3cd9e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Jun 2022 12:57:42 +0200 Subject: x86/ibt, objtool: Don't discard text references from tracepoint section On Tue, Jun 28, 2022 at 04:28:58PM +0800, Pengfei Xu wrote: > # ./ftracetest > === Ftrace unit tests === > [1] Basic trace file check [PASS] > [2] Basic test for tracers [PASS] > [3] Basic trace clock test [PASS] > [4] Basic event tracing check [PASS] > [5] Change the ringbuffer size [PASS] > [6] Snapshot and tracing setting [PASS] > [7] trace_pipe and trace_marker [PASS] > [8] Test 
ftrace direct functions against tracers [UNRESOLVED] > [9] Test ftrace direct functions against kprobes [UNRESOLVED] > [10] Generic dynamic event - add/remove eprobe events [FAIL] > [11] Generic dynamic event - add/remove kprobe events > > It 100% reproduced in step 11 and then missing ENDBR BUG generated: > " > [ 9332.752836] mmiotrace: enabled CPU7. > [ 9332.788612] mmiotrace: disabled. > [ 9337.103426] traps: Missing ENDBR: syscall_regfunc+0x0/0xb0 It turns out that while syscall_regfunc() does have an ENDBR when generated, it gets sealed by objtool's .ibt_endbr_seal list. The only text references to this function: $ git grep syscall_regfunc include/linux/tracepoint.h:extern int syscall_regfunc(void); include/trace/events/syscalls.h: syscall_regfunc, syscall_unregfunc include/trace/events/syscalls.h: syscall_regfunc, syscall_unregfunc kernel/tracepoint.c:int syscall_regfunc(void) appear in the __tracepoints section, which objtool excludes from its reference scan, so objtool never sees the function's address being taken and wrongly seals its ENDBR. Stop discarding text references from the __tracepoints section. Fixes: 3c6f9f77e618 ("objtool: Rework ibt and extricate from stack validation") Reported-by: Pengfei Xu Link: https://lkml.kernel.org/r/Yrrepdaow4F5kqG0@hirez.programming.kicks-ass.net --- tools/objtool/check.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 864bb9dd3584..57153e00349c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3826,8 +3826,7 @@ static int validate_ibt(struct objtool_file *file) !strcmp(sec->name, "__bug_table") || !strcmp(sec->name, "__ex_table") || !strcmp(sec->name, "__jump_table") || - !strcmp(sec->name, "__mcount_loc") || - !strcmp(sec->name, "__tracepoints")) + !strcmp(sec->name, "__mcount_loc")) continue; list_for_each_entry(reloc, &sec->reloc->reloc_list, list) -- cgit v1.2.3 From 935dc35c75318fa213d26808ad8bb130fb0b486e Mon Sep 17 00:00:00 2001 From: Yixun Lan Date: Wed, 6 Jul 2022 22:02:04 +0800 Subject: libbpf, riscv: Use a0 for RC register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the RISC-V calling convention register usage here [0], a0 is used as the return value register, so rename it to make it consistent with the spec. [0] section 18.2, table 18.2 https://riscv.org/wp-content/uploads/2015/01/riscv-calling.pdf Fixes: 589fed479ba1 ("riscv, libbpf: Add RISC-V (RV64) support to bpf_tracing.h") Signed-off-by: Yixun Lan Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Acked-by: Amjad OULED-AMEUR Link: https://lore.kernel.org/bpf/20220706140204.47926-1-dlan@gentoo.org --- tools/lib/bpf/bpf_tracing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 01ce121c302d..11f9096407fc 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -233,7 +233,7 @@ struct pt_regs___arm64 { #define __PT_PARM5_REG a4 #define __PT_RET_REG ra #define __PT_FP_REG s0 -#define __PT_RC_REG a5 +#define __PT_RC_REG a0 #define __PT_SP_REG sp #define __PT_IP_REG pc /* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */ -- cgit v1.2.3 From 2b4b2621fd6401865b31b9f403e4b936b7439e94 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 5 Jul 2022 12:00:18 -0700 Subject: selftests/bpf: Add benchmark for local_storage RCU Tasks Trace usage This benchmark measures grace period latency and kthread cpu usage of RCU Tasks Trace when many processes are creating/deleting BPF local_storage.
Intent here is to quantify improvement on these metrics after Paul's recent RCU Tasks patches [0]. Specifically, fork 15k tasks which call a bpf prog that creates/destroys task local_storage and sleep in a loop, resulting in many call_rcu_tasks_trace calls. To determine grace period latency, trace time elapsed between rcu_tasks_trace_pregp_step and rcu_tasks_trace_postgp; for cpu usage look at rcu_tasks_trace_kthread's stime in /proc/PID/stat. On my virtualized test environment (Skylake, 8 cpus) benchmark results demonstrate significant improvement: BEFORE Paul's patches: SUMMARY tasks_trace grace period latency avg 22298.551 us stddev 1302.165 us SUMMARY ticks per tasks_trace grace period avg 2.291 stddev 0.324 AFTER Paul's patches: SUMMARY tasks_trace grace period latency avg 16969.197 us stddev 2525.053 us SUMMARY ticks per tasks_trace grace period avg 1.146 stddev 0.178 Note that since these patches are not in bpf-next, benchmarking was done by cherry-picking this patch onto the rcu tree. [0] https://lore.kernel.org/rcu/20220620225402.GA3842369@paulmck-ThinkPad-P17-Gen-1/ Signed-off-by: Dave Marchevsky Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220705190018.3239050-1-davemarchevsky@fb.com --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 42 +++ tools/testing/selftests/bpf/bench.h | 12 + .../benchs/bench_local_storage_rcu_tasks_trace.c | 281 +++++++++++++++++++++ .../run_bench_local_storage_rcu_tasks_trace.sh | 11 + .../progs/local_storage_rcu_tasks_trace_bench.c | 67 +++++ 6 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh create mode 100644 tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e32a28fe8bc1..dfaac97222af 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -574,6 +574,7 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h +$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -587,7 +588,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_loop.o \ $(OUTPUT)/bench_strncmp.o \ $(OUTPUT)/bench_bpf_hashmap_full_update.o \ - $(OUTPUT)/bench_local_storage.o + $(OUTPUT)/bench_local_storage.o \ + $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 1e7b5d4b1f11..c1f20a147462 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -79,6 +79,43 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns) hits_per_sec, hits_per_prod, drops_per_sec, hits_per_sec + drops_per_sec); } +void +grace_period_latency_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats
*gp_stat) +{ + int i; + + memset(gp_stat, 0, sizeof(struct basic_stats)); + + for (i = 0; i < res_cnt; i++) + gp_stat->mean += res[i].gp_ns / 1000.0 / (double)res[i].gp_ct / (0.0 + res_cnt); + +#define IT_MEAN_DIFF (res[i].gp_ns / 1000.0 / (double)res[i].gp_ct - gp_stat->mean) + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) + gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0); + } + gp_stat->stddev = sqrt(gp_stat->stddev); +#undef IT_MEAN_DIFF +} + +void +grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat) +{ + int i; + + memset(gp_stat, 0, sizeof(struct basic_stats)); + for (i = 0; i < res_cnt; i++) + gp_stat->mean += res[i].stime / (double)res[i].gp_ct / (0.0 + res_cnt); + +#define IT_MEAN_DIFF (res[i].stime / (double)res[i].gp_ct - gp_stat->mean) + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) + gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0); + } + gp_stat->stddev = sqrt(gp_stat->stddev); +#undef IT_MEAN_DIFF +} + void hits_drops_report_final(struct bench_res res[], int res_cnt) { int i; @@ -236,6 +273,7 @@ extern struct argp bench_ringbufs_argp; extern struct argp bench_bloom_map_argp; extern struct argp bench_bpf_loop_argp; extern struct argp bench_local_storage_argp; +extern struct argp bench_local_storage_rcu_tasks_trace_argp; extern struct argp bench_strncmp_argp; static const struct argp_child bench_parsers[] = { @@ -244,6 +282,8 @@ static const struct argp_child bench_parsers[] = { { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 }, { &bench_local_storage_argp, 0, "local_storage benchmark", 0 }, { &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 }, + { &bench_local_storage_rcu_tasks_trace_argp, 0, + "local_storage RCU Tasks Trace slowdown benchmark", 0 }, {}, }; @@ -449,6 +489,7 @@ extern const struct bench bench_bpf_hashmap_full_update; extern const struct bench bench_local_storage_cache_seq_get; extern const struct bench bench_local_storage_cache_interleaved_get; extern const struct bench bench_local_storage_cache_hashmap_control; +extern const struct bench bench_local_storage_tasks_trace; static const struct bench *benchs[] = { &bench_count_global, @@ -487,6 +528,7 @@ static const struct bench *benchs[] = { &bench_local_storage_cache_seq_get, &bench_local_storage_cache_interleaved_get, &bench_local_storage_cache_hashmap_control, + &bench_local_storage_tasks_trace, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 4b15286753ba..d748255877e2 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -30,11 +30,19 @@ struct env { struct cpu_set cons_cpus; }; +struct basic_stats { + double mean; + double stddev; +}; + struct bench_res { long hits; long drops; long false_hits; long important_hits; + unsigned long gp_ns; + unsigned long gp_ct; + unsigned int stime; }; struct bench { @@ -65,6 +73,10 @@ void ops_report_final(struct bench_res res[], int res_cnt); void local_storage_report_progress(int iter, struct bench_res *res, long delta_ns); void local_storage_report_final(struct bench_res res[], int res_cnt); +void grace_period_latency_basic_stats(struct bench_res res[], int res_cnt, + struct basic_stats *gp_stat); +void grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt, + struct basic_stats *gp_stat); static inline __u64 get_time_ns(void) { diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c 
b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c new file mode 100644 index 000000000000..43f109d93130 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include <argp.h> + +#include <sys/prctl.h> +#include "local_storage_rcu_tasks_trace_bench.skel.h" +#include "bench.h" + +#include <signal.h> + +static struct { + __u32 nr_procs; + __u32 kthread_pid; + bool quiet; +} args = { + .nr_procs = 1000, + .kthread_pid = 0, + .quiet = false, +}; + +enum { + ARG_NR_PROCS = 7000, + ARG_KTHREAD_PID = 7001, + ARG_QUIET = 7002, +}; + +static const struct argp_option opts[] = { + { "nr_procs", ARG_NR_PROCS, "NR_PROCS", 0, + "Set number of user processes to spin up"}, + { "kthread_pid", ARG_KTHREAD_PID, "PID", 0, + "Pid of rcu_tasks_trace kthread for ticks tracking"}, + { "quiet", ARG_QUIET, "{0,1}", 0, + "If true, don't report progress"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_NR_PROCS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "invalid nr_procs\n"); + argp_usage(state); + } + args.nr_procs = ret; + break; + case ARG_KTHREAD_PID: + ret = strtol(arg, NULL, 10); + if (ret < 1) { + fprintf(stderr, "invalid kthread_pid\n"); + argp_usage(state); + } + args.kthread_pid = ret; + break; + case ARG_QUIET: + ret = strtol(arg, NULL, 10); + if (ret < 0 || ret > 1) { + fprintf(stderr, "invalid quiet %ld\n", ret); + argp_usage(state); + } + args.quiet = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_local_storage_rcu_tasks_trace_argp = { + .options = opts, + .parser = parse_arg, +}; + +#define MAX_SLEEP_PROCS 150000 + +static void validate(void) +{ + if (env.producer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); + exit(1); + } + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } + + if (args.nr_procs > MAX_SLEEP_PROCS) { + fprintf(stderr, "benchmark supports up to %u sleeper procs!\n", + MAX_SLEEP_PROCS); + exit(1); + } +} + +static long kthread_pid_ticks(void) +{ + char procfs_path[100]; + long stime; + FILE *f; + + if (!args.kthread_pid) + return -1; + + sprintf(procfs_path, "/proc/%u/stat", args.kthread_pid); + f = fopen(procfs_path, "r"); + if (!f) { + fprintf(stderr, "couldn't open %s, exiting\n", procfs_path); + goto err_out; + } + if (fscanf(f, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %ld", &stime) != 1) { + fprintf(stderr, "fscanf of %s failed, exiting\n", procfs_path); + goto err_out; + } + fclose(f); + return stime; + +err_out: + if (f) + fclose(f); + exit(1); + return 0; +} + +static struct { + struct local_storage_rcu_tasks_trace_bench *skel; + long prev_kthread_stime; +} ctx; + +static void sleep_and_loop(void) +{ + while (true) { + sleep(rand() % 4); + syscall(__NR_getpgid); + } +} + +static void local_storage_tasks_trace_setup(void) +{ + int i, err, forkret, runner_pid; + + runner_pid = getpid(); + + for (i = 0; i < args.nr_procs; i++) { + forkret = fork(); + if (forkret < 0) { + fprintf(stderr, "Error forking sleeper proc %u of %u, exiting\n", i, + args.nr_procs); + goto err_out; + } + + if (!forkret) { + err = prctl(PR_SET_PDEATHSIG, SIGKILL); + if (err < 0) { + fprintf(stderr, "prctl failed with err %d, exiting\n", errno); + goto err_out; + } + + if
(getppid() != runner_pid) { + fprintf(stderr, "Runner died while spinning up procs, exiting\n"); + goto err_out; + } + sleep_and_loop(); + } + } + printf("Spun up %u procs (our pid %d)\n", args.nr_procs, runner_pid); + + setup_libbpf(); + + ctx.skel = local_storage_rcu_tasks_trace_bench__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "Error doing open_and_load, exiting\n"); + goto err_out; + } + + ctx.prev_kthread_stime = kthread_pid_ticks(); + + if (!bpf_program__attach(ctx.skel->progs.get_local)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + if (!bpf_program__attach(ctx.skel->progs.pregp_step)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + if (!bpf_program__attach(ctx.skel->progs.postgp)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + return; +err_out: + exit(1); +} + +static void measure(struct bench_res *res) +{ + long ticks; + + res->gp_ct = atomic_swap(&ctx.skel->bss->gp_hits, 0); + res->gp_ns = atomic_swap(&ctx.skel->bss->gp_times, 0); + ticks = kthread_pid_ticks(); + res->stime = ticks - ctx.prev_kthread_stime; + ctx.prev_kthread_stime = ticks; +} + +static void *consumer(void *input) +{ + return NULL; +} + +static void *producer(void *input) +{ + while (true) + syscall(__NR_getpgid); + return NULL; +} + +static void report_progress(int iter, struct bench_res *res, long delta_ns) +{ + if (ctx.skel->bss->unexpected) { + fprintf(stderr, "Error: Unexpected order of bpf prog calls (postgp after pregp)."); + fprintf(stderr, "Data can't be trusted, exiting\n"); + exit(1); + } + + if (args.quiet) + return; + + printf("Iter %d\t avg tasks_trace grace period latency\t%lf ns\n", + iter, res->gp_ns / (double)res->gp_ct); + printf("Iter %d\t avg ticks per tasks_trace grace period\t%lf\n", + iter, res->stime / (double)res->gp_ct); +} + +static void report_final(struct bench_res res[], int res_cnt) +{ + struct basic_stats gp_stat; + + grace_period_latency_basic_stats(res, res_cnt, &gp_stat); + printf("SUMMARY tasks_trace grace period latency"); + printf("\tavg %.3lf us\tstddev %.3lf us\n", gp_stat.mean, gp_stat.stddev); + grace_period_ticks_basic_stats(res, res_cnt, &gp_stat); + printf("SUMMARY ticks per tasks_trace grace period"); + printf("\tavg %.3lf\tstddev %.3lf\n", gp_stat.mean, gp_stat.stddev); +} + +/* local-storage-tasks-trace: Benchmark performance of BPF local_storage's use + * of RCU Tasks-Trace. + * + * Stress RCU Tasks Trace by forking many tasks, all of which do no work aside + * from sleep() loop, and creating/destroying BPF task-local storage on wakeup. + * The number of forked tasks is configurable. + * + * exercising code paths which call call_rcu_tasks_trace while there are many + * thousands of tasks on the system should result in RCU Tasks-Trace having to + * do a noticeable amount of work. + * + * This should be observable by measuring rcu_tasks_trace_kthread CPU usage + * after the grace period has ended, or by measuring grace period latency. 
+ * + * This benchmark uses both approaches, attaching to rcu_tasks_trace_pregp_step + * and rcu_tasks_trace_postgp functions to measure grace period latency and + * using /proc/PID/stat to measure rcu_tasks_trace_kthread kernel ticks. + */ +const struct bench bench_local_storage_tasks_trace = { + .name = "local-storage-tasks-trace", + .validate = validate, + .setup = local_storage_tasks_trace_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = report_progress, + .report_final = report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh b/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh new file mode 100755 index 000000000000..5dac1f02892c --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +kthread_pid=`pgrep rcu_tasks_trace_kthread` + +if [ -z $kthread_pid ]; then + echo "error: Couldn't find rcu_tasks_trace_kthread" + exit 1 +fi + +./bench --nr_procs 15000 --kthread_pid $kthread_pid -d 600 --quiet 1 local-storage-tasks-trace diff --git a/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c b/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c new file mode 100644 index 000000000000..03bf69f49075 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} task_storage SEC(".maps"); + +long hits; +long gp_hits; +long gp_times; +long current_gp_start; +long unexpected; +bool postgp_seen; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int get_local(void *ctx) +{ + struct task_struct *task; + int idx; + int *s; + + idx = 0; + task = bpf_get_current_task_btf(); + s = bpf_task_storage_get(&task_storage, task, &idx, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!s) + return 0; + + *s = 3; + bpf_task_storage_delete(&task_storage, task); + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fentry/rcu_tasks_trace_pregp_step") +int pregp_step(struct pt_regs *ctx) +{ + current_gp_start = bpf_ktime_get_ns(); + return 0; +} + +SEC("fentry/rcu_tasks_trace_postgp") +int postgp(struct pt_regs *ctx) +{ + if (!current_gp_start && postgp_seen) { + /* Will only happen if prog tracing rcu_tasks_trace_pregp_step doesn't + * execute before this prog + */ + __sync_add_and_fetch(&unexpected, 1); + return 0; + } + + __sync_add_and_fetch(&gp_times, bpf_ktime_get_ns() - current_gp_start); + __sync_add_and_fetch(&gp_hits, 1); + current_gp_start = 0; + postgp_seen = true; + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 20404a808593a6812cb485bec16256e702ff94c3 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:58 -0700 Subject: selftests/sgx: Add test for EPCM permission changes EPCM permission changes can be made from within the enclave (to relax permissions) or from outside it (to restrict permissions). Kernel support is needed when permissions are restricted to be able to call the privileged ENCLS[EMODPR] instruction.
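As a rough sketch of what the restriction side looks like from user space (this mirrors the ioctl() usage exercised by the test below; encl_fd and page_offset are illustrative placeholders):

	struct sgx_enclave_restrict_permissions ioc = {
		.offset      = page_offset,   /* offset of target page within enclave */
		.length      = PAGE_SIZE,
		.permissions = SGX_SECINFO_R, /* restrict EPCM permissions to read-only */
	};

	/* Kernel runs ENCLS[EMODPR]; the enclave must ENCLU[EACCEPT] the change. */
	if (ioctl(encl_fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc) == -1)
		perror("SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS");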
EPCM permissions can be relaxed via ENCLU[EMODPE] from within the enclave but the enclave still depends on the kernel to install PTEs with the needed permissions. Add a test that exercises a few of the enclave page permission flows: 1) Test starts with a RW (from enclave and kernel perspective) enclave page that is mapped via a RW VMA. 2) Use the SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl() to restrict the enclave (EPCM) page permissions to read-only. 3) Run ENCLU[EACCEPT] from within the enclave to accept the new page permissions. 4) Attempt to write to the enclave page from within the enclave - this should fail with a page fault on the EPCM permissions since the page table entry continues to allow RW access. 5) Restore EPCM permissions to RW by running ENCLU[EMODPE] from within the enclave. 6) Attempt to write to the enclave page from within the enclave - this should succeed since both EPCM and PTE permissions allow this access. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/2617bf2b2d1e27ca1d0096e1192ae5896baf3f80.1652137848.git.reinette.chatre@intel.com --- tools/testing/selftests/sgx/defines.h | 15 +++ tools/testing/selftests/sgx/main.c | 214 ++++++++++++++++++++++++++++++++ tools/testing/selftests/sgx/test_encl.c | 38 ++++++ 3 files changed, 267 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 02d775789ea7..b638eb98c80c 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -24,6 +24,8 @@ enum encl_op_type { ENCL_OP_PUT_TO_ADDRESS, ENCL_OP_GET_FROM_ADDRESS, ENCL_OP_NOP, + ENCL_OP_EACCEPT, + ENCL_OP_EMODPE, ENCL_OP_MAX, }; @@ -53,4 +55,17 @@ struct encl_op_get_from_addr { uint64_t addr; }; +struct encl_op_eaccept { + struct encl_op_header header; + uint64_t epc_addr; + uint64_t flags; + uint64_t ret; +}; + +struct encl_op_emodpe { + struct encl_op_header header; + uint64_t epc_addr; + uint64_t flags; +}; + #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index dd74fa42302e..46eac09cd955 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -25,6 +25,18 @@ static const uint64_t MAGIC = 0x1122334455667788ULL; static const uint64_t MAGIC2 = 0x8877665544332211ULL; vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave; +/* + * Security Information (SECINFO) data structure needed by a few SGX + * instructions (eg. ENCLU[EACCEPT] and ENCLU[EMODPE]) holds meta-data + * about an enclave page. &enum sgx_secinfo_page_state specifies the + * secinfo flags used for page state. + */ +enum sgx_secinfo_page_state { + SGX_SECINFO_PENDING = (1 << 3), + SGX_SECINFO_MODIFIED = (1 << 4), + SGX_SECINFO_PR = (1 << 5), +}; + struct vdso_symtab { Elf64_Sym *elf_symtab; const char *elf_symstrtab; @@ -555,4 +567,206 @@ TEST_F(enclave, pte_permissions) EXPECT_EQ(self->run.exception_addr, 0); } +/* + * Enclave page permission test. + * + * Modify and restore enclave page's EPCM (enclave) permissions from + * outside enclave (ENCLS[EMODPR] via kernel) as well as from within + * enclave (via ENCLU[EMODPE]). Check for page fault if + * VMA allows access but EPCM permissions do not. 
+ */ +TEST_F(enclave, epcm_permissions) +{ + struct sgx_enclave_restrict_permissions restrict_ioc; + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_eaccept eaccept_op; + struct encl_op_emodpe emodpe_op; + unsigned long data_start; + int ret, errno_save; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + /* + * Ensure kernel supports needed ioctl() and system supports needed + * commands. + */ + memset(&restrict_ioc, 0, sizeof(restrict_ioc)); + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, + &restrict_ioc); + errno_save = ret == -1 ? errno : 0; + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + ASSERT_EQ(ret, -1); + + /* ret == -1 */ + if (errno_save == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); + else if (errno_save == ENODEV) + SKIP(return, "System does not support SGX2"); + + /* + * Page that will have its permissions changed is the second data + * page in the .data segment. This forms part of the local encl_buffer + * within the enclave. + * + * At start of test @data_start should have EPCM as well as PTE and + * VMA permissions of RW. + */ + + data_start = self->encl.encl_base + + encl_get_data_offset(&self->encl) + PAGE_SIZE; + + /* + * Sanity check that page at @data_start is writable before making + * any changes to page permissions. + * + * Start by writing MAGIC to test page. + */ + put_addr_op.value = MAGIC; + put_addr_op.addr = data_start; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory that was just written to, confirming that + * page is writable. + */ + get_addr_op.value = 0; + get_addr_op.addr = data_start; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Change EPCM permissions to read-only. Kernel still considers + * the page writable. + */ + memset(&restrict_ioc, 0, sizeof(restrict_ioc)); + + restrict_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE; + restrict_ioc.length = PAGE_SIZE; + restrict_ioc.permissions = SGX_SECINFO_R; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, + &restrict_ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, 0); + EXPECT_EQ(errno_save, 0); + EXPECT_EQ(restrict_ioc.result, 0); + EXPECT_EQ(restrict_ioc.count, 4096); + + /* + * EPCM permissions changed from kernel, need to EACCEPT from enclave. 
+ */ + eaccept_op.epc_addr = data_start; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_REG | SGX_SECINFO_PR; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* + * EPCM permissions of page is now read-only, expect #PF + * on EPCM when attempting to write to page from within enclave. + */ + put_addr_op.value = MAGIC2; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EQ(self->run.function, ERESUME); + EXPECT_EQ(self->run.exception_vector, 14); + EXPECT_EQ(self->run.exception_error_code, 0x8007); + EXPECT_EQ(self->run.exception_addr, data_start); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* + * Received AEX but cannot return to enclave at same entrypoint, + * need different TCS from where EPCM permission can be made writable + * again. + */ + self->run.tcs = self->encl.encl_base + PAGE_SIZE; + + /* + * Enter enclave at new TCS to change EPCM permissions to be + * writable again and thus fix the page fault that triggered the + * AEX. + */ + + emodpe_op.epc_addr = data_start; + emodpe_op.flags = SGX_SECINFO_R | SGX_SECINFO_W; + emodpe_op.header.type = ENCL_OP_EMODPE; + + EXPECT_EQ(ENCL_CALL(&emodpe_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Attempt to return to main TCS to resume execution at faulting + * instruction, PTE should continue to allow writing to the page. + */ + self->run.tcs = self->encl.encl_base; + + /* + * Wrong page permissions that caused original fault has + * now been fixed via EPCM permissions. + * Resume execution in main TCS to re-attempt the memory access. 
+ */ + self->run.tcs = self->encl.encl_base; + + EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, + ERESUME, 0, 0, + &self->run), + 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + get_addr_op.value = 0; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC2); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.user_data, 0); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 4fca01cfd898..5b6c65331527 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -11,6 +11,42 @@ */ static uint8_t encl_buffer[8192] = { 1 }; +enum sgx_enclu_function { + EACCEPT = 0x5, + EMODPE = 0x6, +}; + +static void do_encl_emodpe(void *_op) +{ + struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; + struct encl_op_emodpe *op = _op; + + secinfo.flags = op->flags; + + asm volatile(".byte 0x0f, 0x01, 0xd7" + : + : "a" (EMODPE), + "b" (&secinfo), + "c" (op->epc_addr)); +} + +static void do_encl_eaccept(void *_op) +{ + struct sgx_secinfo secinfo __aligned(sizeof(struct sgx_secinfo)) = {0}; + struct encl_op_eaccept *op = _op; + int rax; + + secinfo.flags = op->flags; + + asm volatile(".byte 0x0f, 0x01, 0xd7" + : "=a" (rax) + : "a" (EACCEPT), + "b" (&secinfo), + "c" (op->epc_addr)); + + op->ret = rax; +} + static void *memcpy(void *dest, const void *src, size_t n) { size_t i; @@ -62,6 +98,8 @@ void encl_body(void *rdi, void *rsi) do_encl_op_put_to_addr, do_encl_op_get_from_addr, do_encl_op_nop, + do_encl_eaccept, + do_encl_emodpe, }; struct encl_op_header *op = (struct encl_op_header *)rdi; -- cgit v1.2.3 From 7088c81f94733fd5d103f8975d5e1d1fad12f665 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:08:59 -0700 Subject: selftests/sgx: Add test for TCS page permission changes Kernel should not allow permission changes on TCS pages. Add test to confirm this behavior. Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/0121ad1b21befb94519072e2c18b89aa5dca00d4.1652137848.git.reinette.chatre@intel.com --- tools/testing/selftests/sgx/main.c | 71 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 46eac09cd955..016ae3e5f398 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -121,6 +121,24 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) return NULL; } +/* + * Return the offset in the enclave where the TCS segment can be found. + * The first RW segment loaded is the TCS. + */ +static off_t encl_get_tcs_offset(struct encl *encl) +{ + int i; + + for (i = 0; i < encl->nr_segments; i++) { + struct encl_segment *seg = &encl->segment_tbl[i]; + + if (i == 0 && seg->prot == (PROT_READ | PROT_WRITE)) + return seg->offset; + } + + return -1; +} + /* * Return the offset in the enclave where the data segment can be found. 
* The first RW segment loaded is the TCS, skip that to get info on the @@ -567,6 +585,59 @@ TEST_F(enclave, pte_permissions) EXPECT_EQ(self->run.exception_addr, 0); } +/* + * Modifying permissions of TCS page should not be possible. + */ +TEST_F(enclave, tcs_permissions) +{ + struct sgx_enclave_restrict_permissions ioc; + int ret, errno_save; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + memset(&ioc, 0, sizeof(ioc)); + + /* + * Ensure kernel supports needed ioctl() and system supports needed + * commands. + */ + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc); + errno_save = ret == -1 ? errno : 0; + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + ASSERT_EQ(ret, -1); + + /* ret == -1 */ + if (errno_save == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS ioctl()"); + else if (errno_save == ENODEV) + SKIP(return, "System does not support SGX2"); + + /* + * Attempt to make TCS page read-only. This is not allowed and + * should be prevented by the kernel. + */ + ioc.offset = encl_get_tcs_offset(&self->encl); + ioc.length = PAGE_SIZE; + ioc.permissions = SGX_SECINFO_R; + + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &ioc); + errno_save = ret == -1 ? errno : 0; + + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno_save, EINVAL); + EXPECT_EQ(ioc.result, 0); + EXPECT_EQ(ioc.count, 0); +} + /* * Enclave page permission test. * -- cgit v1.2.3 From 67f1f70a23d117628d5cfc78bcdf8eb9d2d04874 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:00 -0700 Subject: selftests/sgx: Test two different SGX2 EAUG flows Enclave pages can be added to an initialized enclave when an address belonging to the enclave but without a backing page is accessed from within the enclave. Accessing memory without a backing enclave page from within an enclave can happen in different ways: 1) Pre-emptively run ENCLU[EACCEPT]. Since the addition of a page always needs to be accepted by the enclave via ENCLU[EACCEPT], this flow is efficient: the first execution of ENCLU[EACCEPT] triggers the addition of the page, and when execution returns to the same instruction the second execution succeeds as the acceptance of the page. 2) A direct read or write. When a direct read or write triggers the page addition, execution cannot simply resume at the instruction (read/write) that triggered the fault; instead the enclave needs to be entered at a different entry point to run the needed ENCLU[EACCEPT] before execution can return to the original entry point and the read/write instruction that faulted. Add tests for both flows.
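As a rough sketch of flow 1), the pre-emptive ENCLU[EACCEPT] (mirroring the eaccept_op usage in the tests below; new_page_addr and run are illustrative placeholders):

	struct encl_op_eaccept eaccept_op = {
		.header.type = ENCL_OP_EACCEPT,
		.epc_addr    = new_page_addr,  /* in enclave range, not yet backed */
		.flags       = SGX_SECINFO_R | SGX_SECINFO_W |
			       SGX_SECINFO_REG | SGX_SECINFO_PENDING,
	};

	/*
	 * The first execution faults and triggers EAUG in the kernel; on
	 * return to the same instruction the EACCEPT succeeds, accepting
	 * the newly added page.
	 */
	ENCL_CALL(&eaccept_op, &run, true);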
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/0c321e0e32790ac1de742ce5017a331e6d902ac1.1652137848.git.reinette.chatre@intel.com --- tools/testing/selftests/sgx/main.c | 250 +++++++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 016ae3e5f398..79c08e347112 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -86,6 +86,15 @@ static bool vdso_get_symtab(void *addr, struct vdso_symtab *symtab) return true; } +static inline int sgx2_supported(void) +{ + unsigned int eax, ebx, ecx, edx; + + __cpuid_count(SGX_CPUID, 0x0, eax, ebx, ecx, edx); + + return eax & 0x2; +} + static unsigned long elf_sym_hash(const char *name) { unsigned long h = 0, high; @@ -840,4 +849,245 @@ TEST_F(enclave, epcm_permissions) EXPECT_EQ(self->run.exception_addr, 0); } +/* + * Test the addition of pages to an initialized enclave via writing to + * a page belonging to the enclave's address space but was not added + * during enclave creation. + */ +TEST_F(enclave, augment) +{ + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_eaccept eaccept_op; + size_t total_size = 0; + void *addr; + int i; + + if (!sgx2_supported()) + SKIP(return, "SGX2 not supported"); + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + total_size += seg->size; + } + + /* + * Actual enclave size is expected to be larger than the loaded + * test enclave since enclave size must be a power of 2 in bytes + * and test_encl does not consume it all. + */ + EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); + + /* + * Create memory mapping for the page that will be added. New + * memory mapping is for one page right after all existing + * mappings. + * Kernel will allow new mapping using any permissions if it + * falls into the enclave's address range but not backed + * by existing enclave pages. + */ + addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED | MAP_FIXED, self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* + * Attempt to write to the new page from within enclave. + * Expected to fail since page is not (yet) part of the enclave. + * The first #PF will trigger the addition of the page to the + * enclave, but since the new page needs an EACCEPT from within the + * enclave before it can be used it would not be possible + * to successfully return to the failing instruction. This is the + * cause of the second #PF captured here having the SGX bit set, + * it is from hardware preventing the page from being used. 
+ */ + put_addr_op.value = MAGIC; + put_addr_op.addr = (unsigned long)addr; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EQ(self->run.function, ERESUME); + EXPECT_EQ(self->run.exception_vector, 14); + EXPECT_EQ(self->run.exception_addr, (unsigned long)addr); + + if (self->run.exception_error_code == 0x6) { + munmap(addr, PAGE_SIZE); + SKIP(return, "Kernel does not support adding pages to initialized enclave"); + } + + EXPECT_EQ(self->run.exception_error_code, 0x8007); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* Handle AEX by running EACCEPT from new entry point. */ + self->run.tcs = self->encl.encl_base + PAGE_SIZE; + + eaccept_op.epc_addr = self->encl.encl_base + total_size; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* Can now return to main TCS to resume execution. */ + self->run.tcs = self->encl.encl_base; + + EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0, 0, + ERESUME, 0, 0, + &self->run), + 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory from newly added page that was just written to, + * confirming that data previously written (MAGIC) is present. + */ + get_addr_op.value = 0; + get_addr_op.addr = (unsigned long)addr; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + munmap(addr, PAGE_SIZE); +} + +/* + * Test for the addition of pages to an initialized enclave via a + * pre-emptive run of EACCEPT on page to be added. + */ +TEST_F(enclave, augment_via_eaccept) +{ + struct encl_op_get_from_addr get_addr_op; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_eaccept eaccept_op; + size_t total_size = 0; + void *addr; + int i; + + if (!sgx2_supported()) + SKIP(return, "SGX2 not supported"); + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + total_size += seg->size; + } + + /* + * Actual enclave size is expected to be larger than the loaded + * test enclave since enclave size must be a power of 2 in bytes while + * test_encl does not consume it all. + */ + EXPECT_LT(total_size + PAGE_SIZE, self->encl.encl_size); + + /* + * mmap() a page at end of existing enclave to be used for dynamic + * EPC page. + * + * Kernel will allow new mapping using any permissions if it + * falls into the enclave's address range but not backed + * by existing enclave pages. 
+ */ + + addr = mmap((void *)self->encl.encl_base + total_size, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_FIXED, + self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + /* + * Run EACCEPT on new page to trigger the #PF->EAUG->EACCEPT(again + * without a #PF). All should be transparent to userspace. + */ + eaccept_op.epc_addr = self->encl.encl_base + total_size; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + if (self->run.exception_vector == 14 && + self->run.exception_error_code == 4 && + self->run.exception_addr == self->encl.encl_base + total_size) { + munmap(addr, PAGE_SIZE); + SKIP(return, "Kernel does not support adding pages to initialized enclave"); + } + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* + * New page should be accessible from within enclave - attempt to + * write to it. + */ + put_addr_op.value = MAGIC; + put_addr_op.addr = (unsigned long)addr; + put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + /* + * Read memory from newly added page that was just written to, + * confirming that data previously written (MAGIC) is present. + */ + get_addr_op.value = 0; + get_addr_op.addr = (unsigned long)addr; + get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS; + + EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0); + + EXPECT_EQ(get_addr_op.value, MAGIC); + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + + munmap(addr, PAGE_SIZE); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 7eb4370152beb2f1e25543088bce2e3f0621ab81 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:01 -0700 Subject: selftests/sgx: Introduce dynamic entry point The test enclave (test_encl.elf) is built with two initialized Thread Control Structures (TCS) included in the binary. Both TCS are initialized with the same entry point, encl_entry, that correctly computes the absolute address of the stack based on the stack of each TCS that is also built into the binary. A new TCS can be added dynamically to the enclave and needs to be initialized with an entry point used to enter the enclave. Since the existing entry point, encl_entry, assumes that the TCS and its stack exist at particular offsets within the binary, it is not able to handle a dynamically added TCS and its stack. Introduce a new entry point, encl_dyn_entry, that initializes the absolute address of that thread's stack to the address immediately preceding the TCS itself. It is now possible to dynamically add a contiguous memory region to the enclave with the new stack preceding the new TCS. With the new TCS initialized with encl_dyn_entry as entry point, the absolute address of the stack is computed correctly on entry.
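The layout this entry point assumes can be sketched as follows (names are illustrative; the address computation mirrors the page type flow test added later in this series):

	/* Three pages dynamically added at the end of the enclave: */
	stack_end = encl_base + total_size;                 /* stack, regular page */
	tcs       = encl_base + total_size + PAGE_SIZE;     /* becomes the TCS     */
	ssa       = encl_base + total_size + 2 * PAGE_SIZE; /* SSA, regular page   */

	/*
	 * On enclave entry %rbx holds the TCS address, so encl_dyn_entry's
	 * "lea -1(%rbx), %rax" computes the top of the stack page that
	 * immediately precedes the TCS.
	 */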
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/93e9c420dedf5f773ba6965c18245bc7d62aca83.1652137848.git.reinette.chatre@intel.com --- tools/testing/selftests/sgx/test_encl_bootstrap.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S index 82fb0dfcbd23..03ae0f57e29d 100644 --- a/tools/testing/selftests/sgx/test_encl_bootstrap.S +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -45,6 +45,12 @@ encl_entry: # TCS #2. By adding the value of encl_stack to it, we get # the absolute address for the stack. lea (encl_stack)(%rbx), %rax + jmp encl_entry_core +encl_dyn_entry: + # Entry point for dynamically created TCS page expected to follow + # its stack directly. + lea -1(%rbx), %rax +encl_entry_core: xchg %rsp, %rax push %rax -- cgit v1.2.3 From b564982fda13be6314e49f2344e7c422565e34d3 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:02 -0700 Subject: selftests/sgx: Introduce TCS initialization enclave operation The Thread Control Structure (TCS) contains meta-data used by the hardware to save and restore thread specific information when entering/exiting the enclave. A TCS can be added to an initialized enclave by first adding a new regular enclave page, initializing the content of the new page from within the enclave, and then changing that page's type to a TCS. Support the initialization of a TCS from within the enclave. The variable information that needs to be provided from outside the enclave is the address of the TCS, the address of the State Save Area (SSA), and the entry point that the thread should use to enter the enclave. With this information provided, all needed fields of a TCS can be initialized.
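From the caller's side this could look roughly like the sketch below (encl_get_entry() is the symbol lookup helper added later in this series; tcs, region_offset and run are illustrative placeholders):

	struct encl_op_init_tcs_page op = {
		.header.type = ENCL_OP_INIT_TCS_PAGE,
		.tcs_page    = (uint64_t)tcs,                /* linear address of new page */
		.ssa         = region_offset + 2 * PAGE_SIZE, /* OSSA is an enclave offset */
		.entry       = encl_get_entry(&encl, "encl_dyn_entry"),
	};

	/* Enclave fills OSSA, NSSA, OENTRY, and the FS/GS limits. */
	ENCL_CALL(&op, &run, true);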
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/bad6052056188bde753a54313da1ac8f1e29088a.1652137848.git.reinette.chatre@intel.com --- tools/testing/selftests/sgx/defines.h | 8 ++++++++ tools/testing/selftests/sgx/test_encl.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index b638eb98c80c..d8587c971941 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -26,6 +26,7 @@ enum encl_op_type { ENCL_OP_NOP, ENCL_OP_EACCEPT, ENCL_OP_EMODPE, + ENCL_OP_INIT_TCS_PAGE, ENCL_OP_MAX, }; @@ -68,4 +69,11 @@ struct encl_op_emodpe { uint64_t flags; }; +struct encl_op_init_tcs_page { + struct encl_op_header header; + uint64_t tcs_page; + uint64_t ssa; + uint64_t entry; +}; + #endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 5b6c65331527..c0d6397295e3 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -57,6 +57,35 @@ static void *memcpy(void *dest, const void *src, size_t n) return dest; } +static void *memset(void *dest, int c, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + ((char *)dest)[i] = c; + + return dest; +} + +static void do_encl_init_tcs_page(void *_op) +{ + struct encl_op_init_tcs_page *op = _op; + void *tcs = (void *)op->tcs_page; + uint32_t val_32; + + memset(tcs, 0, 16); /* STATE and FLAGS */ + memcpy(tcs + 16, &op->ssa, 8); /* OSSA */ + memset(tcs + 24, 0, 4); /* CSSA */ + val_32 = 1; + memcpy(tcs + 28, &val_32, 4); /* NSSA */ + memcpy(tcs + 32, &op->entry, 8); /* OENTRY */ + memset(tcs + 40, 0, 24); /* AEP, OFSBASE, OGSBASE */ + val_32 = 0xFFFFFFFF; + memcpy(tcs + 64, &val_32, 4); /* FSLIMIT */ + memcpy(tcs + 68, &val_32, 4); /* GSLIMIT */ + memset(tcs + 72, 0, 4024); /* Reserved */ +} + static void do_encl_op_put_to_buf(void *op) { struct encl_op_put_to_buf *op2 = op; @@ -100,6 +129,7 @@ void encl_body(void *rdi, void *rsi) do_encl_op_nop, do_encl_eaccept, do_encl_emodpe, + do_encl_init_tcs_page, }; struct encl_op_header *op = (struct encl_op_header *)rdi; -- cgit v1.2.3 From 33c5aac3bf32c3ef120ad6d2eb5c65ab64a5fec4 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 10 May 2022 11:09:03 -0700 Subject: selftests/sgx: Test complete changing of page type flow Support for changing an enclave page's type enables an initialized enclave to be expanded with support for more threads by changing the type of a regular enclave page to that of a Thread Control Structure (TCS). Additionally, being able to change a TCS or regular enclave page's type to be trimmed (SGX_PAGE_TYPE_TRIM) initiates the removal of the page from the enclave. Test changing page type to TCS as well as page removal flows in two phases: in the first phase, support for a new thread is dynamically added to an initialized enclave, and in the second phase the pages associated with the new thread are removed from the enclave. As an additional sanity check after the second phase, the page used as a TCS page during the first phase is added back as a regular page, and it is verified that it can be written to (which would not be possible if it were still a TCS page).
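In outline, the phase 2 removal is a three-step handshake between kernel and enclave (a sketch of the ioctl() sequence the test uses; encl_fd and region_offset are illustrative placeholders):

	struct sgx_enclave_modify_types modt = {
		.offset    = region_offset,      /* start of the three pages */
		.length    = 3 * PAGE_SIZE,
		.page_type = SGX_PAGE_TYPE_TRIM, /* kernel runs ENCLS[EMODT] */
	};
	ioctl(encl_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt);

	/*
	 * The enclave approves removal by running ENCLU[EACCEPT] on each
	 * page with SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED, after which
	 * the pages can actually be removed:
	 */
	struct sgx_enclave_remove_pages rm = {
		.offset = region_offset,
		.length = 3 * PAGE_SIZE,
	};
	ioctl(encl_fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &rm);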
Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/d05b48b00338683a94dcaef9f478540fc3d6d5f9.1652137848.git.reinette.chatre@intel.com --- tools/testing/selftests/sgx/load.c | 41 +++++ tools/testing/selftests/sgx/main.c | 343 +++++++++++++++++++++++++++++++++++++ tools/testing/selftests/sgx/main.h | 1 + 3 files changed, 385 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 006b464c8fc9..94bdeac1cf04 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -130,6 +130,47 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) return true; } +/* + * Parse the enclave code's symbol table to locate and return address of + * the provided symbol + */ +uint64_t encl_get_entry(struct encl *encl, const char *symbol) +{ + Elf64_Shdr *sections; + Elf64_Sym *symtab; + Elf64_Ehdr *ehdr; + char *sym_names; + int num_sym; + int i; + + ehdr = encl->bin; + sections = encl->bin + ehdr->e_shoff; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sections[i].sh_type == SHT_SYMTAB) { + symtab = (Elf64_Sym *)((char *)encl->bin + sections[i].sh_offset); + num_sym = sections[i].sh_size / sections[i].sh_entsize; + break; + } + } + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sections[i].sh_type == SHT_STRTAB) { + sym_names = (char *)encl->bin + sections[i].sh_offset; + break; + } + } + + for (i = 0; i < num_sym; i++) { + Elf64_Sym *sym = &symtab[i]; + + if (!strcmp(symbol, sym_names + sym->st_name)) + return (uint64_t)sym->st_value; + } + + return 0; +} + bool encl_load(const char *path, struct encl *encl, unsigned long heap_size) { const char device_path[] = "/dev/sgx_enclave"; diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 79c08e347112..8bf43646e0bb 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -1090,4 +1090,347 @@ TEST_F(enclave, augment_via_eaccept) munmap(addr, PAGE_SIZE); } +/* + * SGX2 page type modification test in two phases: + * Phase 1: + * Create a new TCS, consisting out of three new pages (stack page with regular + * page type, SSA page with regular page type, and TCS page with TCS page + * type) in an initialized enclave and run a simple workload within it. + * Phase 2: + * Remove the three pages added in phase 1, add a new regular page at the + * same address that previously hosted the TCS page and verify that it can + * be modified. + */ +TEST_F(enclave, tcs_create) +{ + struct encl_op_init_tcs_page init_tcs_page_op; + struct sgx_enclave_remove_pages remove_ioc; + struct encl_op_get_from_addr get_addr_op; + struct sgx_enclave_modify_types modt_ioc; + struct encl_op_put_to_addr put_addr_op; + struct encl_op_get_from_buf get_buf_op; + struct encl_op_put_to_buf put_buf_op; + void *addr, *tcs, *stack_end, *ssa; + struct encl_op_eaccept eaccept_op; + size_t total_size = 0; + uint64_t val_64; + int errno_save; + int ret, i; + + ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, + _metadata)); + + memset(&self->run, 0, sizeof(self->run)); + self->run.tcs = self->encl.encl_base; + + /* + * Hardware (SGX2) and kernel support is needed for this test. Start + * with check that test has a chance of succeeding. 
+ */ + memset(&modt_ioc, 0, sizeof(modt_ioc)); + ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc); + + if (ret == -1) { + if (errno == ENOTTY) + SKIP(return, + "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()"); + else if (errno == ENODEV) + SKIP(return, "System does not support SGX2"); + } + + /* + * Invalid parameters were provided during sanity check, + * expect command to fail. + */ + EXPECT_EQ(ret, -1); + + /* + * Add three regular pages via EAUG: one will be the TCS stack, one + * will be the TCS SSA, and one will be the new TCS. The stack and + * SSA will remain as regular pages, the TCS page will need its + * type changed after populated with needed data. + */ + for (i = 0; i < self->encl.nr_segments; i++) { + struct encl_segment *seg = &self->encl.segment_tbl[i]; + + total_size += seg->size; + } + + /* + * Actual enclave size is expected to be larger than the loaded + * test enclave since enclave size must be a power of 2 in bytes while + * test_encl does not consume it all. + */ + EXPECT_LT(total_size + 3 * PAGE_SIZE, self->encl.encl_size); + + /* + * mmap() three pages at end of existing enclave to be used for the + * three new pages. + */ + addr = mmap((void *)self->encl.encl_base + total_size, 3 * PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, + self->encl.fd, 0); + EXPECT_NE(addr, MAP_FAILED); + + self->run.exception_vector = 0; + self->run.exception_error_code = 0; + self->run.exception_addr = 0; + + stack_end = (void *)self->encl.encl_base + total_size; + tcs = (void *)self->encl.encl_base + total_size + PAGE_SIZE; + ssa = (void *)self->encl.encl_base + total_size + 2 * PAGE_SIZE; + + /* + * Run EACCEPT on each new page to trigger the + * EACCEPT->(#PF)->EAUG->EACCEPT(again without a #PF) flow. + */ + + eaccept_op.epc_addr = (unsigned long)stack_end; + eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING; + eaccept_op.ret = 0; + eaccept_op.header.type = ENCL_OP_EACCEPT; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + if (self->run.exception_vector == 14 && + self->run.exception_error_code == 4 && + self->run.exception_addr == (unsigned long)stack_end) { + munmap(addr, 3 * PAGE_SIZE); + SKIP(return, "Kernel does not support adding pages to initialized enclave"); + } + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + eaccept_op.epc_addr = (unsigned long)ssa; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + eaccept_op.epc_addr = (unsigned long)tcs; + + EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0); + + EXPECT_EEXIT(&self->run); + EXPECT_EQ(self->run.exception_vector, 0); + EXPECT_EQ(self->run.exception_error_code, 0); + EXPECT_EQ(self->run.exception_addr, 0); + EXPECT_EQ(eaccept_op.ret, 0); + + /* + * Three new pages added to enclave. Now populate the TCS page with + * needed data. This should be done from within enclave. Provide + * the function that will do the actual data population with needed + * data. + */ + + /* + * New TCS will use the "encl_dyn_entry" entrypoint that expects + * stack to begin in page before TCS page. 
+	 */
+	val_64 = encl_get_entry(&self->encl, "encl_dyn_entry");
+	EXPECT_NE(val_64, 0);
+
+	init_tcs_page_op.tcs_page = (unsigned long)tcs;
+	init_tcs_page_op.ssa = (unsigned long)total_size + 2 * PAGE_SIZE;
+	init_tcs_page_op.entry = val_64;
+	init_tcs_page_op.header.type = ENCL_OP_INIT_TCS_PAGE;
+
+	EXPECT_EQ(ENCL_CALL(&init_tcs_page_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/* Change TCS page type to TCS. */
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+
+	modt_ioc.offset = total_size + PAGE_SIZE;
+	modt_ioc.length = PAGE_SIZE;
+	modt_ioc.page_type = SGX_PAGE_TYPE_TCS;
+
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(modt_ioc.result, 0);
+	EXPECT_EQ(modt_ioc.count, 4096);
+
+	/* EACCEPT new TCS page from enclave. */
+	eaccept_op.epc_addr = (unsigned long)tcs;
+	eaccept_op.flags = SGX_SECINFO_TCS | SGX_SECINFO_MODIFIED;
+	eaccept_op.ret = 0;
+	eaccept_op.header.type = ENCL_OP_EACCEPT;
+
+	EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+	EXPECT_EQ(eaccept_op.ret, 0);
+
+	/* Run workload from new TCS. */
+	self->run.tcs = (unsigned long)tcs;
+
+	/*
+	 * Simple workload to write to data buffer and read value back.
+	 */
+	put_buf_op.header.type = ENCL_OP_PUT_TO_BUFFER;
+	put_buf_op.value = MAGIC;
+
+	EXPECT_EQ(ENCL_CALL(&put_buf_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	get_buf_op.header.type = ENCL_OP_GET_FROM_BUFFER;
+	get_buf_op.value = 0;
+
+	EXPECT_EQ(ENCL_CALL(&get_buf_op, &self->run, true), 0);
+
+	EXPECT_EQ(get_buf_op.value, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/*
+	 * Phase 2 of test:
+	 * Remove pages associated with new TCS, create a regular page
+	 * where the TCS page used to be and verify it can be used as a
+	 * regular page.
+	 */
+
+	/* Start page removal by requesting change of page type to PT_TRIM. */
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+
+	modt_ioc.offset = total_size;
+	modt_ioc.length = 3 * PAGE_SIZE;
+	modt_ioc.page_type = SGX_PAGE_TYPE_TRIM;
+
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(modt_ioc.result, 0);
+	EXPECT_EQ(modt_ioc.count, 3 * PAGE_SIZE);
+
+	/*
+	 * Enter enclave via TCS #1 and approve page removal by sending
+	 * EACCEPT for each of the three removed pages.
+	 */
+	self->run.tcs = self->encl.encl_base;
+
+	eaccept_op.epc_addr = (unsigned long)stack_end;
+	eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED;
+	eaccept_op.ret = 0;
+	eaccept_op.header.type = ENCL_OP_EACCEPT;
+
+	EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+	EXPECT_EQ(eaccept_op.ret, 0);
+
+	eaccept_op.epc_addr = (unsigned long)tcs;
+	eaccept_op.ret = 0;
+
+	EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+	EXPECT_EQ(eaccept_op.ret, 0);
+
+	eaccept_op.epc_addr = (unsigned long)ssa;
+	eaccept_op.ret = 0;
+
+	EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+	EXPECT_EQ(eaccept_op.ret, 0);
+
+	/* Send final ioctl() to complete page removal. */
+	memset(&remove_ioc, 0, sizeof(remove_ioc));
+
+	remove_ioc.offset = total_size;
+	remove_ioc.length = 3 * PAGE_SIZE;
+
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(remove_ioc.count, 3 * PAGE_SIZE);
+
+	/*
+	 * Enter enclave via TCS #1 and access the location where TCS #3 was
+	 * to trigger a dynamic add of a regular page at that location.
+	 */
+	eaccept_op.epc_addr = (unsigned long)tcs;
+	eaccept_op.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_REG | SGX_SECINFO_PENDING;
+	eaccept_op.ret = 0;
+	eaccept_op.header.type = ENCL_OP_EACCEPT;
+
+	EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+	EXPECT_EQ(eaccept_op.ret, 0);
+
+	/*
+	 * The new page should be accessible from within the enclave - write
+	 * to it.
+	 */
+	put_addr_op.value = MAGIC;
+	put_addr_op.addr = (unsigned long)tcs;
+	put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/*
+	 * Read memory from the newly added page that was just written to,
+	 * confirming that the data previously written (MAGIC) is present.
+	 */
+	get_addr_op.value = 0;
+	get_addr_op.addr = (unsigned long)tcs;
+	get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	EXPECT_EQ(get_addr_op.value, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	munmap(addr, 3 * PAGE_SIZE);
+}
+
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h
index b45c52ec7ab3..fc585be97e2f 100644
--- a/tools/testing/selftests/sgx/main.h
+++ b/tools/testing/selftests/sgx/main.h
@@ -38,6 +38,7 @@ void encl_delete(struct encl *ctx);
 bool encl_load(const char *path, struct encl *encl, unsigned long heap_size);
 bool encl_measure(struct encl *encl);
 bool encl_build(struct encl *encl);
+uint64_t encl_get_entry(struct encl *encl, const char *symbol);
 
 int sgx_enter_enclave(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9,
 		      struct sgx_enclave_run *run);
--
cgit v1.2.3


From 50b822e4b785948ed663c89c84e124fc8c099c9b Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 10 May 2022 11:09:04 -0700
Subject: selftests/sgx: Test faulty enclave behavior

Removing a page from an initialized enclave involves three steps: first
the user requests changing the page type to SGX_PAGE_TYPE_TRIM via an
ioctl(), on success the ENCLU[EACCEPT] instruction needs to be run from
within the enclave to accept the page removal, finally the user requests
page removal to be completed via an ioctl(). Only after acceptance
(ENCLU[EACCEPT]) from within the enclave can the kernel remove the page
from a running enclave.

Test the behavior when the user's request to change the page type
succeeds, but the ENCLU[EACCEPT] instruction is not run before the
ioctl() requesting page removal is run. This should not be permitted.

Signed-off-by: Reinette Chatre
Signed-off-by: Dave Hansen
Acked-by: Jarkko Sakkinen
Link: https://lkml.kernel.org/r/fa5da30ebac108b7517194c3038b52995602b996.1652137848.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 114 +++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 8bf43646e0bb..3a82bae915d1 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -1433,4 +1433,118 @@ TEST_F(enclave, tcs_create)
 	munmap(addr, 3 * PAGE_SIZE);
 }
 
+/*
+ * Ensure sane behavior if the user requests page removal, does not run
+ * EACCEPT from within the enclave, but still attempts to finalize the page
+ * removal with the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl(). The latter should
+ * fail because the removal was not EACCEPTed from within the enclave.
+ */
+TEST_F(enclave, remove_added_page_no_eaccept)
+{
+	struct sgx_enclave_remove_pages remove_ioc;
+	struct encl_op_get_from_addr get_addr_op;
+	struct sgx_enclave_modify_types modt_ioc;
+	struct encl_op_put_to_addr put_addr_op;
+	unsigned long data_start;
+	int ret, errno_save;
+
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	/*
+	 * Hardware (SGX2) and kernel support is needed for this test. Start
+	 * by checking that the test has a chance of succeeding.
+	 */
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+
+	if (ret == -1) {
+		if (errno == ENOTTY)
+			SKIP(return,
+			     "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()");
+		else if (errno == ENODEV)
+			SKIP(return, "System does not support SGX2");
+	}
+
+	/*
+	 * Invalid parameters were provided during sanity check,
+	 * expect command to fail.
+	 */
+	EXPECT_EQ(ret, -1);
+
+	/*
+	 * Page that will be removed is the second data page in the .data
+	 * segment. This forms part of the local encl_buffer within the
+	 * enclave.
+	 */
+	data_start = self->encl.encl_base +
+		     encl_get_data_offset(&self->encl) + PAGE_SIZE;
+
+	/*
+	 * Sanity check that page at @data_start is writable before
+	 * removing it.
+	 *
+	 * Start by writing MAGIC to test page.
+	 */
+	put_addr_op.value = MAGIC;
+	put_addr_op.addr = data_start;
+	put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/*
+	 * Read memory that was just written to, confirming that data
+	 * previously written (MAGIC) is present.
+	 */
+	get_addr_op.value = 0;
+	get_addr_op.addr = data_start;
+	get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	EXPECT_EQ(get_addr_op.value, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/* Start page removal by requesting change of page type to PT_TRIM */
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+
+	modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE;
+	modt_ioc.length = PAGE_SIZE;
+	modt_ioc.page_type = SGX_PAGE_TYPE_TRIM;
+
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(modt_ioc.result, 0);
+	EXPECT_EQ(modt_ioc.count, 4096);
+
+	/* Skip EACCEPT */
+
+	/* Send final ioctl() to complete page removal */
+	memset(&remove_ioc, 0, sizeof(remove_ioc));
+
+	remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE;
+	remove_ioc.length = PAGE_SIZE;
+
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	/* Operation not permitted since EACCEPT was omitted. */
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno_save, EPERM);
+	EXPECT_EQ(remove_ioc.count, 0);
+}
+
 TEST_HARNESS_MAIN
--
cgit v1.2.3


From 35c7e6dacb038e9311e98901d56bb1abd56f9ae0 Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 10 May 2022 11:09:05 -0700
Subject: selftests/sgx: Test invalid access to removed enclave page

Removing a page from an initialized enclave involves three steps:
(1) the user requests changing the page type to SGX_PAGE_TYPE_TRIM
via the SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(), (2) on success the
ENCLU[EACCEPT] instruction is run from within the enclave to accept
the page removal, (3) the user initiates the actual removal of the
page via the SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl().

Test two possible invalid accesses during the page removal flow:

* Test the behavior when a request to remove the page by changing its
  type to SGX_PAGE_TYPE_TRIM completes successfully but, instead of
  executing ENCLU[EACCEPT] from within the enclave, the enclave
  attempts to read from the page.
  Even though the page is accessible from the page table entries, its
  type is SGX_PAGE_TYPE_TRIM and thus not accessible according to SGX.
  The expected behavior is a page fault with the SGX flag set in the
  error code.

* Test the behavior when the page type is changed successfully and
  ENCLU[EACCEPT] was run from within the enclave. The final ioctl(),
  SGX_IOC_ENCLAVE_REMOVE_PAGES, is omitted and replaced with an
  attempt to access the page. Even though the page is accessible from
  the page table entries, its type is SGX_PAGE_TYPE_TRIM and thus not
  accessible according to SGX. The expected behavior is a page fault
  with the SGX flag set in the error code.

Signed-off-by: Reinette Chatre
Signed-off-by: Dave Hansen
Acked-by: Jarkko Sakkinen
Link: https://lkml.kernel.org/r/189a86c25d6d62da7cfdd08ee97abc1a06fcc179.1652137848.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 243 +++++++++++++++++++++++++++++++++++++
 1 file changed, 243 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 3a82bae915d1..2c69045253b2 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -1547,4 +1547,247 @@ TEST_F(enclave, remove_added_page_no_eaccept)
 	EXPECT_EQ(remove_ioc.count, 0);
 }
 
+/*
+ * Request enclave page removal, but instead of correctly following up
+ * with EACCEPT, attempt to read the page from within the enclave.
+ */
+TEST_F(enclave, remove_added_page_invalid_access)
+{
+	struct encl_op_get_from_addr get_addr_op;
+	struct encl_op_put_to_addr put_addr_op;
+	struct sgx_enclave_modify_types ioc;
+	unsigned long data_start;
+	int ret, errno_save;
+
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	/*
+	 * Hardware (SGX2) and kernel support is needed for this test. Start
+	 * by checking that the test has a chance of succeeding.
+	 */
+	memset(&ioc, 0, sizeof(ioc));
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc);
+
+	if (ret == -1) {
+		if (errno == ENOTTY)
+			SKIP(return,
+			     "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()");
+		else if (errno == ENODEV)
+			SKIP(return, "System does not support SGX2");
+	}
+
+	/*
+	 * Invalid parameters were provided during sanity check,
+	 * expect command to fail.
+	 */
+	EXPECT_EQ(ret, -1);
+
+	/*
+	 * Page that will be removed is the second data page in the .data
+	 * segment. This forms part of the local encl_buffer within the
+	 * enclave.
+	 */
+	data_start = self->encl.encl_base +
+		     encl_get_data_offset(&self->encl) + PAGE_SIZE;
+
+	/*
+	 * Sanity check that page at @data_start is writable before
+	 * removing it.
+	 *
+	 * Start by writing MAGIC to test page.
+	 */
+	put_addr_op.value = MAGIC;
+	put_addr_op.addr = data_start;
+	put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/*
+	 * Read memory that was just written to, confirming that data
+	 * previously written (MAGIC) is present.
+	 */
+	get_addr_op.value = 0;
+	get_addr_op.addr = data_start;
+	get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	EXPECT_EQ(get_addr_op.value, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/* Start page removal by requesting change of page type to PT_TRIM. */
+	memset(&ioc, 0, sizeof(ioc));
+
+	ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE;
+	ioc.length = PAGE_SIZE;
+	ioc.page_type = SGX_PAGE_TYPE_TRIM;
+
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(ioc.result, 0);
+	EXPECT_EQ(ioc.count, 4096);
+
+	/*
+	 * Read from page that was just removed.
+	 */
+	get_addr_op.value = 0;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	/*
+	 * From the kernel's perspective the page is present, but according
+	 * to SGX the page should not be accessible, so a #PF with the SGX
+	 * bit set is expected.
+	 */
+
+	EXPECT_EQ(self->run.function, ERESUME);
+	EXPECT_EQ(self->run.exception_vector, 14);
+	EXPECT_EQ(self->run.exception_error_code, 0x8005);
+	EXPECT_EQ(self->run.exception_addr, data_start);
+}
+
+/*
+ * Request enclave page removal and correctly follow up with EACCEPT,
+ * but skip the final removal ioctl(); instead, attempt to read the
+ * removed page from within the enclave.
+ */
+TEST_F(enclave, remove_added_page_invalid_access_after_eaccept)
+{
+	struct encl_op_get_from_addr get_addr_op;
+	struct encl_op_put_to_addr put_addr_op;
+	struct sgx_enclave_modify_types ioc;
+	struct encl_op_eaccept eaccept_op;
+	unsigned long data_start;
+	int ret, errno_save;
+
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	/*
+	 * Hardware (SGX2) and kernel support is needed for this test. Start
+	 * by checking that the test has a chance of succeeding.
+	 */
+	memset(&ioc, 0, sizeof(ioc));
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc);
+
+	if (ret == -1) {
+		if (errno == ENOTTY)
+			SKIP(return,
+			     "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()");
+		else if (errno == ENODEV)
+			SKIP(return, "System does not support SGX2");
+	}
+
+	/*
+	 * Invalid parameters were provided during sanity check,
+	 * expect command to fail.
+	 */
+	EXPECT_EQ(ret, -1);
+
+	/*
+	 * Page that will be removed is the second data page in the .data
+	 * segment. This forms part of the local encl_buffer within the
+	 * enclave.
+	 */
+	data_start = self->encl.encl_base +
+		     encl_get_data_offset(&self->encl) + PAGE_SIZE;
+
+	/*
+	 * Sanity check that page at @data_start is writable before
+	 * removing it.
+	 *
+	 * Start by writing MAGIC to test page.
+	 */
+	put_addr_op.value = MAGIC;
+	put_addr_op.addr = data_start;
+	put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/*
+	 * Read memory that was just written to, confirming that data
+	 * previously written (MAGIC) is present.
+	 */
+	get_addr_op.value = 0;
+	get_addr_op.addr = data_start;
+	get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	EXPECT_EQ(get_addr_op.value, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/* Start page removal by requesting change of page type to PT_TRIM. */
+	memset(&ioc, 0, sizeof(ioc));
+
+	ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE;
+	ioc.length = PAGE_SIZE;
+	ioc.page_type = SGX_PAGE_TYPE_TRIM;
+
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(ioc.result, 0);
+	EXPECT_EQ(ioc.count, 4096);
+
+	eaccept_op.epc_addr = (unsigned long)data_start;
+	eaccept_op.ret = 0;
+	eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED;
+	eaccept_op.header.type = ENCL_OP_EACCEPT;
+
+	EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+	EXPECT_EQ(eaccept_op.ret, 0);
+
+	/* Skip ioctl() to remove page. */
+
+	/*
+	 * Read from page that was just removed.
+	 */
+	get_addr_op.value = 0;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	/*
+	 * From the kernel's perspective the page is present, but according
+	 * to SGX the page should not be accessible, so a #PF with the SGX
+	 * bit set is expected.
+	 */
+
+	EXPECT_EQ(self->run.function, ERESUME);
+	EXPECT_EQ(self->run.exception_vector, 14);
+	EXPECT_EQ(self->run.exception_error_code, 0x8005);
+	EXPECT_EQ(self->run.exception_addr, data_start);
+}
+
 TEST_HARNESS_MAIN
--
cgit v1.2.3


From 08ceab2c37d32f422f8d98540656ee5a416ba729 Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 10 May 2022 11:09:06 -0700
Subject: selftests/sgx: Test reclaiming of untouched page

Removing a page from an initialized enclave involves three steps:
(1) the user requests changing the page type to PT_TRIM via the
    SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(),
(2) on success the ENCLU[EACCEPT] instruction is run from within the
    enclave to accept the page removal,
(3) the user initiates the actual removal of the page via the
    SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl().

Remove a page that has never been accessed. This means that when the
first ioctl() requesting page removal arrives, there will be no page
table entry, yet a valid page table entry needs to exist for the
ENCLU[EACCEPT] function to succeed. In this test it is verified that a
page table entry can still be installed for a page that is in the
process of being removed.
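For orientation, the userspace half of this three-step flow condenses to
roughly the sketch below. This is an editorial illustration rather than part
of the patch: remove_one_page() and run_eaccept_in_enclave() are hypothetical
helpers (the selftests perform the EACCEPT step via ENCL_CALL() with an
ENCL_OP_EACCEPT operation), while the ioctl structures and commands are the
real ones exercised by the test.

/*
 * Illustrative sketch only: condensed userspace side of the three-step
 * page removal flow. run_eaccept_in_enclave() is a hypothetical stand-in
 * for entering the enclave to run ENCLU[EACCEPT].
 */
static int remove_one_page(int encl_fd, unsigned long offset)
{
	struct sgx_enclave_modify_types modt = {
		.offset = offset,
		.length = PAGE_SIZE,
		.page_type = SGX_PAGE_TYPE_TRIM,
	};
	struct sgx_enclave_remove_pages remove = {
		.offset = offset,
		.length = PAGE_SIZE,
	};

	/* Step (1): request the page type change to PT_TRIM. */
	if (ioctl(encl_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt) || modt.result)
		return -1;

	/* Step (2): accept the removal from within the enclave. */
	if (run_eaccept_in_enclave(offset))
		return -1;

	/* Step (3): complete the removal. */
	return ioctl(encl_fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove);
}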
Suggested-by: Haitao Huang
Signed-off-by: Reinette Chatre
Signed-off-by: Dave Hansen
Acked-by: Jarkko Sakkinen
Link: https://lkml.kernel.org/r/45e1b2a2fcd8c14597d04e40af5d8a9c1c5b017e.1652137848.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 80 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 2c69045253b2..ba16671aef79 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -1790,4 +1790,84 @@ TEST_F(enclave, remove_added_page_invalid_access_after_eaccept)
 	EXPECT_EQ(self->run.exception_addr, data_start);
 }
 
+TEST_F(enclave, remove_untouched_page)
+{
+	struct sgx_enclave_remove_pages remove_ioc;
+	struct sgx_enclave_modify_types modt_ioc;
+	struct encl_op_eaccept eaccept_op;
+	unsigned long data_start;
+	int ret, errno_save;
+
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	/*
+	 * Hardware (SGX2) and kernel support is needed for this test. Start
+	 * by checking that the test has a chance of succeeding.
+	 */
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+
+	if (ret == -1) {
+		if (errno == ENOTTY)
+			SKIP(return,
+			     "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()");
+		else if (errno == ENODEV)
+			SKIP(return, "System does not support SGX2");
+	}
+
+	/*
+	 * Invalid parameters were provided during sanity check,
+	 * expect command to fail.
+	 */
+	EXPECT_EQ(ret, -1);
+
+	/* SGX2 is supported by kernel and hardware, test can proceed. */
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	data_start = self->encl.encl_base +
+		     encl_get_data_offset(&self->encl) + PAGE_SIZE;
+
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+
+	modt_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE;
+	modt_ioc.length = PAGE_SIZE;
+	modt_ioc.page_type = SGX_PAGE_TYPE_TRIM;
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(modt_ioc.result, 0);
+	EXPECT_EQ(modt_ioc.count, 4096);
+
+	/*
+	 * Enter enclave via TCS #1 and approve page removal by sending
+	 * EACCEPT for the removed page.
+	 */
+
+	eaccept_op.epc_addr = data_start;
+	eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED;
+	eaccept_op.ret = 0;
+	eaccept_op.header.type = ENCL_OP_EACCEPT;
+
+	EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+	EXPECT_EQ(eaccept_op.ret, 0);
+
+	memset(&remove_ioc, 0, sizeof(remove_ioc));
+
+	remove_ioc.offset = encl_get_data_offset(&self->encl) + PAGE_SIZE;
+	remove_ioc.length = PAGE_SIZE;
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(remove_ioc.count, 4096);
+}
+
 TEST_HARNESS_MAIN
--
cgit v1.2.3


From 6507cce561b43b071999502103804e3dc1478e60 Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 10 May 2022 11:09:07 -0700
Subject: selftests/sgx: Page removal stress test

Create an enclave with an additional heap that consumes all physical
SGX memory and then remove it.
Depending on the available SGX memory, this test could take a
significant time to run (several minutes) as it (1) creates the
enclave, (2) changes the type of every page to be trimmed, (3) enters
the enclave once per page to run EACCEPT, before (4) the pages are
finally removed.

Signed-off-by: Reinette Chatre
Signed-off-by: Dave Hansen
Acked-by: Jarkko Sakkinen
Link: https://lkml.kernel.org/r/e7c6aa2ab30cb1c41e52b776958409c06970d168.1652137848.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 120 +++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index ba16671aef79..9820b3809c69 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -378,7 +378,127 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed)
 	EXPECT_EQ(get_op.value, MAGIC);
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
+}
+
+TEST_F_TIMEOUT(enclave, unclobbered_vdso_oversubscribed_remove, 900)
+{
+	struct sgx_enclave_remove_pages remove_ioc;
+	struct sgx_enclave_modify_types modt_ioc;
+	struct encl_op_get_from_buf get_op;
+	struct encl_op_eaccept eaccept_op;
+	struct encl_op_put_to_buf put_op;
+	struct encl_segment *heap;
+	unsigned long total_mem;
+	int ret, errno_save;
+	unsigned long addr;
+	unsigned long i;
+
+	/*
+	 * Create enclave with additional heap that is as big as all
+	 * available physical SGX memory.
+	 */
+	total_mem = get_total_epc_mem();
+	ASSERT_NE(total_mem, 0);
+	TH_LOG("Creating an enclave with %lu bytes heap may take a while ...",
+	       total_mem);
+	ASSERT_TRUE(setup_test_encl(total_mem, &self->encl, _metadata));
+
+	/*
+	 * Hardware (SGX2) and kernel support is needed for this test. Start
+	 * by checking that the test has a chance of succeeding.
+	 */
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+
+	if (ret == -1) {
+		if (errno == ENOTTY)
+			SKIP(return,
+			     "Kernel does not support SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl()");
+		else if (errno == ENODEV)
+			SKIP(return, "System does not support SGX2");
+	}
+
+	/*
+	 * Invalid parameters were provided during sanity check,
+	 * expect command to fail.
+	 */
+	EXPECT_EQ(ret, -1);
+
+	/* SGX2 is supported by kernel and hardware, test can proceed. */
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	heap = &self->encl.segment_tbl[self->encl.nr_segments - 1];
+
+	put_op.header.type = ENCL_OP_PUT_TO_BUFFER;
+	put_op.value = MAGIC;
+
+	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.user_data, 0);
+
+	get_op.header.type = ENCL_OP_GET_FROM_BUFFER;
+	get_op.value = 0;
+
+	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0);
+
+	EXPECT_EQ(get_op.value, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.user_data, 0);
+
+	/* Trim entire heap. */
+	memset(&modt_ioc, 0, sizeof(modt_ioc));
+
+	modt_ioc.offset = heap->offset;
+	modt_ioc.length = heap->size;
+	modt_ioc.page_type = SGX_PAGE_TYPE_TRIM;
+
+	TH_LOG("Changing type of %zd bytes to trimmed may take a while ...",
+	       heap->size);
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(modt_ioc.result, 0);
+	EXPECT_EQ(modt_ioc.count, heap->size);
+
+	/* EACCEPT all removed pages. */
+	addr = self->encl.encl_base + heap->offset;
+
+	eaccept_op.flags = SGX_SECINFO_TRIM | SGX_SECINFO_MODIFIED;
+	eaccept_op.header.type = ENCL_OP_EACCEPT;
+
+	TH_LOG("Entering enclave to run EACCEPT for each page of %zd bytes may take a while ...",
+	       heap->size);
+	for (i = 0; i < heap->size; i += 4096) {
+		eaccept_op.epc_addr = addr + i;
+		eaccept_op.ret = 0;
+
+		EXPECT_EQ(ENCL_CALL(&eaccept_op, &self->run, true), 0);
+
+		EXPECT_EQ(self->run.exception_vector, 0);
+		EXPECT_EQ(self->run.exception_error_code, 0);
+		EXPECT_EQ(self->run.exception_addr, 0);
+		ASSERT_EQ(eaccept_op.ret, 0);
+		ASSERT_EQ(self->run.function, EEXIT);
+	}
+
+	/* Complete page removal. */
+	memset(&remove_ioc, 0, sizeof(remove_ioc));
+
+	remove_ioc.offset = heap->offset;
+	remove_ioc.length = heap->size;
+
+	TH_LOG("Removing %zd bytes from enclave may take a while ...",
+	       heap->size);
+	ret = ioctl(self->encl.fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove_ioc);
+	errno_save = ret == -1 ? errno : 0;
+
+	EXPECT_EQ(ret, 0);
+	EXPECT_EQ(errno_save, 0);
+	EXPECT_EQ(remove_ioc.count, heap->size);
 }
 
 TEST_F(enclave, clobbered_vdso)
--
cgit v1.2.3


From d6a21f2d73258f2a4cd2e7806f5755ee73fddced Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab
Date: Sun, 26 Jun 2022 10:11:01 +0100
Subject: objtool: update objtool.txt references

Changeset a8e35fece49b ("objtool: Update documentation") renamed:
	tools/objtool/Documentation/stack-validation.txt
to:
	tools/objtool/Documentation/objtool.txt.

Update the cross-references accordingly.

Fixes: a8e35fece49b ("objtool: Update documentation")
Signed-off-by: Mauro Carvalho Chehab
Link: https://lore.kernel.org/r/ec285ece6348a5be191aebe45f78d06b3319056b.1656234456.git.mchehab@kernel.org
Signed-off-by: Jonathan Corbet
---
 Documentation/x86/orc-unwinder.rst | 2 +-
 include/linux/objtool.h            | 2 +-
 lib/Kconfig.debug                  | 2 +-
 tools/include/linux/objtool.h      | 2 +-
 tools/objtool/check.c              | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/Documentation/x86/orc-unwinder.rst b/Documentation/x86/orc-unwinder.rst
index 9a66a88be765..cdb257015bd9 100644
--- a/Documentation/x86/orc-unwinder.rst
+++ b/Documentation/x86/orc-unwinder.rst
@@ -140,7 +140,7 @@ Unwinder implementation details
 
 Objtool generates the ORC data by integrating with the compile-time
 stack metadata validation feature, which is described in detail in
-tools/objtool/Documentation/stack-validation.txt. After analyzing all
+tools/objtool/Documentation/objtool.txt. After analyzing all
 the code paths of a .o file, it creates an array of orc_entry structs,
 and a parallel array of instruction addresses associated with those
 structs, and writes them to the .orc_unwind and .orc_unwind_ip sections
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 6491fa8fba6d..6f89471005ee 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -62,7 +62,7 @@ struct unwind_hint {
  * It should only be used in special cases where you're 100% sure it won't
  * affect the reliability of frame pointers and kernel stack traces.
  *
- * For more information, see tools/objtool/Documentation/stack-validation.txt.
+ * For more information, see tools/objtool/Documentation/objtool.txt.
  */
 #define STACK_FRAME_NON_STANDARD(func) \
 	static void __used __section(".discard.func_stack_frame_non_standard") \
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2e24db4bff19..79a71eb96111 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -498,7 +498,7 @@ config STACK_VALIDATION
	  runtime stack traces are more reliable.
	  For more information, see
-	  tools/objtool/Documentation/stack-validation.txt.
+	  tools/objtool/Documentation/objtool.txt.
 
 config NOINSTR_VALIDATION
	bool
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 6491fa8fba6d..6f89471005ee 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -62,7 +62,7 @@ struct unwind_hint {
  * It should only be used in special cases where you're 100% sure it won't
  * affect the reliability of frame pointers and kernel stack traces.
  *
- * For more information, see tools/objtool/Documentation/stack-validation.txt.
+ * For more information, see tools/objtool/Documentation/objtool.txt.
  */
 #define STACK_FRAME_NON_STANDARD(func) \
 	static void __used __section(".discard.func_stack_frame_non_standard") \
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 864bb9dd3584..970844ceecdc 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3190,7 +3190,7 @@ static struct instruction *next_insn_to_validate(struct objtool_file *file,
  * Follow the branch starting at the given instruction, and recursively follow
  * any other branches (jumps). Meanwhile, track the frame pointer state at
  * each instruction and validate all the rules described in
- * tools/objtool/Documentation/stack-validation.txt.
+ * tools/objtool/Documentation/objtool.txt.
  */
 static int validate_branch(struct objtool_file *file, struct symbol *func,
			   struct instruction *insn, struct insn_state state)
--
cgit v1.2.3


From 76f0d6f581693fbfd1be82aadc28fb52e33000fd Mon Sep 17 00:00:00 2001
From: Daniel Latypov
Date: Mon, 16 May 2022 12:47:28 -0700
Subject: kunit: tool: drop unused load_config argument

It's always set to true except in one test case, and in that test case
it can safely be set to true anyway.
Signed-off-by: Daniel Latypov
Reviewed-by: David Gow
Reviewed-by: Brendan Higgins
Signed-off-by: Shuah Khan
---
 tools/testing/kunit/kunit_kernel.py    | 4 ----
 tools/testing/kunit/kunit_tool_test.py | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 3539efaf99ba..8bc8305ba817 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -219,7 +219,6 @@ class LinuxSourceTree:
 	def __init__(
 	      self,
 	      build_dir: str,
-	      load_config=True,
 	      kunitconfig_path='',
 	      kconfig_add: Optional[List[str]]=None,
 	      arch=None,
@@ -233,9 +232,6 @@ class LinuxSourceTree:
 		self._arch = 'um' if arch is None else arch
 		self._ops = get_source_tree_ops(self._arch, cross_compile)
 
-		if not load_config:
-			return
-
 		if kunitconfig_path:
 			if os.path.isdir(kunitconfig_path):
 				kunitconfig_path = os.path.join(kunitconfig_path, KUNITCONFIG_PATH)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 25a2eb3bf114..b9158017ece6 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -393,7 +393,7 @@ class LinuxSourceTreeTest(unittest.TestCase):
 			return subprocess.Popen(['echo "hi\nbye"'], shell=True, text=True, stdout=subprocess.PIPE)
 
 		with tempfile.TemporaryDirectory('') as build_dir:
-			tree = kunit_kernel.LinuxSourceTree(build_dir, load_config=False)
+			tree = kunit_kernel.LinuxSourceTree(build_dir)
 			mock.patch.object(tree._ops, 'start', side_effect=fake_start).start()
 
 			with self.assertRaises(ValueError):
--
cgit v1.2.3


From 8a04930f2bb0047de4ae18af12e5731084bd555c Mon Sep 17 00:00:00 2001
From: Daniel Latypov
Date: Mon, 16 May 2022 12:47:29 -0700
Subject: kunit: tool: redo how we construct and mock LinuxSourceTree

Our main function currently has an optional `linux` argument which is
used by our unit tests to inject a mock. We currently have the same
code copy-pasted several times to do

  if not linux:
    linux = MakeRealInstance(cli_args.foo, cli_args.bar, ...)

But in python, dependency injection isn't necessary or idiomatic when
we can just use mock.patch() to mock things out.

This change
1. adds a helper to create a LinuxSourceTree from the cli_args
2. drops the `linux` parameter in favor of mocking the __init__ func.
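As an editorial illustration of the resulting idiom (condensed from the
test changes below, so the class and attribute names are the tool's own;
this is a sketch, not part of the patch):

  # Patch the class itself so every LinuxSourceTree() constructed inside
  # kunit.main() becomes a mock, instead of threading a `linux` parameter
  # through main().
  import unittest
  from unittest import mock

  import kunit
  import kunit_kernel

  class ExampleMainTest(unittest.TestCase):
      def setUp(self):
          self.mock_linux_init = mock.patch.object(kunit_kernel, 'LinuxSourceTree').start()
          self.linux_source_mock = self.mock_linux_init.return_value
          self.addCleanup(mock.patch.stopall)

      def test_config(self):
          kunit.main(['config'])  # no `linux=...` injection needed
          self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)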
Signed-off-by: Daniel Latypov
Reviewed-by: David Gow
Reviewed-by: Brendan Higgins
Signed-off-by: Shuah Khan
---
 tools/testing/kunit/kunit.py           |  49 +++++-----------
 tools/testing/kunit/kunit_tool_test.py | 103 ++++++++++++++++-----------------
 2 files changed, 65 insertions(+), 87 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 13bd72e47da8..8a90d80ee66e 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -365,7 +365,18 @@ def add_parse_opts(parser) -> None:
 			    'filename is specified',
 			    type=str, const='stdout', default=None, metavar='FILE')
 
-def main(argv, linux=None):
+
+def tree_from_args(cli_args: argparse.Namespace) -> kunit_kernel.LinuxSourceTree:
+	"""Returns a LinuxSourceTree based on the user's arguments."""
+	return kunit_kernel.LinuxSourceTree(cli_args.build_dir,
+			kunitconfig_path=cli_args.kunitconfig,
+			kconfig_add=cli_args.kconfig_add,
+			arch=cli_args.arch,
+			cross_compile=cli_args.cross_compile,
+			qemu_config_path=cli_args.qemu_config)
+
+
+def main(argv):
 	parser = argparse.ArgumentParser(
 			description='Helps writing and running KUnit tests.')
 	subparser = parser.add_subparsers(dest='subcommand')
@@ -412,14 +423,7 @@ def main(argv):
 		if not os.path.exists(cli_args.build_dir):
 			os.mkdir(cli_args.build_dir)
 
-		if not linux:
-			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
-					kunitconfig_path=cli_args.kunitconfig,
-					kconfig_add=cli_args.kconfig_add,
-					arch=cli_args.arch,
-					cross_compile=cli_args.cross_compile,
-					qemu_config_path=cli_args.qemu_config)
-
+		linux = tree_from_args(cli_args)
 		request = KunitRequest(build_dir=cli_args.build_dir,
 				       make_options=cli_args.make_options,
 				       jobs=cli_args.jobs,
@@ -438,14 +442,7 @@ def main(argv):
 				not os.path.exists(cli_args.build_dir)):
 			os.mkdir(cli_args.build_dir)
 
-		if not linux:
-			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
-					kunitconfig_path=cli_args.kunitconfig,
-					kconfig_add=cli_args.kconfig_add,
-					arch=cli_args.arch,
-					cross_compile=cli_args.cross_compile,
-					qemu_config_path=cli_args.qemu_config)
-
+		linux = tree_from_args(cli_args)
 		request = KunitConfigRequest(build_dir=cli_args.build_dir,
 					     make_options=cli_args.make_options)
 		result = config_tests(linux, request)
@@ -455,14 +452,7 @@ def main(argv):
 		if result.status != KunitStatus.SUCCESS:
 			sys.exit(1)
 	elif cli_args.subcommand == 'build':
-		if not linux:
-			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
-					kunitconfig_path=cli_args.kunitconfig,
-					kconfig_add=cli_args.kconfig_add,
-					arch=cli_args.arch,
-					cross_compile=cli_args.cross_compile,
-					qemu_config_path=cli_args.qemu_config)
-
+		linux = tree_from_args(cli_args)
 		request = KunitBuildRequest(build_dir=cli_args.build_dir,
 					    make_options=cli_args.make_options,
 					    jobs=cli_args.jobs,
@@ -474,14 +464,7 @@ def main(argv):
 		if result.status != KunitStatus.SUCCESS:
 			sys.exit(1)
 	elif cli_args.subcommand == 'exec':
-		if not linux:
-			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
-					kunitconfig_path=cli_args.kunitconfig,
-					kconfig_add=cli_args.kconfig_add,
-					arch=cli_args.arch,
-					cross_compile=cli_args.cross_compile,
-					qemu_config_path=cli_args.qemu_config)
-
+		linux = tree_from_args(cli_args)
 		exec_request = KunitExecRequest(raw_output=cli_args.raw_output,
 						build_dir=cli_args.build_dir,
 						json=cli_args.json,
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index b9158017ece6..baee11d96474 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -503,24 +503,25 @@ class KUnitMainTest(unittest.TestCase):
 		self.print_mock = mock.patch('builtins.print').start()
 		self.addCleanup(mock.patch.stopall)
 
-		self.linux_source_mock = mock.Mock()
-		self.linux_source_mock.build_reconfig = mock.Mock(return_value=True)
-		self.linux_source_mock.build_kernel = mock.Mock(return_value=True)
-		self.linux_source_mock.run_kernel = mock.Mock(return_value=all_passed_log)
+		self.mock_linux_init = mock.patch.object(kunit_kernel, 'LinuxSourceTree').start()
+		self.linux_source_mock = self.mock_linux_init.return_value
+		self.linux_source_mock.build_reconfig.return_value = True
+		self.linux_source_mock.build_kernel.return_value = True
+		self.linux_source_mock.run_kernel.return_value = all_passed_log
 
 	def test_config_passes_args_pass(self):
-		kunit.main(['config', '--build_dir=.kunit'], self.linux_source_mock)
+		kunit.main(['config', '--build_dir=.kunit'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 0)
 
 	def test_build_passes_args_pass(self):
-		kunit.main(['build'], self.linux_source_mock)
+		kunit.main(['build'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.linux_source_mock.build_kernel.assert_called_once_with(False, kunit.get_default_jobs(), '.kunit', None)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 0)
 
 	def test_exec_passes_args_pass(self):
-		kunit.main(['exec'], self.linux_source_mock)
+		kunit.main(['exec'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 0)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 1)
 		self.linux_source_mock.run_kernel.assert_called_once_with(
@@ -528,7 +529,7 @@ class KUnitMainTest(unittest.TestCase):
 		self.print_mock.assert_any_call(StrContains('Testing complete.'))
 
 	def test_run_passes_args_pass(self):
-		kunit.main(['run'], self.linux_source_mock)
+		kunit.main(['run'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 1)
 		self.linux_source_mock.run_kernel.assert_called_once_with(
@@ -538,13 +539,13 @@ class KUnitMainTest(unittest.TestCase):
 	def test_exec_passes_args_fail(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
 		with self.assertRaises(SystemExit) as e:
-			kunit.main(['exec'], self.linux_source_mock)
+			kunit.main(['exec'])
 		self.assertEqual(e.exception.code, 1)
 
 	def test_run_passes_args_fail(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
 		with self.assertRaises(SystemExit) as e:
-			kunit.main(['run'], self.linux_source_mock)
+			kunit.main(['run'])
 		self.assertEqual(e.exception.code, 1)
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 1)
@@ -553,7 +554,7 @@ class KUnitMainTest(unittest.TestCase):
 	def test_exec_no_tests(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=['TAP version 14', '1..0'])
 		with self.assertRaises(SystemExit) as e:
-			kunit.main(['run'], self.linux_source_mock)
+			kunit.main(['run'])
 		self.assertEqual(e.exception.code, 1)
 		self.linux_source_mock.run_kernel.assert_called_once_with(
 			args=None, build_dir='.kunit', filter_glob='', timeout=300)
@@ -561,7 +562,7 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_exec_raw_output(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
-		kunit.main(['exec', '--raw_output'], self.linux_source_mock)
+		kunit.main(['exec', '--raw_output'])
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 1)
 		for call in self.print_mock.call_args_list:
 			self.assertNotEqual(call, mock.call(StrContains('Testing complete.')))
@@ -569,7 +570,7 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_run_raw_output(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
-		kunit.main(['run', '--raw_output'], self.linux_source_mock)
+		kunit.main(['run', '--raw_output'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 1)
 		for call in self.print_mock.call_args_list:
@@ -578,7 +579,7 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_run_raw_output_kunit(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
-		kunit.main(['run', '--raw_output=kunit'], self.linux_source_mock)
+		kunit.main(['run', '--raw_output=kunit'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 1)
 		for call in self.print_mock.call_args_list:
@@ -588,27 +589,27 @@ class KUnitMainTest(unittest.TestCase):
 	def test_run_raw_output_invalid(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
 		with self.assertRaises(SystemExit) as e:
-			kunit.main(['run', '--raw_output=invalid'], self.linux_source_mock)
+			kunit.main(['run', '--raw_output=invalid'])
 		self.assertNotEqual(e.exception.code, 0)
 
 	def test_run_raw_output_does_not_take_positional_args(self):
 		# --raw_output is a string flag, but we don't want it to consume
 		# any positional arguments, only ones after an '='
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
-		kunit.main(['run', '--raw_output', 'filter_glob'], self.linux_source_mock)
+		kunit.main(['run', '--raw_output', 'filter_glob'])
 		self.linux_source_mock.run_kernel.assert_called_once_with(
 			args=None, build_dir='.kunit', filter_glob='filter_glob', timeout=300)
 
 	def test_exec_timeout(self):
 		timeout = 3453
-		kunit.main(['exec', '--timeout', str(timeout)], self.linux_source_mock)
+		kunit.main(['exec', '--timeout', str(timeout)])
 		self.linux_source_mock.run_kernel.assert_called_once_with(
 			args=None, build_dir='.kunit', filter_glob='', timeout=timeout)
 		self.print_mock.assert_any_call(StrContains('Testing complete.'))
 
 	def test_run_timeout(self):
 		timeout = 3453
-		kunit.main(['run', '--timeout', str(timeout)], self.linux_source_mock)
+		kunit.main(['run', '--timeout', str(timeout)])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.linux_source_mock.run_kernel.assert_called_once_with(
 			args=None, build_dir='.kunit', filter_glob='', timeout=timeout)
@@ -616,7 +617,7 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_run_builddir(self):
 		build_dir = '.kunit'
-		kunit.main(['run', '--build_dir=.kunit'], self.linux_source_mock)
+		kunit.main(['run', '--build_dir=.kunit'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.linux_source_mock.run_kernel.assert_called_once_with(
 			args=None, build_dir=build_dir, filter_glob='', timeout=300)
@@ -624,60 +625,54 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_config_builddir(self):
 		build_dir = '.kunit'
-		kunit.main(['config', '--build_dir', build_dir], self.linux_source_mock)
+		kunit.main(['config', '--build_dir', build_dir])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 
 	def test_build_builddir(self):
 		build_dir = '.kunit'
 		jobs = kunit.get_default_jobs()
-		kunit.main(['build', '--build_dir', build_dir], self.linux_source_mock)
+		kunit.main(['build', '--build_dir', build_dir])
 		self.linux_source_mock.build_kernel.assert_called_once_with(False, jobs, build_dir, None)
 
 	def test_exec_builddir(self):
 		build_dir = '.kunit'
-		kunit.main(['exec', '--build_dir', build_dir], self.linux_source_mock)
+		kunit.main(['exec', '--build_dir', build_dir])
 		self.linux_source_mock.run_kernel.assert_called_once_with(
 			args=None, build_dir=build_dir, filter_glob='', timeout=300)
 		self.print_mock.assert_any_call(StrContains('Testing complete.'))
 
-	@mock.patch.object(kunit_kernel, 'LinuxSourceTree')
-	def test_run_kunitconfig(self, mock_linux_init):
-		mock_linux_init.return_value = self.linux_source_mock
+	def test_run_kunitconfig(self):
 		kunit.main(['run', '--kunitconfig=mykunitconfig'])
 		# Just verify that we parsed and initialized it correctly here.
-		mock_linux_init.assert_called_once_with('.kunit',
-							kunitconfig_path='mykunitconfig',
-							kconfig_add=None,
-							arch='um',
-							cross_compile=None,
-							qemu_config_path=None)
-
-	@mock.patch.object(kunit_kernel, 'LinuxSourceTree')
-	def test_config_kunitconfig(self, mock_linux_init):
-		mock_linux_init.return_value = self.linux_source_mock
+		self.mock_linux_init.assert_called_once_with('.kunit',
+							     kunitconfig_path='mykunitconfig',
+							     kconfig_add=None,
+							     arch='um',
+							     cross_compile=None,
+							     qemu_config_path=None)
+
+	def test_config_kunitconfig(self):
 		kunit.main(['config', '--kunitconfig=mykunitconfig'])
 		# Just verify that we parsed and initialized it correctly here.
-		mock_linux_init.assert_called_once_with('.kunit',
-							kunitconfig_path='mykunitconfig',
-							kconfig_add=None,
-							arch='um',
-							cross_compile=None,
-							qemu_config_path=None)
-
-	@mock.patch.object(kunit_kernel, 'LinuxSourceTree')
-	def test_run_kconfig_add(self, mock_linux_init):
-		mock_linux_init.return_value = self.linux_source_mock
+		self.mock_linux_init.assert_called_once_with('.kunit',
+							     kunitconfig_path='mykunitconfig',
+							     kconfig_add=None,
+							     arch='um',
+							     cross_compile=None,
+							     qemu_config_path=None)
+
+	def test_run_kconfig_add(self):
 		kunit.main(['run', '--kconfig_add=CONFIG_KASAN=y', '--kconfig_add=CONFIG_KCSAN=y'])
 		# Just verify that we parsed and initialized it correctly here.
-		mock_linux_init.assert_called_once_with('.kunit',
-							kunitconfig_path=None,
-							kconfig_add=['CONFIG_KASAN=y', 'CONFIG_KCSAN=y'],
-							arch='um',
-							cross_compile=None,
-							qemu_config_path=None)
+		self.mock_linux_init.assert_called_once_with('.kunit',
							     kunitconfig_path=None,
+							     kconfig_add=['CONFIG_KASAN=y', 'CONFIG_KCSAN=y'],
+							     arch='um',
+							     cross_compile=None,
+							     qemu_config_path=None)
 
 	def test_run_kernel_args(self):
-		kunit.main(['run', '--kernel_args=a=1', '--kernel_args=b=2'], self.linux_source_mock)
+		kunit.main(['run', '--kernel_args=a=1', '--kernel_args=b=2'])
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.linux_source_mock.run_kernel.assert_called_once_with(
 			args=['a=1','b=2'], build_dir='.kunit', filter_glob='', timeout=300)
@@ -699,7 +694,7 @@ class KUnitMainTest(unittest.TestCase):
 	@mock.patch.object(kunit, '_list_tests')
 	def test_run_isolated_by_suite(self, mock_tests):
 		mock_tests.return_value = ['suite.test1', 'suite.test2', 'suite2.test1']
-		kunit.main(['exec', '--run_isolated=suite', 'suite*.test*'], self.linux_source_mock)
+		kunit.main(['exec', '--run_isolated=suite', 'suite*.test*'])
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
@@ -712,7 +707,7 @@ class KUnitMainTest(unittest.TestCase):
 	@mock.patch.object(kunit, '_list_tests')
 	def test_run_isolated_by_test(self, mock_tests):
 		mock_tests.return_value = ['suite.test1', 'suite.test2', 'suite2.test1']
-		kunit.main(['exec', '--run_isolated=test', 'suite*'], self.linux_source_mock)
+		kunit.main(['exec', '--run_isolated=test', 'suite*'])
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
--
cgit v1.2.3


From e756dbebd95d7ea7ae2a2343e8924eee10ec6253 Mon Sep 17 00:00:00 2001
From: Daniel Latypov
Date: Mon, 16 May 2022 12:47:30 -0700
Subject: kunit: tool: refactoring printing logic into kunit_printer.py

Context:
* kunit_kernel.py is importing kunit_parser.py just to use the
  print_with_timestamp() function
* the parser is directly printing to stdout, which will become an issue
  if we ever try to run multiple kernels in parallel

This patch introduces a kunit_printer.py file and migrates callers of
kunit_parser.print_with_timestamp() to call
kunit_printer.stdout.print_with_timestamp() instead.

Future changes:
If we want to support showing results for parallel runs, we could then
create new Printers that don't directly write to stdout and refactor
the code to pass around these Printer objects.

Signed-off-by: Daniel Latypov
Reviewed-by: David Gow
Reviewed-by: Brendan Higgins
Signed-off-by: Shuah Khan
---
 tools/testing/kunit/kunit.py           | 17 ++++-----
 tools/testing/kunit/kunit_kernel.py    |  8 ++---
 tools/testing/kunit/kunit_parser.py    | 63 ++++++++++------------------
 tools/testing/kunit/kunit_printer.py   | 48 ++++++++++++++++++++++++
 tools/testing/kunit/kunit_tool_test.py |  4 +--
 5 files changed, 82 insertions(+), 58 deletions(-)
 create mode 100644 tools/testing/kunit/kunit_printer.py

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 8a90d80ee66e..114e548e4f04 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -22,6 +22,7 @@ from typing import Iterable, List, Optional, Sequence, Tuple
 import kunit_json
 import kunit_kernel
 import kunit_parser
+from kunit_printer import stdout
 
 class KunitStatus(Enum):
 	SUCCESS = auto()
@@ -72,7 +73,7 @@ def get_kernel_root_path() -> str:
 
 def config_tests(linux: kunit_kernel.LinuxSourceTree,
 		 request: KunitConfigRequest) -> KunitResult:
-	kunit_parser.print_with_timestamp('Configuring KUnit Kernel ...')
+	stdout.print_with_timestamp('Configuring KUnit Kernel ...')
 
 	config_start = time.time()
 	success = linux.build_reconfig(request.build_dir, request.make_options)
@@ -85,7 +86,7 @@ def config_tests(linux: kunit_kernel.LinuxSourceTree,
 
 def build_tests(linux: kunit_kernel.LinuxSourceTree,
 		request: KunitBuildRequest) -> KunitResult:
-	kunit_parser.print_with_timestamp('Building KUnit Kernel ...')
+	stdout.print_with_timestamp('Building KUnit Kernel ...')
 
 	build_start = time.time()
 	success = linux.build_kernel(request.alltests,
@@ -158,7 +159,7 @@ def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest) -
 	test_counts = kunit_parser.TestCounts()
 	exec_time = 0.0
 	for i, filter_glob in enumerate(filter_globs):
-		kunit_parser.print_with_timestamp('Starting KUnit Kernel ({}/{})...'.format(i+1, len(filter_globs)))
+		stdout.print_with_timestamp('Starting KUnit Kernel ({}/{})...'.format(i+1, len(filter_globs)))
 
 		test_start = time.time()
 		run_result = linux.run_kernel(
@@ -221,7 +222,7 @@ def parse_tests(request: KunitParseRequest, metadata: kunit_json.Metadata, input
 		else:
 			with open(request.json, 'w') as f:
 				f.write(json_str)
-			kunit_parser.print_with_timestamp("Test results stored in %s" %
+			stdout.print_with_timestamp("Test results stored in %s" %
 				os.path.abspath(request.json))
 
 	if test_result.status != kunit_parser.TestStatus.SUCCESS:
@@ -245,7 +246,7 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree,
 
 	run_end = time.time()
 
-	kunit_parser.print_with_timestamp((
+	stdout.print_with_timestamp((
 		'Elapsed time: %.3fs total, %.3fs configuring, %.3fs ' +
 		'building, %.3fs running\n') % (
 				run_end - run_start,
@@ -446,7 +447,7 @@ def main(argv):
 		request = KunitConfigRequest(build_dir=cli_args.build_dir,
 					     make_options=cli_args.make_options)
 		result = config_tests(linux, request)
-		kunit_parser.print_with_timestamp((
+		stdout.print_with_timestamp((
 			'Elapsed time: %.3fs\n') % (
 				result.elapsed_time))
 		if result.status != KunitStatus.SUCCESS:
@@ -458,7 +459,7 @@ def main(argv):
 					    jobs=cli_args.jobs,
 					    alltests=cli_args.alltests)
 		result = config_and_build_tests(linux, request)
-		kunit_parser.print_with_timestamp((
+		stdout.print_with_timestamp((
 			'Elapsed time: %.3fs\n') % (
 				result.elapsed_time))
 		if result.status != KunitStatus.SUCCESS:
@@ -474,7 +475,7 @@ def main(argv):
 						kernel_args=cli_args.kernel_args,
 						run_isolated=cli_args.run_isolated)
 		result = exec_tests(linux, exec_request)
-		kunit_parser.print_with_timestamp((
+		stdout.print_with_timestamp((
 			'Elapsed time: %.3fs\n') % (result.elapsed_time))
 		if result.status != KunitStatus.SUCCESS:
 			sys.exit(1)
diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 8bc8305ba817..b51ce102d82e 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -18,7 +18,7 @@ import threading
 from typing import Iterator, List, Optional, Tuple
 
 import kunit_config
-import kunit_parser
+from kunit_printer import stdout
 import qemu_config
 
 KCONFIG_PATH = '.config'
@@ -138,7 +138,7 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
 		super().__init__(linux_arch='um', cross_compile=cross_compile)
 
 	def make_allyesconfig(self, build_dir: str, make_options) -> None:
-		kunit_parser.print_with_timestamp(
+		stdout.print_with_timestamp(
 			'Enabling all CONFIGs for UML...')
 		command = ['make', 'ARCH=um', 'O=' + build_dir, 'allyesconfig']
 		if make_options:
@@ -148,13 +148,13 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations):
 			stdout=subprocess.DEVNULL,
 			stderr=subprocess.STDOUT)
 		process.wait()
-		kunit_parser.print_with_timestamp(
+		stdout.print_with_timestamp(
 			'Disabling broken configs to run KUnit tests...')
 
 		with open(get_kconfig_path(build_dir), 'a') as config:
 			with open(BROKEN_ALLCONFIG_PATH, 'r') as disable:
 				config.write(disable.read())
-		kunit_parser.print_with_timestamp(
+		stdout.print_with_timestamp(
 			'Starting Kernel with all configs takes a few minutes...')
 
 	def start(self, params: List[str], build_dir: str) -> subprocess.Popen:
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index c5569b367c69..12d3ec77f427 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -13,10 +13,11 @@ from __future__ import annotations
 import re
 import sys
 
-import datetime
 from enum import Enum, auto
 from typing import Iterable, Iterator, List, Optional, Tuple
 
+from kunit_printer import stdout
+
 class Test:
 	"""
 	A class to represent a test parsed from KTAP results. All KTAP
All KTAP @@ -55,7 +56,7 @@ class Test: def add_error(self, error_message: str) -> None: """Records an error that occurred while parsing this test.""" self.counts.errors += 1 - print_with_timestamp(red('[ERROR]') + f' Test: {self.name}: {error_message}') + stdout.print_with_timestamp(stdout.red('[ERROR]') + f' Test: {self.name}: {error_message}') class TestStatus(Enum): """An enumeration class to represent the status of a test.""" @@ -461,32 +462,6 @@ def parse_diagnostic(lines: LineStream) -> List[str]: DIVIDER = '=' * 60 -RESET = '\033[0;0m' - -def red(text: str) -> str: - """Returns inputted string with red color code.""" - if not sys.stdout.isatty(): - return text - return '\033[1;31m' + text + RESET - -def yellow(text: str) -> str: - """Returns inputted string with yellow color code.""" - if not sys.stdout.isatty(): - return text - return '\033[1;33m' + text + RESET - -def green(text: str) -> str: - """Returns inputted string with green color code.""" - if not sys.stdout.isatty(): - return text - return '\033[1;32m' + text + RESET - -ANSI_LEN = len(red('')) - -def print_with_timestamp(message: str) -> None: - """Prints message with timestamp at beginning.""" - print('[%s] %s' % (datetime.datetime.now().strftime('%H:%M:%S'), message)) - def format_test_divider(message: str, len_message: int) -> str: """ Returns string with message centered in fixed width divider. @@ -529,12 +504,12 @@ def print_test_header(test: Test) -> None: message += ' (1 subtest)' else: message += f' ({test.expected_count} subtests)' - print_with_timestamp(format_test_divider(message, len(message))) + stdout.print_with_timestamp(format_test_divider(message, len(message))) def print_log(log: Iterable[str]) -> None: """Prints all strings in saved log for test in yellow.""" for m in log: - print_with_timestamp(yellow(m)) + stdout.print_with_timestamp(stdout.yellow(m)) def format_test_result(test: Test) -> str: """ @@ -551,16 +526,16 @@ def format_test_result(test: Test) -> str: String containing formatted test result """ if test.status == TestStatus.SUCCESS: - return green('[PASSED] ') + test.name + return stdout.green('[PASSED] ') + test.name if test.status == TestStatus.SKIPPED: - return yellow('[SKIPPED] ') + test.name + return stdout.yellow('[SKIPPED] ') + test.name if test.status == TestStatus.NO_TESTS: - return yellow('[NO TESTS RUN] ') + test.name + return stdout.yellow('[NO TESTS RUN] ') + test.name if test.status == TestStatus.TEST_CRASHED: print_log(test.log) - return red('[CRASHED] ') + test.name + return stdout.red('[CRASHED] ') + test.name print_log(test.log) - return red('[FAILED] ') + test.name + return stdout.red('[FAILED] ') + test.name def print_test_result(test: Test) -> None: """ @@ -572,7 +547,7 @@ def print_test_result(test: Test) -> None: Parameters: test - Test object representing current test being printed """ - print_with_timestamp(format_test_result(test)) + stdout.print_with_timestamp(format_test_result(test)) def print_test_footer(test: Test) -> None: """ @@ -585,8 +560,8 @@ def print_test_footer(test: Test) -> None: test - Test object representing current test being printed """ message = format_test_result(test) - print_with_timestamp(format_test_divider(message, - len(message) - ANSI_LEN)) + stdout.print_with_timestamp(format_test_divider(message, + len(message) - stdout.color_len())) def print_summary_line(test: Test) -> None: """ @@ -603,12 +578,12 @@ def print_summary_line(test: Test) -> None: test - Test object representing current test being printed """ if test.status == 
TestStatus.SUCCESS: - color = green + color = stdout.green elif test.status in (TestStatus.SKIPPED, TestStatus.NO_TESTS): - color = yellow + color = stdout.yellow else: - color = red - print_with_timestamp(color(f'Testing complete. {test.counts}')) + color = stdout.red + stdout.print_with_timestamp(color(f'Testing complete. {test.counts}')) # Other methods: @@ -762,7 +737,7 @@ def parse_run_tests(kernel_output: Iterable[str]) -> Test: Return: Test - the main test object with all subtests. """ - print_with_timestamp(DIVIDER) + stdout.print_with_timestamp(DIVIDER) lines = extract_tap_lines(kernel_output) test = Test() if not lines: @@ -773,6 +748,6 @@ def parse_run_tests(kernel_output: Iterable[str]) -> Test: test = parse_test(lines, 0, []) if test.status != TestStatus.NO_TESTS: test.status = test.counts.get_status() - print_with_timestamp(DIVIDER) + stdout.print_with_timestamp(DIVIDER) print_summary_line(test) return test diff --git a/tools/testing/kunit/kunit_printer.py b/tools/testing/kunit/kunit_printer.py new file mode 100644 index 000000000000..5f1cc55ecdf5 --- /dev/null +++ b/tools/testing/kunit/kunit_printer.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Utilities for printing and coloring output. +# +# Copyright (C) 2022, Google LLC. +# Author: Daniel Latypov + +import datetime +import sys +import typing + +_RESET = '\033[0;0m' + +class Printer: + """Wraps a file object, providing utilities for coloring output, etc.""" + + def __init__(self, output: typing.IO): + self._output = output + self._use_color = output.isatty() + + def print(self, message: str) -> None: + print(message, file=self._output) + + def print_with_timestamp(self, message: str) -> None: + ts = datetime.datetime.now().strftime('%H:%M:%S') + self.print(f'[{ts}] {message}') + + def _color(self, code: str, text: str) -> str: + if not self._use_color: + return text + return code + text + _RESET + + def red(self, text: str) -> str: + return self._color('\033[1;31m', text) + + def yellow(self, text: str) -> str: + return self._color('\033[1;33m', text) + + def green(self, text: str) -> str: + return self._color('\033[1;32m', text) + + def color_len(self) -> int: + """Returns the length of the color escape codes.""" + return len(self.red('')) + +# Provides a default instance that prints to stdout +stdout = Printer(sys.stdout) diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index baee11d96474..2973402c5053 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -222,7 +222,7 @@ class KUnitParserTest(unittest.TestCase): def test_no_kunit_output(self): crash_log = test_data_path('test_insufficient_memory.log') - print_mock = mock.patch('builtins.print').start() + print_mock = mock.patch('kunit_printer.Printer.print').start() with open(crash_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines())) @@ -500,7 +500,7 @@ class KUnitMainTest(unittest.TestCase): with open(path) as file: all_passed_log = file.readlines() - self.print_mock = mock.patch('builtins.print').start() + self.print_mock = mock.patch('kunit_printer.Printer.print').start() self.addCleanup(mock.patch.stopall) self.mock_linux_init = mock.patch.object(kunit_kernel, 'LinuxSourceTree').start() -- cgit v1.2.3 From 9241bc818d54b9cb7d1c9a28f26afa58e6d0bb7c Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Fri, 13 May 2022 11:10:32 -0700 Subject: kunit: tool: cosmetic: don't specify duplicate 
kernel cmdline options Context: When using a non-UML arch, kunit.py will boot the test kernel with options like these by default (this is x86_64): > mem=1G console=tty kunit_shutdown=halt console=ttyS0 kunit_shutdown=reboot The first three options are added unconditionally but are only intended for UML. 1. 'mem=1G' is redundant with the '-m 1024' that we hard-code into the qemu commandline. 2. We specify a 'console' for all tools/testing/kunit/qemu_configs/*.py already, so 'console=tty' gets overwritten. 3. For QEMU, we need to use 'reboot', and for UML we need to use 'halt'. If you switch them, kunit.py will hang until the --timeout expires. This patch: Having these duplicate options is a bit noisy. Switch so we only add UML-specific options for UML. I.e. we now get UML: 'mem=1G console=tty kunit_shutdown=halt' (unchanged) x86_64: 'console=ttyS0 kunit_shutdown=reboot' Side effect: you can't overwrite these options on UML w/ --kernel_arg. But you already couldn't for QEMU (console, kunit_shutdown), and why would you want to? Signed-off-by: Daniel Latypov Reviewed-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index b51ce102d82e..12527e7f92a2 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -160,6 +160,7 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations): def start(self, params: List[str], build_dir: str) -> subprocess.Popen: """Runs the Linux UML binary. Must be named 'linux'.""" linux_bin = os.path.join(build_dir, 'linux') + params.extend(['mem=1G', 'console=tty', 'kunit_shutdown=halt']) return subprocess.Popen([linux_bin] + params, stdin=subprocess.PIPE, stdout=subprocess.PIPE, @@ -330,7 +331,6 @@ class LinuxSourceTree: def run_kernel(self, args=None, build_dir='', filter_glob='', timeout=None) -> Iterator[str]: if not args: args = [] - args.extend(['mem=1G', 'console=tty', 'kunit_shutdown=halt']) if filter_glob: args.append('kunit.filter_glob='+filter_glob) -- cgit v1.2.3 From 8c278d97ad721442692e610007494e8e18cc0288 Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Wed, 18 May 2022 10:01:23 -0700 Subject: kunit: tool: simplify creating LinuxSourceTreeOperations Drop get_source_tree_ops() and just call what used to be get_source_tree_ops_from_qemu_config() in both cases. Also rename the functions to have shorter names and add a "_" prefix to note they're not meant to be used outside this file.
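Condensed, the resulting dispatch in LinuxSourceTree.__init__() looks roughly like this (a sketch of the logic from the diff below, not the literal patch text):

    # Sketch: 'um' keeps its dedicated ops class; every other arch goes
    # through the qemu_config machinery, whether the config path was
    # passed in explicitly or derived from the arch name.
    if qemu_config_path:
        self._arch, self._ops = _get_qemu_ops(qemu_config_path, cross_compile)
    else:
        self._arch = 'um' if arch is None else arch
        if self._arch == 'um':
            self._ops = LinuxSourceTreeOperationsUml(cross_compile=cross_compile)
        else:
            qemu_config_path = _default_qemu_config_path(self._arch)
            _, self._ops = _get_qemu_ops(qemu_config_path, cross_compile)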
Signed-off-by: Daniel Latypov Reviewed-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_kernel.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 12527e7f92a2..eb12e97deceb 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -179,19 +179,16 @@ def get_old_kunitconfig_path(build_dir: str) -> str: def get_outfile_path(build_dir: str) -> str: return os.path.join(build_dir, OUTFILE_PATH) -def get_source_tree_ops(arch: str, cross_compile: Optional[str]) -> LinuxSourceTreeOperations: +def _default_qemu_config_path(arch: str) -> str: config_path = os.path.join(QEMU_CONFIGS_DIR, arch + '.py') - if arch == 'um': - return LinuxSourceTreeOperationsUml(cross_compile=cross_compile) if os.path.isfile(config_path): - return get_source_tree_ops_from_qemu_config(config_path, cross_compile)[1] + return config_path options = [f[:-3] for f in os.listdir(QEMU_CONFIGS_DIR) if f.endswith('.py')] raise ConfigError(arch + ' is not a valid arch, options are ' + str(sorted(options))) -def get_source_tree_ops_from_qemu_config(config_path: str, - cross_compile: Optional[str]) -> Tuple[ - str, LinuxSourceTreeOperations]: +def _get_qemu_ops(config_path: str, + cross_compile: Optional[str]) -> Tuple[str, LinuxSourceTreeOperations]: # The module name/path has very little to do with where the actual file # exists (I learned this through experimentation and could not find it # anywhere in the Python documentation). @@ -227,11 +224,14 @@ class LinuxSourceTree: qemu_config_path=None) -> None: signal.signal(signal.SIGINT, self.signal_handler) if qemu_config_path: - self._arch, self._ops = get_source_tree_ops_from_qemu_config( - qemu_config_path, cross_compile) + self._arch, self._ops = _get_qemu_ops(qemu_config_path, cross_compile) else: self._arch = 'um' if arch is None else arch - self._ops = get_source_tree_ops(self._arch, cross_compile) + if self._arch == 'um': + self._ops = LinuxSourceTreeOperationsUml(cross_compile=cross_compile) + else: + qemu_config_path = _default_qemu_config_path(self._arch) + _, self._ops = _get_qemu_ops(qemu_config_path, cross_compile) if kunitconfig_path: if os.path.isdir(kunitconfig_path): -- cgit v1.2.3 From a9333bd344ad6eaf942221e0497ed65ec3224052 Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Wed, 18 May 2022 10:01:24 -0700 Subject: kunit: tool: introduce --qemu_args Example usage: $ ./tools/testing/kunit/kunit.py run --arch=x86_64 \ --kconfig_add=CONFIG_SMP=y --qemu_args='-smp 8' Looking in the test.log, one can see > smp: Bringing up secondary CPUs ... > .... node #0, CPUs: #1 #2 #3 #4 #5 #6 #7 > smp: Brought up 1 node, 8 CPUs This flag would allow people to make tweaks like this without having to create custom qemu_config files. For consistency with --kernel_args, we allow users to repeat this argument, e.g. you can tack on a --qemu_args='-m 2048', or you could just append it to the first string ('-smp 8 -m 2048'). 
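Under the hood the flattening is plain shlex tokenization; a minimal standalone sketch of what tree_from_args() does with the repeated values (illustrative only, mirroring the kunit.py hunk below):

    import shlex

    # What argparse collects for: --qemu_args='-smp 8' --qemu_args='-m 2048'
    raw_qemu_args = ['-smp 8', '-m 2048']

    qemu_args = []
    for arg in raw_qemu_args:
        qemu_args.extend(shlex.split(arg))  # shell-style split of each value

    print(qemu_args)  # ['-smp', '8', '-m', '2048']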
Signed-off-by: Daniel Latypov Reviewed-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 14 +++++++++++++- tools/testing/kunit/kunit_kernel.py | 10 +++++++--- tools/testing/kunit/kunit_tool_test.py | 20 +++++++++++++++++--- 3 files changed, 37 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index 114e548e4f04..b686126afb40 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -10,6 +10,7 @@ import argparse import os import re +import shlex import sys import time @@ -324,6 +325,10 @@ def add_common_opts(parser) -> None: 'a QemuArchParams object.'), type=str, metavar='FILE') + parser.add_argument('--qemu_args', + help='Additional QEMU arguments, e.g. "-smp 8"', + action='append', metavar='') + def add_build_opts(parser) -> None: parser.add_argument('--jobs', help='As in the make command, "Specifies the number of ' @@ -369,12 +374,19 @@ def add_parse_opts(parser) -> None: def tree_from_args(cli_args: argparse.Namespace) -> kunit_kernel.LinuxSourceTree: """Returns a LinuxSourceTree based on the user's arguments.""" + # Allow users to specify multiple arguments in one string, e.g. '-smp 8' + qemu_args: List[str] = [] + if cli_args.qemu_args: + for arg in cli_args.qemu_args: + qemu_args.extend(shlex.split(arg)) + return kunit_kernel.LinuxSourceTree(cli_args.build_dir, kunitconfig_path=cli_args.kunitconfig, kconfig_add=cli_args.kconfig_add, arch=cli_args.arch, cross_compile=cli_args.cross_compile, - qemu_config_path=cli_args.qemu_config) + qemu_config_path=cli_args.qemu_config, + extra_qemu_args=qemu_args) def main(argv): diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index eb12e97deceb..b30622aa374f 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -188,6 +188,7 @@ def _default_qemu_config_path(arch: str) -> str: raise ConfigError(arch + ' is not a valid arch, options are ' + str(sorted(options))) def _get_qemu_ops(config_path: str, + extra_qemu_args: Optional[List[str]], cross_compile: Optional[str]) -> Tuple[str, LinuxSourceTreeOperations]: # The module name/path has very little to do with where the actual file # exists (I learned this through experimentation and could not find it @@ -208,6 +209,8 @@ def _get_qemu_ops(config_path: str, if not hasattr(config, 'QEMU_ARCH'): raise ValueError('qemu_config module missing "QEMU_ARCH": ' + config_path) params: qemu_config.QemuArchParams = config.QEMU_ARCH # type: ignore + if extra_qemu_args: + params.extra_qemu_params.extend(extra_qemu_args) return params.linux_arch, LinuxSourceTreeOperationsQemu( params, cross_compile=cross_compile) @@ -221,17 +224,18 @@ class LinuxSourceTree: kconfig_add: Optional[List[str]]=None, arch=None, cross_compile=None, - qemu_config_path=None) -> None: + qemu_config_path=None, + extra_qemu_args=None) -> None: signal.signal(signal.SIGINT, self.signal_handler) if qemu_config_path: - self._arch, self._ops = _get_qemu_ops(qemu_config_path, cross_compile) + self._arch, self._ops = _get_qemu_ops(qemu_config_path, extra_qemu_args, cross_compile) else: self._arch = 'um' if arch is None else arch if self._arch == 'um': self._ops = LinuxSourceTreeOperationsUml(cross_compile=cross_compile) else: qemu_config_path = _default_qemu_config_path(self._arch) - _, self._ops = _get_qemu_ops(qemu_config_path, cross_compile) + _, self._ops = _get_qemu_ops(qemu_config_path, extra_qemu_args, cross_compile) if 
kunitconfig_path: if os.path.isdir(kunitconfig_path): diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 2973402c5053..59fba0926629 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -649,7 +649,8 @@ class KUnitMainTest(unittest.TestCase): kconfig_add=None, arch='um', cross_compile=None, - qemu_config_path=None) + qemu_config_path=None, + extra_qemu_args=[]) def test_config_kunitconfig(self): kunit.main(['config', '--kunitconfig=mykunitconfig']) @@ -659,7 +660,8 @@ class KUnitMainTest(unittest.TestCase): kconfig_add=None, arch='um', cross_compile=None, - qemu_config_path=None) + qemu_config_path=None, + extra_qemu_args=[]) def test_run_kconfig_add(self): kunit.main(['run', '--kconfig_add=CONFIG_KASAN=y', '--kconfig_add=CONFIG_KCSAN=y']) @@ -669,7 +671,19 @@ class KUnitMainTest(unittest.TestCase): kconfig_add=['CONFIG_KASAN=y', 'CONFIG_KCSAN=y'], arch='um', cross_compile=None, - qemu_config_path=None) + qemu_config_path=None, + extra_qemu_args=[]) + + def test_run_qemu_args(self): + kunit.main(['run', '--arch=x86_64', '--qemu_args', '-m 2048']) + # Just verify that we parsed and initialized it correctly here. + self.mock_linux_init.assert_called_once_with('.kunit', + kunitconfig_path=None, + kconfig_add=None, + arch='x86_64', + cross_compile=None, + qemu_config_path=None, + extra_qemu_args=['-m', '2048']) def test_run_kernel_args(self): kunit.main(['run', '--kernel_args=a=1', '--kernel_args=b=2']) -- cgit v1.2.3 From 8a7c6f859a20ca36a9e3ce71662de697898c9ef5 Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Mon, 27 Jun 2022 22:14:44 +0000 Subject: kunit: tool: refactor internal kconfig handling, allow overriding Currently, you cannot overwrite what's in your kunitconfig via --kconfig_add. Nor can you override something in a qemu_config via either means. This patch makes it so we have this order of priority * --kconfig_add * kunitconfig file (the default or the one from --kunitconfig) * qemu_config The rationale for this order is that the more "dynamic" sources of kconfig options should take priority. --kconfig_add is obviously the most dynamic. And for kunitconfig, users probably tweak the file manually or specify --kunitconfig more often than they delve into qemu_config python files. And internally, we convert the kconfigs from a python list into a set or dict fairly often. We should just use a dict internally. We exposed the set transform in the past since we didn't define __eq__, so also take the chance to shore up the kunit_config.Kconfig interface. Example ======= Let's consider the unrealistic example where someone would want to disable CONFIG_KUNIT. I.e. they run $ ./tools/testing/kunit/kunit.py config --kconfig_add=CONFIG_KUNIT=n Before ------ We'd write the following > # CONFIG_KUNIT is not set > CONFIG_KUNIT_ALL_TESTS=y > CONFIG_KUNIT_TEST=y > CONFIG_KUNIT=y > CONFIG_KUNIT_EXAMPLE_TEST=y And we'd error out with > ERROR:root:Not all Kconfig options selected in kunitconfig were in the generated .config. > This is probably due to unsatisfied dependencies. > Missing: # CONFIG_KUNIT is not set After ----- We'd write the following > # CONFIG_KUNIT is not set > CONFIG_KUNIT_TEST=y > CONFIG_KUNIT_ALL_TESTS=y > CONFIG_KUNIT_EXAMPLE_TEST=y And we'd error out with > ERROR:root:Not all Kconfig options selected in kunitconfig were in the generated .config. > This is probably due to unsatisfied dependencies.
> Missing: CONFIG_KUNIT_EXAMPLE_TEST=y, CONFIG_KUNIT_TEST=y, CONFIG_KUNIT_ALL_TESTS=y Signed-off-by: Daniel Latypov Reviewed-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_config.py | 45 +++++++++++++++++++--------------- tools/testing/kunit/kunit_kernel.py | 20 ++++++++------- tools/testing/kunit/kunit_tool_test.py | 45 ++++++++++++++-------------------- 3 files changed, 54 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py index 75a8dc1683d4..898b2a35eb29 100644 --- a/tools/testing/kunit/kunit_config.py +++ b/tools/testing/kunit/kunit_config.py @@ -8,7 +8,7 @@ from dataclasses import dataclass import re -from typing import List, Set +from typing import Dict, Iterable, Set CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$' CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+|".*")$' @@ -32,35 +32,42 @@ class Kconfig: """Represents defconfig or .config specified using the Kconfig language.""" def __init__(self) -> None: - self._entries = [] # type: List[KconfigEntry] + self._entries = {} # type: Dict[str, str] - def entries(self) -> Set[KconfigEntry]: - return set(self._entries) + def __eq__(self, other) -> bool: + if not isinstance(other, self.__class__): + return False + return self._entries == other._entries - def add_entry(self, entry: KconfigEntry) -> None: - self._entries.append(entry) + def __repr__(self) -> str: + return ','.join(str(e) for e in self.as_entries()) + + def as_entries(self) -> Iterable[KconfigEntry]: + for name, value in self._entries.items(): + yield KconfigEntry(name, value) + + def add_entry(self, name: str, value: str) -> None: + self._entries[name] = value def is_subset_of(self, other: 'Kconfig') -> bool: - other_dict = {e.name: e.value for e in other.entries()} - for a in self.entries(): - b = other_dict.get(a.name) + for name, value in self._entries.items(): + b = other._entries.get(name) if b is None: - if a.value == 'n': + if value == 'n': continue return False - if a.value != b: + if value != b: return False return True def merge_in_entries(self, other: 'Kconfig') -> None: - if other.is_subset_of(self): - return - self._entries = list(self.entries().union(other.entries())) + for name, value in other._entries.items(): + self._entries[name] = value def write_to_file(self, path: str) -> None: with open(path, 'a+') as f: - for entry in self.entries(): - f.write(str(entry) + '\n') + for e in self.as_entries(): + f.write(str(e) + '\n') def parse_file(path: str) -> Kconfig: with open(path, 'r') as f: @@ -78,14 +85,12 @@ def parse_from_string(blob: str) -> Kconfig: match = config_matcher.match(line) if match: - entry = KconfigEntry(match.group(1), match.group(2)) - kconfig.add_entry(entry) + kconfig.add_entry(match.group(1), match.group(2)) continue empty_match = is_not_set_matcher.match(line) if empty_match: - entry = KconfigEntry(empty_match.group(1), 'n') - kconfig.add_entry(entry) + kconfig.add_entry(empty_match.group(1), 'n') continue if line[0] == '#': diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index b30622aa374f..94ec9f65ef19 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -53,8 +53,8 @@ class LinuxSourceTreeOperations: except subprocess.CalledProcessError as e: raise ConfigError(e.output.decode()) - def make_arch_qemuconfig(self, base_kunitconfig: kunit_config.Kconfig) -> None: - pass + def make_arch_qemuconfig(self, base_kunitconfig: 
kunit_config.Kconfig) -> kunit_config.Kconfig: + return base_kunitconfig def make_allyesconfig(self, build_dir: str, make_options) -> None: raise ConfigError('Only the "um" arch is supported for alltests') @@ -109,9 +109,10 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations): self._kernel_command_line = qemu_arch_params.kernel_command_line + ' kunit_shutdown=reboot' self._extra_qemu_params = qemu_arch_params.extra_qemu_params - def make_arch_qemuconfig(self, base_kunitconfig: kunit_config.Kconfig) -> None: + def make_arch_qemuconfig(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: kconfig = kunit_config.parse_from_string(self._kconfig) - base_kunitconfig.merge_in_entries(kconfig) + kconfig.merge_in_entries(base_kunitconfig) + return kconfig def start(self, params: List[str], build_dir: str) -> subprocess.Popen: kernel_path = os.path.join(build_dir, self._kernel_path) @@ -268,10 +269,10 @@ class LinuxSourceTree: validated_kconfig = kunit_config.parse_file(kconfig_path) if self._kconfig.is_subset_of(validated_kconfig): return True - invalid = self._kconfig.entries() - validated_kconfig.entries() + missing = set(self._kconfig.as_entries()) - set(validated_kconfig.as_entries()) message = 'Not all Kconfig options selected in kunitconfig were in the generated .config.\n' \ 'This is probably due to unsatisfied dependencies.\n' \ - 'Missing: ' + ', '.join([str(e) for e in invalid]) + 'Missing: ' + ', '.join(str(e) for e in missing) if self._arch == 'um': message += '\nNote: many Kconfig options aren\'t available on UML. You can try running ' \ 'on a different architecture with something like "--arch=x86_64".' @@ -283,7 +284,7 @@ class LinuxSourceTree: if build_dir and not os.path.exists(build_dir): os.mkdir(build_dir) try: - self._ops.make_arch_qemuconfig(self._kconfig) + self._kconfig = self._ops.make_arch_qemuconfig(self._kconfig) self._kconfig.write_to_file(kconfig_path) self._ops.make_olddefconfig(build_dir, make_options) except ConfigError as e: @@ -304,7 +305,7 @@ class LinuxSourceTree: return True old_kconfig = kunit_config.parse_file(old_path) - return old_kconfig.entries() != self._kconfig.entries() + return old_kconfig != self._kconfig def build_reconfig(self, build_dir: str, make_options) -> bool: """Creates a new .config if it is not a subset of the .kunitconfig.""" @@ -314,7 +315,8 @@ class LinuxSourceTree: return self.build_config(build_dir, make_options) existing_kconfig = kunit_config.parse_file(kconfig_path) - self._ops.make_arch_qemuconfig(self._kconfig) + self._kconfig = self._ops.make_arch_qemuconfig(self._kconfig) + if self._kconfig.is_subset_of(existing_kconfig) and not self._kunitconfig_changed(build_dir): return True print('Regenerating .config ...') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 59fba0926629..e56544d58147 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -45,7 +45,7 @@ class KconfigTest(unittest.TestCase): self.assertTrue(kconfig0.is_subset_of(kconfig0)) kconfig1 = kunit_config.Kconfig() - kconfig1.add_entry(kunit_config.KconfigEntry('TEST', 'y')) + kconfig1.add_entry('TEST', 'y') self.assertTrue(kconfig1.is_subset_of(kconfig1)) self.assertTrue(kconfig0.is_subset_of(kconfig1)) self.assertFalse(kconfig1.is_subset_of(kconfig0)) @@ -56,40 +56,28 @@ class KconfigTest(unittest.TestCase): kconfig = kunit_config.parse_file(kconfig_path) expected_kconfig = kunit_config.Kconfig() - expected_kconfig.add_entry( - 
kunit_config.KconfigEntry('UML', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('MMU', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('TEST', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('EXAMPLE_TEST', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('MK8', 'n')) - - self.assertEqual(kconfig.entries(), expected_kconfig.entries()) + expected_kconfig.add_entry('UML', 'y') + expected_kconfig.add_entry('MMU', 'y') + expected_kconfig.add_entry('TEST', 'y') + expected_kconfig.add_entry('EXAMPLE_TEST', 'y') + expected_kconfig.add_entry('MK8', 'n') + + self.assertEqual(kconfig, expected_kconfig) def test_write_to_file(self): kconfig_path = os.path.join(test_tmpdir, '.config') expected_kconfig = kunit_config.Kconfig() - expected_kconfig.add_entry( - kunit_config.KconfigEntry('UML', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('MMU', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('TEST', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('EXAMPLE_TEST', 'y')) - expected_kconfig.add_entry( - kunit_config.KconfigEntry('MK8', 'n')) + expected_kconfig.add_entry('UML', 'y') + expected_kconfig.add_entry('MMU', 'y') + expected_kconfig.add_entry('TEST', 'y') + expected_kconfig.add_entry('EXAMPLE_TEST', 'y') + expected_kconfig.add_entry('MK8', 'n') expected_kconfig.write_to_file(kconfig_path) actual_kconfig = kunit_config.parse_file(kconfig_path) - - self.assertEqual(actual_kconfig.entries(), - expected_kconfig.entries()) + self.assertEqual(actual_kconfig, expected_kconfig) class KUnitParserTest(unittest.TestCase): @@ -381,8 +369,11 @@ class LinuxSourceTreeTest(unittest.TestCase): kunit_kernel.LinuxSourceTree('', kunitconfig_path=dir) def test_kconfig_add(self): + want_kconfig = kunit_config.Kconfig() + want_kconfig.add_entry('NOT_REAL', 'y') + tree = kunit_kernel.LinuxSourceTree('', kconfig_add=['CONFIG_NOT_REAL=y']) - self.assertIn(kunit_config.KconfigEntry('NOT_REAL', 'y'), tree._kconfig.entries()) + self.assertTrue(want_kconfig.is_subset_of(tree._kconfig), msg=tree._kconfig) def test_invalid_arch(self): with self.assertRaisesRegex(kunit_kernel.ConfigError, 'not a valid arch, options are.*x86_64'): -- cgit v1.2.3 From 1d202d1496a0be94100d8cbc2b658dcd980a3edf Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Mon, 27 Jun 2022 22:14:46 +0000 Subject: kunit: add coverage_uml.config to enable GCOV on UML Now that kunit.py's --kunitconfig is repeatable, let's create a file to hold the various options needed to enable coverage under UML. This can be used like so: $ ./tools/testing/kunit/kunit.py run \ --kunitconfig=tools/testing/kunit/configs/all_tests_uml.config \ --kunitconfig=tools/testing/kunit/configs/coverage_uml.config \ --make_options=CC=/usr/bin/gcc-6 which on my system is enough to get coverage working [1]. This is still a clunky command, but far better than before. 
[1] at the time of this commit, I get: Overall coverage rate: lines......: 11.6% (34112 of 295033 lines) functions..: 15.3% (3721 of 24368 functions) Signed-off-by: Daniel Latypov Reviewed-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/running_tips.rst | 3 +-- tools/testing/kunit/configs/coverage_uml.config | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 tools/testing/kunit/configs/coverage_uml.config (limited to 'tools') diff --git a/Documentation/dev-tools/kunit/running_tips.rst b/Documentation/dev-tools/kunit/running_tips.rst index c36f6760087d..205ea21c9cca 100644 --- a/Documentation/dev-tools/kunit/running_tips.rst +++ b/Documentation/dev-tools/kunit/running_tips.rst @@ -123,8 +123,7 @@ Putting it together into a copy-pastable sequence of commands: .. code-block:: bash # Append coverage options to the current config - $ echo -e "CONFIG_DEBUG_KERNEL=y\nCONFIG_DEBUG_INFO=y\nCONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y\nCONFIG_GCOV=y" >> .kunit/.kunitconfig - $ ./tools/testing/kunit/kunit.py run + $ ./tools/testing/kunit/kunit.py run --kunitconfig=.kunit/ --kunitconfig=tools/testing/kunit/configs/coverage_uml.config # Extract the coverage information from the build dir (.kunit/) $ lcov -t "my_kunit_tests" -o coverage.info -c -d .kunit/ diff --git a/tools/testing/kunit/configs/coverage_uml.config b/tools/testing/kunit/configs/coverage_uml.config new file mode 100644 index 000000000000..bacb77664fa8 --- /dev/null +++ b/tools/testing/kunit/configs/coverage_uml.config @@ -0,0 +1,11 @@ +# This config fragment enables coverage on UML, which is different from the +# normal gcov used in other arches (no debugfs). +# Example usage: +# ./tools/testing/kunit/kunit.py run \ +# --kunitconfig=tools/testing/kunit/configs/all_tests_uml.config \ +# --kunitconfig=tools/testing/kunit/configs/coverage_uml.config + +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +CONFIG_GCOV=y -- cgit v1.2.3 From f8d3da4ef8faf027261e06b7864583930dd7c7b9 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 6 Jul 2022 16:25:47 -0700 Subject: bpf: Add flags arg to bpf_dynptr_read and bpf_dynptr_write APIs Commit 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") added the bpf_dynptr_write() and bpf_dynptr_read() APIs. However, some dynptr types will need to have flags passed in as well (e.g. when writing to a skb, the user may want to invalidate the hash or recompute the checksum). This patch adds a "u64 flags" arg to the bpf_dynptr_read() and bpf_dynptr_write() APIs before their UAPI signatures freeze, after which we can no longer change them once a 5.19.x kernel is released. Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220706232547.4016651-1-joannelkoong@gmail.com --- include/uapi/linux/bpf.h | 11 +++++++---- kernel/bpf/helpers.c | 12 ++++++++---- tools/include/uapi/linux/bpf.h | 11 +++++++---- tools/testing/selftests/bpf/progs/dynptr_fail.c | 10 +++++----- tools/testing/selftests/bpf/progs/dynptr_success.c | 4 ++-- 5 files changed, 29 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4009dbdf62d..ef78e0e1a754 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5222,22 +5222,25 @@ union bpf_attr { * Return * Nothing. Always succeeds.
* - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length - * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr. + * is a read-only dynptr or if *flags* is not 0. * * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) * Description diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 225806a02efb..bb1254f07667 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1497,11 +1497,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) { int err; - if (!src->data) + if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); @@ -1521,13 +1522,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) { int err; - if (!dst->data || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); @@ -1547,6 +1550,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4009dbdf62d..ef78e0e1a754 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5222,22 +5222,25 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length - * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. 
* - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr. + * is a read-only dynptr or if *flags* is not 0. * * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) * Description diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index d811cff73597..0a26c243e6e9 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -140,12 +140,12 @@ int use_after_invalid(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(read_data), 0, &ptr); - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); return 0; } @@ -338,7 +338,7 @@ int invalid_helper2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0); return 0; } @@ -377,7 +377,7 @@ int invalid_write2(void *ctx) memcpy((void *)&ptr + 8, &x, sizeof(x)); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); @@ -473,7 +473,7 @@ int invalid_read2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index d67be48df4b2..a3a6103c8569 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -43,10 +43,10 @@ int test_read_write(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); /* Write data into the dynptr */ - err = err ?: bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data)); + err = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); /* Read the data that was written into the dynptr */ - err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); /* Ensure the data we read matches the data we wrote */ for (i = 0; i < sizeof(read_data); i++) { -- cgit v1.2.3 From 018a8e75b49cb846ebfa48076bc4fe0bb67c9c24 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 7 Jul 2022 13:16:12 +0200 Subject: selftests, xsk: Rename AF_XDP testing app Recently, the xsk part of libbpf was moved to the selftests/bpf directory, where it lives on its own because there is an AF_XDP testing application, called xdpxceiver, that needs it. That name makes it a bit hard to indicate who maintains it, as there are other XDP samples in there, whereas this one is strictly about AF_XDP. Do s/xdpxceiver/xskxceiver so that it will be easier to figure out who maintains it.
A follow-up patch will correct MAINTAINERS file. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220707111613.49031-2-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/.gitignore | 2 +- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/test_xsk.sh | 6 +- tools/testing/selftests/bpf/xdpxceiver.c | 1682 ---------------------------- tools/testing/selftests/bpf/xdpxceiver.h | 172 --- tools/testing/selftests/bpf/xsk_prereqs.sh | 4 +- tools/testing/selftests/bpf/xskxceiver.c | 1682 ++++++++++++++++++++++++++++ tools/testing/selftests/bpf/xskxceiver.h | 172 +++ 8 files changed, 1862 insertions(+), 1862 deletions(-) delete mode 100644 tools/testing/selftests/bpf/xdpxceiver.c delete mode 100644 tools/testing/selftests/bpf/xdpxceiver.h create mode 100644 tools/testing/selftests/bpf/xskxceiver.c create mode 100644 tools/testing/selftests/bpf/xskxceiver.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index ca2f47f45670..3a8cb2404ea6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -41,6 +41,6 @@ test_cpp /bench *.ko *.tmp -xdpxceiver +xskxceiver xdp_redirect_multi xdp_synproxy diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dfaac97222af..8d59ec7f4c2d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,7 +82,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ - xdpxceiver xdp_redirect_multi xdp_synproxy + xskxceiver xdp_redirect_multi xdp_synproxy TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read @@ -231,7 +231,7 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(OUTPUT)/xsk.o: $(BPFOBJ) -$(OUTPUT)/xdpxceiver: $(OUTPUT)/xsk.o +$(OUTPUT)/xskxceiver: $(OUTPUT)/xsk.o BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 567500299231..096a957594cd 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -47,7 +47,7 @@ # conflict with any existing interface # * tests the veth and xsk layers of the topology # -# See the source xdpxceiver.c for information on each test +# See the source xskxceiver.c for information on each test # # Kernel configuration: # --------------------- @@ -160,14 +160,14 @@ statusList=() TEST_NAME="XSK_SELFTESTS_SOFTIRQ" -execxdpxceiver +exec_xskxceiver cleanup_exit ${VETH0} ${VETH1} ${NS1} TEST_NAME="XSK_SELFTESTS_BUSY_POLL" busy_poll=1 setup_vethPairs -execxdpxceiver +exec_xskxceiver ## END TESTS diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c deleted file mode 100644 index 4c425a43e5b0..000000000000 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ /dev/null @@ -1,1682 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2020 Intel Corporation. */ - -/* - * Some functions in this program are taken from - * Linux kernel samples/bpf/xdpsock* and modified - * for use. 
- * - * See test_xsk.sh for detailed information on test topology - * and prerequisite network setup. - * - * This test program contains two threads, each thread is single socket with - * a unique UMEM. It validates in-order packet delivery and packet content - * by sending packets to each other. - * - * Tests Information: - * ------------------ - * These selftests test AF_XDP SKB and Native/DRV modes using veth - * Virtual Ethernet interfaces. - * - * For each mode, the following tests are run: - * a. nopoll - soft-irq processing in run-to-completion mode - * b. poll - using poll() syscall - * c. Socket Teardown - * Create a Tx and a Rx socket, Tx from one socket, Rx on another. Destroy - * both sockets, then repeat multiple times. Only nopoll mode is used - * d. Bi-directional sockets - * Configure sockets as bi-directional tx/rx sockets, sets up fill and - * completion rings on each socket, tx/rx in both directions. Only nopoll - * mode is used - * e. Statistics - * Trigger some error conditions and ensure that the appropriate statistics - * are incremented. Within this test, the following statistics are tested: - * i. rx dropped - * Increase the UMEM frame headroom to a value which results in - * insufficient space in the rx buffer for both the packet and the headroom. - * ii. tx invalid - * Set the 'len' field of tx descriptors to an invalid value (umem frame - * size + 1). - * iii. rx ring full - * Reduce the size of the RX ring to a fraction of the fill ring size. - * iv. fill queue empty - * Do not populate the fill queue and then try to receive pkts. - * f. bpf_link resource persistence - * Configure sockets at indexes 0 and 1, run a traffic on queue ids 0, - * then remove xsk sockets from queue 0 on both veth interfaces and - * finally run a traffic on queues ids 1 - * g. unaligned mode - * h. tests for invalid and corner case Tx descriptors so that the correct ones - * are discarded and let through, respectively. - * i. 2K frame size tests - * - * Total tests: 12 - * - * Flow: - * ----- - * - Single process spawns two threads: Tx and Rx - * - Each of these two threads attach to a veth interface within their assigned - * namespaces - * - Each thread Creates one AF_XDP socket connected to a unique umem for each - * veth interface - * - Tx thread Transmits 10k packets from veth to veth - * - Rx thread verifies if all 10k packets were received and delivered in-order, - * and have the right content - * - * Enable/disable packet dump mode: - * -------------------------- - * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add - * parameter -D to params array in test_xsk.sh, i.e. params=("-S" "-D") - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "xsk.h" -#include "xdpxceiver.h" -#include "../kselftest.h" - -/* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf. 
- * Until xdpxceiver is either moved or re-writed into libxdp, suppress - * deprecation warnings in this file - */ -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; -static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; -static const char *IP1 = "192.168.100.162"; -static const char *IP2 = "192.168.100.161"; -static const u16 UDP_PORT1 = 2020; -static const u16 UDP_PORT2 = 2121; - -static void __exit_with_error(int error, const char *file, const char *func, int line) -{ - ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, - strerror(error)); - ksft_exit_xfail(); -} - -#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) - -#define mode_string(test) (test)->ifobj_tx->xdp_flags & XDP_FLAGS_SKB_MODE ? "SKB" : "DRV" -#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "" - -static void report_failure(struct test_spec *test) -{ - if (test->fail) - return; - - ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test), - test->name); - test->fail = true; -} - -static void memset32_htonl(void *dest, u32 val, u32 size) -{ - u32 *ptr = (u32 *)dest; - int i; - - val = htonl(val); - - for (i = 0; i < (size & (~0x3)); i += 4) - ptr[i >> 2] = val; -} - -/* - * Fold a partial checksum - * This function code has been taken from - * Linux kernel include/asm-generic/checksum.h - */ -static __u16 csum_fold(__u32 csum) -{ - u32 sum = (__force u32)csum; - - sum = (sum & 0xffff) + (sum >> 16); - sum = (sum & 0xffff) + (sum >> 16); - return (__force __u16)~sum; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static u32 from64to32(u64 x) -{ - /* add up 32-bit and 32-bit for 32+c bit */ - x = (x & 0xffffffff) + (x >> 32); - /* add up carry.. 
*/ - x = (x & 0xffffffff) + (x >> 32); - return (u32)x; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) -{ - unsigned long long s = (__force u32)sum; - - s += (__force u32)saddr; - s += (__force u32)daddr; -#ifdef __BIG_ENDIAN__ - s += proto + len; -#else - s += (proto + len) << 8; -#endif - return (__force __u32)from64to32(s); -} - -/* - * This function has been taken from - * Linux kernel include/asm-generic/checksum.h - */ -static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) -{ - return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); -} - -static u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) -{ - u32 csum = 0; - u32 cnt = 0; - - /* udp hdr and data */ - for (; cnt < len; cnt += 2) - csum += udp_pkt[cnt >> 1]; - - return csum_tcpudp_magic(saddr, daddr, len, proto, csum); -} - -static void gen_eth_hdr(struct ifobject *ifobject, struct ethhdr *eth_hdr) -{ - memcpy(eth_hdr->h_dest, ifobject->dst_mac, ETH_ALEN); - memcpy(eth_hdr->h_source, ifobject->src_mac, ETH_ALEN); - eth_hdr->h_proto = htons(ETH_P_IP); -} - -static void gen_ip_hdr(struct ifobject *ifobject, struct iphdr *ip_hdr) -{ - ip_hdr->version = IP_PKT_VER; - ip_hdr->ihl = 0x5; - ip_hdr->tos = IP_PKT_TOS; - ip_hdr->tot_len = htons(IP_PKT_SIZE); - ip_hdr->id = 0; - ip_hdr->frag_off = 0; - ip_hdr->ttl = IPDEFTTL; - ip_hdr->protocol = IPPROTO_UDP; - ip_hdr->saddr = ifobject->src_ip; - ip_hdr->daddr = ifobject->dst_ip; - ip_hdr->check = 0; -} - -static void gen_udp_hdr(u32 payload, void *pkt, struct ifobject *ifobject, - struct udphdr *udp_hdr) -{ - udp_hdr->source = htons(ifobject->src_port); - udp_hdr->dest = htons(ifobject->dst_port); - udp_hdr->len = htons(UDP_PKT_SIZE); - memset32_htonl(pkt + PKT_HDR_SIZE, payload, UDP_PKT_DATA_SIZE); -} - -static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) -{ - udp_hdr->check = 0; - udp_hdr->check = - udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr); -} - -static int xsk_configure_umem(struct xsk_umem_info *umem, void *buffer, u64 size) -{ - struct xsk_umem_config cfg = { - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, - .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, - .frame_size = umem->frame_size, - .frame_headroom = umem->frame_headroom, - .flags = XSK_UMEM__DEFAULT_FLAGS - }; - int ret; - - if (umem->unaligned_mode) - cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; - - ret = xsk_umem__create(&umem->umem, buffer, size, - &umem->fq, &umem->cq, &cfg); - if (ret) - return ret; - - umem->buffer = buffer; - return 0; -} - -static void enable_busy_poll(struct xsk_socket_info *xsk) -{ - int sock_opt; - - sock_opt = 1; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = 20; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = BATCH_SIZE; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); -} - -static int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, - struct ifobject *ifobject, bool shared) -{ - struct xsk_socket_config cfg = {}; - struct xsk_ring_cons *rxr; - struct xsk_ring_prod *txr; - - xsk->umem = umem; - 
cfg.rx_size = xsk->rxqsize; - cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; - cfg.xdp_flags = ifobject->xdp_flags; - cfg.bind_flags = ifobject->bind_flags; - if (shared) - cfg.bind_flags |= XDP_SHARED_UMEM; - - txr = ifobject->tx_on ? &xsk->tx : NULL; - rxr = ifobject->rx_on ? &xsk->rx : NULL; - return xsk_socket__create(&xsk->xsk, ifobject->ifname, 0, umem->umem, rxr, txr, &cfg); -} - -static struct option long_options[] = { - {"interface", required_argument, 0, 'i'}, - {"busy-poll", no_argument, 0, 'b'}, - {"dump-pkts", no_argument, 0, 'D'}, - {"verbose", no_argument, 0, 'v'}, - {0, 0, 0, 0} -}; - -static void usage(const char *prog) -{ - const char *str = - " Usage: %s [OPTIONS]\n" - " Options:\n" - " -i, --interface Use interface\n" - " -D, --dump-pkts Dump packets L2 - L5\n" - " -v, --verbose Verbose output\n" - " -b, --busy-poll Enable busy poll\n"; - - ksft_print_msg(str, prog); -} - -static int switch_namespace(const char *nsname) -{ - char fqns[26] = "/var/run/netns/"; - int nsfd; - - if (!nsname || strlen(nsname) == 0) - return -1; - - strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1); - nsfd = open(fqns, O_RDONLY); - - if (nsfd == -1) - exit_with_error(errno); - - if (setns(nsfd, 0) == -1) - exit_with_error(errno); - - print_verbose("NS switched: %s\n", nsname); - - return nsfd; -} - -static bool validate_interface(struct ifobject *ifobj) -{ - if (!strcmp(ifobj->ifname, "")) - return false; - return true; -} - -static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj_rx, int argc, - char **argv) -{ - struct ifobject *ifobj; - u32 interface_nb = 0; - int option_index, c; - - opterr = 0; - - for (;;) { - char *sptr, *token; - - c = getopt_long(argc, argv, "i:Dvb", long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'i': - if (interface_nb == 0) - ifobj = ifobj_tx; - else if (interface_nb == 1) - ifobj = ifobj_rx; - else - break; - - sptr = strndupa(optarg, strlen(optarg)); - memcpy(ifobj->ifname, strsep(&sptr, ","), MAX_INTERFACE_NAME_CHARS); - token = strsep(&sptr, ","); - if (token) - memcpy(ifobj->nsname, token, MAX_INTERFACES_NAMESPACE_CHARS); - interface_nb++; - break; - case 'D': - opt_pkt_dump = true; - break; - case 'v': - opt_verbose = true; - break; - case 'b': - ifobj_tx->busy_poll = true; - ifobj_rx->busy_poll = true; - break; - default: - usage(basename(argv[0])); - ksft_exit_xfail(); - } - } -} - -static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, - struct ifobject *ifobj_rx) -{ - u32 i, j; - - for (i = 0; i < MAX_INTERFACES; i++) { - struct ifobject *ifobj = i ? 
ifobj_rx : ifobj_tx; - - ifobj->xsk = &ifobj->xsk_arr[0]; - ifobj->use_poll = false; - ifobj->use_fill_ring = true; - ifobj->release_rx = true; - ifobj->pkt_stream = test->pkt_stream_default; - ifobj->validation_func = NULL; - - if (i == 0) { - ifobj->rx_on = false; - ifobj->tx_on = true; - } else { - ifobj->rx_on = true; - ifobj->tx_on = false; - } - - memset(ifobj->umem, 0, sizeof(*ifobj->umem)); - ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; - ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; - - for (j = 0; j < MAX_SOCKETS; j++) { - memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); - ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; - } - } - - test->ifobj_tx = ifobj_tx; - test->ifobj_rx = ifobj_rx; - test->current_step = 0; - test->total_steps = 1; - test->nb_sockets = 1; - test->fail = false; -} - -static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, - struct ifobject *ifobj_rx, enum test_mode mode) -{ - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = test->pkt_stream_default; - memset(test, 0, sizeof(*test)); - test->pkt_stream_default = pkt_stream; - - for (i = 0; i < MAX_INTERFACES; i++) { - struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; - - ifobj->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; - if (mode == TEST_MODE_SKB) - ifobj->xdp_flags |= XDP_FLAGS_SKB_MODE; - else - ifobj->xdp_flags |= XDP_FLAGS_DRV_MODE; - - ifobj->bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; - } - - __test_spec_init(test, ifobj_tx, ifobj_rx); -} - -static void test_spec_reset(struct test_spec *test) -{ - __test_spec_init(test, test->ifobj_tx, test->ifobj_rx); -} - -static void test_spec_set_name(struct test_spec *test, const char *name) -{ - strncpy(test->name, name, MAX_TEST_NAME_SIZE); -} - -static void pkt_stream_reset(struct pkt_stream *pkt_stream) -{ - if (pkt_stream) - pkt_stream->rx_pkt_nb = 0; -} - -static struct pkt *pkt_stream_get_pkt(struct pkt_stream *pkt_stream, u32 pkt_nb) -{ - if (pkt_nb >= pkt_stream->nb_pkts) - return NULL; - - return &pkt_stream->pkts[pkt_nb]; -} - -static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent) -{ - while (pkt_stream->rx_pkt_nb < pkt_stream->nb_pkts) { - (*pkts_sent)++; - if (pkt_stream->pkts[pkt_stream->rx_pkt_nb].valid) - return &pkt_stream->pkts[pkt_stream->rx_pkt_nb++]; - pkt_stream->rx_pkt_nb++; - } - return NULL; -} - -static void pkt_stream_delete(struct pkt_stream *pkt_stream) -{ - free(pkt_stream->pkts); - free(pkt_stream); -} - -static void pkt_stream_restore_default(struct test_spec *test) -{ - struct pkt_stream *tx_pkt_stream = test->ifobj_tx->pkt_stream; - - if (tx_pkt_stream != test->pkt_stream_default) { - pkt_stream_delete(test->ifobj_tx->pkt_stream); - test->ifobj_tx->pkt_stream = test->pkt_stream_default; - } - - if (test->ifobj_rx->pkt_stream != test->pkt_stream_default && - test->ifobj_rx->pkt_stream != tx_pkt_stream) - pkt_stream_delete(test->ifobj_rx->pkt_stream); - test->ifobj_rx->pkt_stream = test->pkt_stream_default; -} - -static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) -{ - struct pkt_stream *pkt_stream; - - pkt_stream = calloc(1, sizeof(*pkt_stream)); - if (!pkt_stream) - return NULL; - - pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); - if (!pkt_stream->pkts) { - free(pkt_stream); - return NULL; - } - - pkt_stream->nb_pkts = nb_pkts; - return pkt_stream; -} - -static void pkt_set(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr, u32 len) -{ - pkt->addr = addr; - pkt->len = len; - if (len > umem->frame_size - 
XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 2 - umem->frame_headroom) - pkt->valid = false; - else - pkt->valid = true; -} - -static struct pkt_stream *pkt_stream_generate(struct xsk_umem_info *umem, u32 nb_pkts, u32 pkt_len) -{ - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = __pkt_stream_alloc(nb_pkts); - if (!pkt_stream) - exit_with_error(ENOMEM); - - pkt_stream->nb_pkts = nb_pkts; - for (i = 0; i < nb_pkts; i++) { - pkt_set(umem, &pkt_stream->pkts[i], (i % umem->num_frames) * umem->frame_size, - pkt_len); - pkt_stream->pkts[i].payload = i; - } - - return pkt_stream; -} - -static struct pkt_stream *pkt_stream_clone(struct xsk_umem_info *umem, - struct pkt_stream *pkt_stream) -{ - return pkt_stream_generate(umem, pkt_stream->nb_pkts, pkt_stream->pkts[0].len); -} - -static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) -{ - struct pkt_stream *pkt_stream; - - pkt_stream = pkt_stream_generate(test->ifobj_tx->umem, nb_pkts, pkt_len); - test->ifobj_tx->pkt_stream = pkt_stream; - test->ifobj_rx->pkt_stream = pkt_stream; -} - -static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset) -{ - struct xsk_umem_info *umem = test->ifobj_tx->umem; - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = pkt_stream_clone(umem, test->pkt_stream_default); - for (i = 1; i < test->pkt_stream_default->nb_pkts; i += 2) - pkt_set(umem, &pkt_stream->pkts[i], - (i % umem->num_frames) * umem->frame_size + offset, pkt_len); - - test->ifobj_tx->pkt_stream = pkt_stream; - test->ifobj_rx->pkt_stream = pkt_stream; -} - -static void pkt_stream_receive_half(struct test_spec *test) -{ - struct xsk_umem_info *umem = test->ifobj_rx->umem; - struct pkt_stream *pkt_stream = test->ifobj_tx->pkt_stream; - u32 i; - - test->ifobj_rx->pkt_stream = pkt_stream_generate(umem, pkt_stream->nb_pkts, - pkt_stream->pkts[0].len); - pkt_stream = test->ifobj_rx->pkt_stream; - for (i = 1; i < pkt_stream->nb_pkts; i += 2) - pkt_stream->pkts[i].valid = false; -} - -static struct pkt *pkt_generate(struct ifobject *ifobject, u32 pkt_nb) -{ - struct pkt *pkt = pkt_stream_get_pkt(ifobject->pkt_stream, pkt_nb); - struct udphdr *udp_hdr; - struct ethhdr *eth_hdr; - struct iphdr *ip_hdr; - void *data; - - if (!pkt) - return NULL; - if (!pkt->valid || pkt->len < MIN_PKT_SIZE) - return pkt; - - data = xsk_umem__get_data(ifobject->umem->buffer, pkt->addr); - udp_hdr = (struct udphdr *)(data + sizeof(struct ethhdr) + sizeof(struct iphdr)); - ip_hdr = (struct iphdr *)(data + sizeof(struct ethhdr)); - eth_hdr = (struct ethhdr *)data; - - gen_udp_hdr(pkt_nb, data, ifobject, udp_hdr); - gen_ip_hdr(ifobject, ip_hdr); - gen_udp_csum(udp_hdr, ip_hdr); - gen_eth_hdr(ifobject, eth_hdr); - - return pkt; -} - -static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) -{ - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = __pkt_stream_alloc(nb_pkts); - if (!pkt_stream) - exit_with_error(ENOMEM); - - test->ifobj_tx->pkt_stream = pkt_stream; - test->ifobj_rx->pkt_stream = pkt_stream; - - for (i = 0; i < nb_pkts; i++) { - pkt_stream->pkts[i].addr = pkts[i].addr; - pkt_stream->pkts[i].len = pkts[i].len; - pkt_stream->pkts[i].payload = i; - pkt_stream->pkts[i].valid = pkts[i].valid; - } -} - -static void pkt_dump(void *pkt, u32 len) -{ - char s[INET_ADDRSTRLEN]; - struct ethhdr *ethhdr; - struct udphdr *udphdr; - struct iphdr *iphdr; - int payload, i; - - ethhdr = pkt; - iphdr = pkt + sizeof(*ethhdr); - udphdr = pkt + sizeof(*ethhdr) + sizeof(*iphdr); - - /*extract L2 
frame */ - fprintf(stdout, "DEBUG>> L2: dst mac: "); - for (i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ethhdr->h_dest[i]); - - fprintf(stdout, "\nDEBUG>> L2: src mac: "); - for (i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ethhdr->h_source[i]); - - /*extract L3 frame */ - fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); - fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", - inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); - fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", - inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); - /*extract L4 frame */ - fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); - fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); - /*extract L5 frame */ - payload = *((uint32_t *)(pkt + PKT_HDR_SIZE)); - - fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); - fprintf(stdout, "---------------------------------------\n"); -} - -static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream, u64 addr, - u64 pkt_stream_addr) -{ - u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom; - u32 offset = addr % umem->frame_size, expected_offset = 0; - - if (!pkt_stream->use_addr_for_fill) - pkt_stream_addr = 0; - - expected_offset += (pkt_stream_addr + headroom + XDP_PACKET_HEADROOM) % umem->frame_size; - - if (offset == expected_offset) - return true; - - ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, expected_offset, offset); - return false; -} - -static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) -{ - void *data = xsk_umem__get_data(buffer, addr); - struct iphdr *iphdr = (struct iphdr *)(data + sizeof(struct ethhdr)); - - if (!pkt) { - ksft_print_msg("[%s] too many packets received\n", __func__); - return false; - } - - if (len < MIN_PKT_SIZE || pkt->len < MIN_PKT_SIZE) { - /* Do not try to verify packets that are smaller than minimum size. 
*/ - return true; - } - - if (pkt->len != len) { - ksft_print_msg("[%s] expected length [%d], got length [%d]\n", - __func__, pkt->len, len); - return false; - } - - if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { - u32 seqnum = ntohl(*((u32 *)(data + PKT_HDR_SIZE))); - - if (opt_pkt_dump) - pkt_dump(data, PKT_SIZE); - - if (pkt->payload != seqnum) { - ksft_print_msg("[%s] expected seqnum [%d], got seqnum [%d]\n", - __func__, pkt->payload, seqnum); - return false; - } - } else { - ksft_print_msg("Invalid frame received: "); - ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, - iphdr->tos); - return false; - } - - return true; -} - -static void kick_tx(struct xsk_socket_info *xsk) -{ - int ret; - - ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - if (ret >= 0) - return; - if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) { - usleep(100); - return; - } - exit_with_error(errno); -} - -static void kick_rx(struct xsk_socket_info *xsk) -{ - int ret; - - ret = recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); - if (ret < 0) - exit_with_error(errno); -} - -static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) -{ - unsigned int rcvd; - u32 idx; - - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) - kick_tx(xsk); - - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); - if (rcvd) { - if (rcvd > xsk->outstanding_tx) { - u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); - - ksft_print_msg("[%s] Too many packets completed\n", __func__); - ksft_print_msg("Last completion address: %llx\n", addr); - return TEST_FAILURE; - } - - xsk_ring_cons__release(&xsk->umem->cq, rcvd); - xsk->outstanding_tx -= rcvd; - } - - return TEST_PASS; -} - -static int receive_pkts(struct ifobject *ifobj, struct pollfd *fds) -{ - struct timeval tv_end, tv_now, tv_timeout = {RECV_TMOUT, 0}; - u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkts_sent = 0; - struct pkt_stream *pkt_stream = ifobj->pkt_stream; - struct xsk_socket_info *xsk = ifobj->xsk; - struct xsk_umem_info *umem = xsk->umem; - struct pkt *pkt; - int ret; - - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - timeradd(&tv_now, &tv_timeout, &tv_end); - - pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); - while (pkt) { - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - if (timercmp(&tv_now, &tv_end, >)) { - ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__); - return TEST_FAILURE; - } - - kick_rx(xsk); - - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); - if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&umem->fq)) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret < 0) - exit_with_error(-ret); - } - continue; - } - - if (ifobj->use_fill_ring) { - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&umem->fq)) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret < 0) - exit_with_error(-ret); - } - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - } - } - - for (i = 0; i < rcvd; i++) { - const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); - u64 addr = desc->addr, orig; - - orig = xsk_umem__extract_addr(addr); - addr = xsk_umem__add_offset_to_addr(addr); - - if (!is_pkt_valid(pkt, umem->buffer, addr, desc->len) || - !is_offset_correct(umem, pkt_stream, addr, pkt->addr)) - return TEST_FAILURE; - - if (ifobj->use_fill_ring) - 
*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig; - pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); - } - - if (ifobj->use_fill_ring) - xsk_ring_prod__submit(&umem->fq, rcvd); - if (ifobj->release_rx) - xsk_ring_cons__release(&xsk->rx, rcvd); - - pthread_mutex_lock(&pacing_mutex); - pkts_in_flight -= pkts_sent; - if (pkts_in_flight < umem->num_frames) - pthread_cond_signal(&pacing_cond); - pthread_mutex_unlock(&pacing_mutex); - pkts_sent = 0; - } - - return TEST_PASS; -} - -static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb) -{ - struct xsk_socket_info *xsk = ifobject->xsk; - u32 i, idx, valid_pkts = 0; - - while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) - complete_pkts(xsk, BATCH_SIZE); - - for (i = 0; i < BATCH_SIZE; i++) { - struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); - struct pkt *pkt = pkt_generate(ifobject, *pkt_nb); - - if (!pkt) - break; - - tx_desc->addr = pkt->addr; - tx_desc->len = pkt->len; - (*pkt_nb)++; - if (pkt->valid) - valid_pkts++; - } - - pthread_mutex_lock(&pacing_mutex); - pkts_in_flight += valid_pkts; - /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)(ifobject->umem->num_frames - BATCH_SIZE)) { - kick_tx(xsk); - pthread_cond_wait(&pacing_cond, &pacing_mutex); - } - pthread_mutex_unlock(&pacing_mutex); - - xsk_ring_prod__submit(&xsk->tx, i); - xsk->outstanding_tx += valid_pkts; - if (complete_pkts(xsk, i)) - return TEST_FAILURE; - - usleep(10); - return TEST_PASS; -} - -static void wait_for_tx_completion(struct xsk_socket_info *xsk) -{ - while (xsk->outstanding_tx) - complete_pkts(xsk, BATCH_SIZE); -} - -static int send_pkts(struct test_spec *test, struct ifobject *ifobject) -{ - struct pollfd fds = { }; - u32 pkt_cnt = 0; - - fds.fd = xsk_socket__fd(ifobject->xsk->xsk); - fds.events = POLLOUT; - - while (pkt_cnt < ifobject->pkt_stream->nb_pkts) { - int err; - - if (ifobject->use_poll) { - int ret; - - ret = poll(&fds, 1, POLL_TMOUT); - if (ret <= 0) - continue; - - if (!(fds.revents & POLLOUT)) - continue; - } - - err = __send_pkts(ifobject, &pkt_cnt); - if (err || test->fail) - return TEST_FAILURE; - } - - wait_for_tx_completion(ifobject->xsk); - return TEST_PASS; -} - -static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats) -{ - int fd = xsk_socket__fd(xsk), err; - socklen_t optlen, expected_len; - - optlen = sizeof(*stats); - err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen); - if (err) { - ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", - __func__, -err, strerror(-err)); - return TEST_FAILURE; - } - - expected_len = sizeof(struct xdp_statistics); - if (optlen != expected_len) { - ksft_print_msg("[%s] getsockopt optlen error. 
Expected: %u got: %u\n", - __func__, expected_len, optlen); - return TEST_FAILURE; - } - - return TEST_PASS; -} - -static int validate_rx_dropped(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - kick_rx(ifobject->xsk); - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - if (stats.rx_dropped == ifobject->pkt_stream->nb_pkts / 2) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_rx_full(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - usleep(1000); - kick_rx(ifobject->xsk); - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - if (stats.rx_ring_full) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_fill_empty(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - usleep(1000); - kick_rx(ifobject->xsk); - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - if (stats.rx_fill_ring_empty_descs) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_tx_invalid_descs(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - int fd = xsk_socket__fd(xsk); - struct xdp_statistics stats; - socklen_t optlen; - int err; - - optlen = sizeof(stats); - err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); - if (err) { - ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", - __func__, -err, strerror(-err)); - return TEST_FAILURE; - } - - if (stats.tx_invalid_descs != ifobject->pkt_stream->nb_pkts / 2) { - ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", - __func__, stats.tx_invalid_descs, ifobject->pkt_stream->nb_pkts / 2); - return TEST_FAILURE; - } - - return TEST_PASS; -} - -static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) -{ - u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; - int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; - LIBBPF_OPTS(bpf_xdp_query_opts, opts); - int ret, ifindex; - void *bufs; - u32 i; - - ifobject->ns_fd = switch_namespace(ifobject->nsname); - - if (ifobject->umem->unaligned_mode) - mmap_flags |= MAP_HUGETLB; - - bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); - if (bufs == MAP_FAILED) - exit_with_error(errno); - - ret = xsk_configure_umem(ifobject->umem, bufs, umem_sz); - if (ret) - exit_with_error(-ret); - - for (i = 0; i < test->nb_sockets; i++) { - u32 ctr = 0; - - while (ctr++ < SOCK_RECONF_CTR) { - ret = xsk_configure_socket(&ifobject->xsk_arr[i], ifobject->umem, - ifobject, !!i); - if (!ret) - break; - - /* Retry if it fails as xsk_socket__create() is asynchronous */ - if (ctr >= SOCK_RECONF_CTR) - exit_with_error(-ret); - usleep(USLEEP_MAX); - } - - if (ifobject->busy_poll) - enable_busy_poll(&ifobject->xsk_arr[i]); - } - - ifobject->xsk = &ifobject->xsk_arr[0]; - - if (!ifobject->rx_on) - return; - - ifindex = if_nametoindex(ifobject->ifname); - if (!ifindex) - exit_with_error(errno); - - ret = xsk_setup_xdp_prog_xsk(ifobject->xsk->xsk, &ifobject->xsk_map_fd); - if (ret) - exit_with_error(-ret); - - ret = bpf_xdp_query(ifindex, ifobject->xdp_flags, &opts); - if (ret) - exit_with_error(-ret); - - if (ifobject->xdp_flags & XDP_FLAGS_SKB_MODE) { - if (opts.attach_mode != XDP_ATTACHED_SKB) { - ksft_print_msg("ERROR: [%s] XDP prog not in SKB mode\n", __func__); - exit_with_error(-EINVAL); - } - } 
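/*
 * Aside (editorial illustration, not part of this patch): the SKB/DRV
 * branches around this point verify that the kernel really attached the
 * XDP program in the requested mode. The same check, condensed into a
 * hypothetical standalone helper built from the calls used above:
 *
 *	static int verify_attach_mode(int ifindex, u32 xdp_flags)
 *	{
 *		LIBBPF_OPTS(bpf_xdp_query_opts, opts);
 *		u32 want = (xdp_flags & XDP_FLAGS_SKB_MODE) ?
 *			   XDP_ATTACHED_SKB : XDP_ATTACHED_DRV;
 *		int ret = bpf_xdp_query(ifindex, xdp_flags, &opts);
 *
 *		if (ret)
 *			return ret;
 *		return opts.attach_mode == want ? 0 : -EINVAL;
 *	}
 */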
else if (ifobject->xdp_flags & XDP_FLAGS_DRV_MODE) { - if (opts.attach_mode != XDP_ATTACHED_DRV) { - ksft_print_msg("ERROR: [%s] XDP prog not in DRV mode\n", __func__); - exit_with_error(-EINVAL); - } - } - - ret = xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd); - if (ret) - exit_with_error(-ret); -} - -static void testapp_cleanup_xsk_res(struct ifobject *ifobj) -{ - print_verbose("Destroying socket\n"); - xsk_socket__delete(ifobj->xsk->xsk); - munmap(ifobj->umem->buffer, ifobj->umem->num_frames * ifobj->umem->frame_size); - xsk_umem__delete(ifobj->umem->umem); -} - -static void *worker_testapp_validate_tx(void *arg) -{ - struct test_spec *test = (struct test_spec *)arg; - struct ifobject *ifobject = test->ifobj_tx; - int err; - - if (test->current_step == 1) - thread_common_ops(test, ifobject); - - print_verbose("Sending %d packets on interface %s\n", ifobject->pkt_stream->nb_pkts, - ifobject->ifname); - err = send_pkts(test, ifobject); - - if (!err && ifobject->validation_func) - err = ifobject->validation_func(ifobject); - if (err) - report_failure(test); - - if (test->total_steps == test->current_step || err) - testapp_cleanup_xsk_res(ifobject); - pthread_exit(NULL); -} - -static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream) -{ - u32 idx = 0, i, buffers_to_fill; - int ret; - - if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) - buffers_to_fill = umem->num_frames; - else - buffers_to_fill = XSK_RING_PROD__DEFAULT_NUM_DESCS; - - ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); - if (ret != buffers_to_fill) - exit_with_error(ENOSPC); - for (i = 0; i < buffers_to_fill; i++) { - u64 addr; - - if (pkt_stream->use_addr_for_fill) { - struct pkt *pkt = pkt_stream_get_pkt(pkt_stream, i); - - if (!pkt) - break; - addr = pkt->addr; - } else { - addr = i * umem->frame_size; - } - - *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; - } - xsk_ring_prod__submit(&umem->fq, buffers_to_fill); -} - -static void *worker_testapp_validate_rx(void *arg) -{ - struct test_spec *test = (struct test_spec *)arg; - struct ifobject *ifobject = test->ifobj_rx; - struct pollfd fds = { }; - int err; - - if (test->current_step == 1) - thread_common_ops(test, ifobject); - - xsk_populate_fill_ring(ifobject->umem, ifobject->pkt_stream); - - fds.fd = xsk_socket__fd(ifobject->xsk->xsk); - fds.events = POLLIN; - - pthread_barrier_wait(&barr); - - err = receive_pkts(ifobject, &fds); - - if (!err && ifobject->validation_func) - err = ifobject->validation_func(ifobject); - if (err) { - report_failure(test); - pthread_mutex_lock(&pacing_mutex); - pthread_cond_signal(&pacing_cond); - pthread_mutex_unlock(&pacing_mutex); - } - - if (test->total_steps == test->current_step || err) - testapp_cleanup_xsk_res(ifobject); - pthread_exit(NULL); -} - -static int testapp_validate_traffic(struct test_spec *test) -{ - struct ifobject *ifobj_tx = test->ifobj_tx; - struct ifobject *ifobj_rx = test->ifobj_rx; - pthread_t t0, t1; - - if (pthread_barrier_init(&barr, NULL, 2)) - exit_with_error(errno); - - test->current_step++; - pkt_stream_reset(ifobj_rx->pkt_stream); - pkts_in_flight = 0; - - /*Spawn RX thread */ - pthread_create(&t0, NULL, ifobj_rx->func_ptr, test); - - pthread_barrier_wait(&barr); - if (pthread_barrier_destroy(&barr)) - exit_with_error(errno); - - /*Spawn TX thread */ - pthread_create(&t1, NULL, ifobj_tx->func_ptr, test); - - pthread_join(t1, NULL); - pthread_join(t0, NULL); - - return !!test->fail; -} - -static void testapp_teardown(struct test_spec 
*test) -{ - int i; - - test_spec_set_name(test, "TEARDOWN"); - for (i = 0; i < MAX_TEARDOWN_ITER; i++) { - if (testapp_validate_traffic(test)) - return; - test_spec_reset(test); - } -} - -static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2) -{ - thread_func_t tmp_func_ptr = (*ifobj1)->func_ptr; - struct ifobject *tmp_ifobj = (*ifobj1); - - (*ifobj1)->func_ptr = (*ifobj2)->func_ptr; - (*ifobj2)->func_ptr = tmp_func_ptr; - - *ifobj1 = *ifobj2; - *ifobj2 = tmp_ifobj; -} - -static void testapp_bidi(struct test_spec *test) -{ - test_spec_set_name(test, "BIDIRECTIONAL"); - test->ifobj_tx->rx_on = true; - test->ifobj_rx->tx_on = true; - test->total_steps = 2; - if (testapp_validate_traffic(test)) - return; - - print_verbose("Switching Tx/Rx vectors\n"); - swap_directions(&test->ifobj_rx, &test->ifobj_tx); - testapp_validate_traffic(test); - - swap_directions(&test->ifobj_rx, &test->ifobj_tx); -} - -static void swap_xsk_resources(struct ifobject *ifobj_tx, struct ifobject *ifobj_rx) -{ - int ret; - - xsk_socket__delete(ifobj_tx->xsk->xsk); - xsk_socket__delete(ifobj_rx->xsk->xsk); - ifobj_tx->xsk = &ifobj_tx->xsk_arr[1]; - ifobj_rx->xsk = &ifobj_rx->xsk_arr[1]; - - ret = xsk_socket__update_xskmap(ifobj_rx->xsk->xsk, ifobj_rx->xsk_map_fd); - if (ret) - exit_with_error(-ret); -} - -static void testapp_bpf_res(struct test_spec *test) -{ - test_spec_set_name(test, "BPF_RES"); - test->total_steps = 2; - test->nb_sockets = 2; - if (testapp_validate_traffic(test)) - return; - - swap_xsk_resources(test->ifobj_tx, test->ifobj_rx); - testapp_validate_traffic(test); -} - -static void testapp_headroom(struct test_spec *test) -{ - test_spec_set_name(test, "UMEM_HEADROOM"); - test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; - testapp_validate_traffic(test); -} - -static void testapp_stats_rx_dropped(struct test_spec *test) -{ - test_spec_set_name(test, "STAT_RX_DROPPED"); - test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3; - pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0); - pkt_stream_receive_half(test); - test->ifobj_rx->validation_func = validate_rx_dropped; - testapp_validate_traffic(test); -} - -static void testapp_stats_tx_invalid_descs(struct test_spec *test) -{ - test_spec_set_name(test, "STAT_TX_INVALID"); - pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0); - test->ifobj_tx->validation_func = validate_tx_invalid_descs; - testapp_validate_traffic(test); - - pkt_stream_restore_default(test); -} - -static void testapp_stats_rx_full(struct test_spec *test) -{ - test_spec_set_name(test, "STAT_RX_FULL"); - pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, PKT_SIZE); - test->ifobj_rx->pkt_stream = pkt_stream_generate(test->ifobj_rx->umem, - DEFAULT_UMEM_BUFFERS, PKT_SIZE); - if (!test->ifobj_rx->pkt_stream) - exit_with_error(ENOMEM); - - test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS; - test->ifobj_rx->release_rx = false; - test->ifobj_rx->validation_func = validate_rx_full; - testapp_validate_traffic(test); - - pkt_stream_restore_default(test); -} - -static void testapp_stats_fill_empty(struct test_spec *test) -{ - test_spec_set_name(test, "STAT_RX_FILL_EMPTY"); - pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, PKT_SIZE); - test->ifobj_rx->pkt_stream = pkt_stream_generate(test->ifobj_rx->umem, - DEFAULT_UMEM_BUFFERS, PKT_SIZE); - if (!test->ifobj_rx->pkt_stream) - exit_with_error(ENOMEM); - - test->ifobj_rx->use_fill_ring = false; 
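/*
 * Aside (editorial illustration, not part of this patch): with
 * use_fill_ring cleared just above, the Rx worker never replenishes the
 * fill queue, so the kernel should bump rx_fill_ring_empty_descs, which
 * validate_fill_empty() (assigned just below) reads back. A minimal
 * sketch of such an XDP_STATISTICS read, using the same calls as
 * get_xsk_stats():
 *
 *	struct xdp_statistics stats;
 *	socklen_t optlen = sizeof(stats);
 *
 *	if (getsockopt(xsk_socket__fd(xsk), SOL_XDP, XDP_STATISTICS,
 *		       &stats, &optlen))
 *		exit_with_error(errno);
 *	if (stats.rx_fill_ring_empty_descs)
 *		return TEST_PASS; // the provoked condition was observed
 */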
- test->ifobj_rx->validation_func = validate_fill_empty; - testapp_validate_traffic(test); - - pkt_stream_restore_default(test); -} - -/* Simple test */ -static bool hugepages_present(struct ifobject *ifobject) -{ - const size_t mmap_sz = 2 * ifobject->umem->num_frames * ifobject->umem->frame_size; - void *bufs; - - bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (bufs == MAP_FAILED) - return false; - - munmap(bufs, mmap_sz); - return true; -} - -static bool testapp_unaligned(struct test_spec *test) -{ - if (!hugepages_present(test->ifobj_tx)) { - ksft_test_result_skip("No 2M huge pages present.\n"); - return false; - } - - test_spec_set_name(test, "UNALIGNED_MODE"); - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - /* Let half of the packets straddle a buffer boundary */ - pkt_stream_replace_half(test, PKT_SIZE, -PKT_SIZE / 2); - test->ifobj_rx->pkt_stream->use_addr_for_fill = true; - testapp_validate_traffic(test); - - pkt_stream_restore_default(test); - return true; -} - -static void testapp_single_pkt(struct test_spec *test) -{ - struct pkt pkts[] = {{0x1000, PKT_SIZE, 0, true}}; - - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - testapp_validate_traffic(test); - pkt_stream_restore_default(test); -} - -static void testapp_invalid_desc(struct test_spec *test) -{ - struct pkt pkts[] = { - /* Zero packet address allowed */ - {0, PKT_SIZE, 0, true}, - /* Allowed packet */ - {0x1000, PKT_SIZE, 0, true}, - /* Straddling the start of umem */ - {-2, PKT_SIZE, 0, false}, - /* Packet too large */ - {0x2000, XSK_UMEM__INVALID_FRAME_SIZE, 0, false}, - /* After umem ends */ - {UMEM_SIZE, PKT_SIZE, 0, false}, - /* Straddle the end of umem */ - {UMEM_SIZE - PKT_SIZE / 2, PKT_SIZE, 0, false}, - /* Straddle a page boundary */ - {0x3000 - PKT_SIZE / 2, PKT_SIZE, 0, false}, - /* Straddle a 2K boundary */ - {0x3800 - PKT_SIZE / 2, PKT_SIZE, 0, true}, - /* Valid packet for sync so that something is received */ - {0x4000, PKT_SIZE, 0, true}}; - - if (test->ifobj_tx->umem->unaligned_mode) { - /* Crossing a page boundary allowed */ - pkts[6].valid = true; - } - if (test->ifobj_tx->umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) { - /* Crossing a 2K frame size boundary not allowed */ - pkts[7].valid = false; - } - - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - testapp_validate_traffic(test); - pkt_stream_restore_default(test); -} - -static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac, - const char *dst_ip, const char *src_ip, const u16 dst_port, - const u16 src_port, thread_func_t func_ptr) -{ - struct in_addr ip; - - memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN); - memcpy(ifobj->src_mac, src_mac, ETH_ALEN); - - inet_aton(dst_ip, &ip); - ifobj->dst_ip = ip.s_addr; - - inet_aton(src_ip, &ip); - ifobj->src_ip = ip.s_addr; - - ifobj->dst_port = dst_port; - ifobj->src_port = src_port; - - ifobj->func_ptr = func_ptr; -} - -static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_type type) -{ - switch (type) { - case TEST_TYPE_STATS_RX_DROPPED: - testapp_stats_rx_dropped(test); - break; - case TEST_TYPE_STATS_TX_INVALID_DESCS: - testapp_stats_tx_invalid_descs(test); - break; - case TEST_TYPE_STATS_RX_FULL: - testapp_stats_rx_full(test); - break; - case TEST_TYPE_STATS_FILL_EMPTY: - testapp_stats_fill_empty(test); - break; - case TEST_TYPE_TEARDOWN: - testapp_teardown(test); - break; - case TEST_TYPE_BIDI: - 
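/*
 * Aside on the testapp_invalid_desc() expectations above (editorial
 * illustration; a simplified model, not the kernel's actual validation
 * code): in aligned mode a Tx descriptor is accepted only if it lies
 * inside the UMEM and does not cross a chunk boundary:
 *
 *	static bool desc_ok(u64 addr, u32 len, u64 umem_size, u32 chunk_size)
 *	{
 *		if (!len || len > chunk_size || addr + len > umem_size)
 *			return false;
 *		// reject descriptors straddling a chunk boundary
 *		return addr / chunk_size == (addr + len - 1) / chunk_size;
 *	}
 *
 * With the default 4K chunks, {0x3000 - PKT_SIZE / 2, PKT_SIZE} straddles
 * 0x3000 and is rejected, while {0x3800 - PKT_SIZE / 2, PKT_SIZE} stays
 * inside one chunk; shrinking chunks to 2K makes the latter straddle
 * 0x3800 and fail too, matching the pkts[6]/pkts[7] adjustments above.
 */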
testapp_bidi(test); - break; - case TEST_TYPE_BPF_RES: - testapp_bpf_res(test); - break; - case TEST_TYPE_RUN_TO_COMPLETION: - test_spec_set_name(test, "RUN_TO_COMPLETION"); - testapp_validate_traffic(test); - break; - case TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT: - test_spec_set_name(test, "RUN_TO_COMPLETION_SINGLE_PKT"); - testapp_single_pkt(test); - break; - case TEST_TYPE_RUN_TO_COMPLETION_2K_FRAME: - test_spec_set_name(test, "RUN_TO_COMPLETION_2K_FRAME_SIZE"); - test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; - pkt_stream_replace(test, DEFAULT_PKT_CNT, PKT_SIZE); - testapp_validate_traffic(test); - - pkt_stream_restore_default(test); - break; - case TEST_TYPE_POLL: - test->ifobj_tx->use_poll = true; - test->ifobj_rx->use_poll = true; - test_spec_set_name(test, "POLL"); - testapp_validate_traffic(test); - break; - case TEST_TYPE_ALIGNED_INV_DESC: - test_spec_set_name(test, "ALIGNED_INV_DESC"); - testapp_invalid_desc(test); - break; - case TEST_TYPE_ALIGNED_INV_DESC_2K_FRAME: - test_spec_set_name(test, "ALIGNED_INV_DESC_2K_FRAME_SIZE"); - test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; - testapp_invalid_desc(test); - break; - case TEST_TYPE_UNALIGNED_INV_DESC: - if (!hugepages_present(test->ifobj_tx)) { - ksft_test_result_skip("No 2M huge pages present.\n"); - return; - } - test_spec_set_name(test, "UNALIGNED_INV_DESC"); - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - testapp_invalid_desc(test); - break; - case TEST_TYPE_UNALIGNED: - if (!testapp_unaligned(test)) - return; - break; - case TEST_TYPE_HEADROOM: - testapp_headroom(test); - break; - default: - break; - } - - if (!test->fail) - ksft_test_result_pass("PASS: %s %s%s\n", mode_string(test), busy_poll_string(test), - test->name); -} - -static struct ifobject *ifobject_create(void) -{ - struct ifobject *ifobj; - - ifobj = calloc(1, sizeof(struct ifobject)); - if (!ifobj) - return NULL; - - ifobj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->xsk_arr)); - if (!ifobj->xsk_arr) - goto out_xsk_arr; - - ifobj->umem = calloc(1, sizeof(*ifobj->umem)); - if (!ifobj->umem) - goto out_umem; - - return ifobj; - -out_umem: - free(ifobj->xsk_arr); -out_xsk_arr: - free(ifobj); - return NULL; -} - -static void ifobject_delete(struct ifobject *ifobj) -{ - free(ifobj->umem); - free(ifobj->xsk_arr); - free(ifobj); -} - -int main(int argc, char **argv) -{ - struct pkt_stream *pkt_stream_default; - struct ifobject *ifobj_tx, *ifobj_rx; - u32 i, j, failed_tests = 0; - struct test_spec test; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - ifobj_tx = ifobject_create(); - if (!ifobj_tx) - exit_with_error(ENOMEM); - ifobj_rx = ifobject_create(); - if (!ifobj_rx) - exit_with_error(ENOMEM); - - setlocale(LC_ALL, ""); - - parse_command_line(ifobj_tx, ifobj_rx, argc, argv); - - if (!validate_interface(ifobj_tx) || !validate_interface(ifobj_rx)) { - usage(basename(argv[0])); - ksft_exit_xfail(); - } - - init_iface(ifobj_tx, MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, - worker_testapp_validate_tx); - init_iface(ifobj_rx, MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, - worker_testapp_validate_rx); - - test_spec_init(&test, ifobj_tx, ifobj_rx, 0); - pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE); - if (!pkt_stream_default) - exit_with_error(ENOMEM); - test.pkt_stream_default = pkt_stream_default; - - ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - - for (i = 0; i < TEST_MODE_MAX; 
i++) - for (j = 0; j < TEST_TYPE_MAX; j++) { - test_spec_init(&test, ifobj_tx, ifobj_rx, i); - run_pkt_test(&test, i, j); - usleep(USLEEP_MAX); - - if (test.fail) - failed_tests++; - } - - pkt_stream_delete(pkt_stream_default); - ifobject_delete(ifobj_tx); - ifobject_delete(ifobj_rx); - - if (failed_tests) - ksft_exit_fail(); - else - ksft_exit_pass(); -} diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h deleted file mode 100644 index 8f672b0fe0e1..000000000000 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ /dev/null @@ -1,172 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright(c) 2020 Intel Corporation. - */ - -#ifndef XDPXCEIVER_H_ -#define XDPXCEIVER_H_ - -#ifndef SOL_XDP -#define SOL_XDP 283 -#endif - -#ifndef AF_XDP -#define AF_XDP 44 -#endif - -#ifndef PF_XDP -#define PF_XDP AF_XDP -#endif - -#ifndef SO_BUSY_POLL_BUDGET -#define SO_BUSY_POLL_BUDGET 70 -#endif - -#ifndef SO_PREFER_BUSY_POLL -#define SO_PREFER_BUSY_POLL 69 -#endif - -#define TEST_PASS 0 -#define TEST_FAILURE -1 -#define MAX_INTERFACES 2 -#define MAX_INTERFACE_NAME_CHARS 7 -#define MAX_INTERFACES_NAMESPACE_CHARS 10 -#define MAX_SOCKETS 2 -#define MAX_TEST_NAME_SIZE 32 -#define MAX_TEARDOWN_ITER 10 -#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ - sizeof(struct udphdr)) -#define MIN_ETH_PKT_SIZE 64 -#define ETH_FCS_SIZE 4 -#define MIN_PKT_SIZE (MIN_ETH_PKT_SIZE - ETH_FCS_SIZE) -#define PKT_SIZE (MIN_PKT_SIZE) -#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) -#define IP_PKT_VER 0x4 -#define IP_PKT_TOS 0x9 -#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) -#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) -#define USLEEP_MAX 10000 -#define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 -#define POLL_TMOUT 1000 -#define RECV_TMOUT 3 -#define DEFAULT_PKT_CNT (4 * 1024) -#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) -#define UMEM_SIZE (DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE) -#define RX_FULL_RXQSIZE 32 -#define UMEM_HEADROOM_TEST_SIZE 128 -#define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1) - -#define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) - -enum test_mode { - TEST_MODE_SKB, - TEST_MODE_DRV, - TEST_MODE_MAX -}; - -enum test_type { - TEST_TYPE_RUN_TO_COMPLETION, - TEST_TYPE_RUN_TO_COMPLETION_2K_FRAME, - TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT, - TEST_TYPE_POLL, - TEST_TYPE_UNALIGNED, - TEST_TYPE_ALIGNED_INV_DESC, - TEST_TYPE_ALIGNED_INV_DESC_2K_FRAME, - TEST_TYPE_UNALIGNED_INV_DESC, - TEST_TYPE_HEADROOM, - TEST_TYPE_TEARDOWN, - TEST_TYPE_BIDI, - TEST_TYPE_STATS_RX_DROPPED, - TEST_TYPE_STATS_TX_INVALID_DESCS, - TEST_TYPE_STATS_RX_FULL, - TEST_TYPE_STATS_FILL_EMPTY, - TEST_TYPE_BPF_RES, - TEST_TYPE_MAX -}; - -static bool opt_pkt_dump; -static bool opt_verbose; - -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - u32 num_frames; - u32 frame_headroom; - void *buffer; - u32 frame_size; - bool unaligned_mode; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; - u32 outstanding_tx; - u32 rxqsize; -}; - -struct pkt { - u64 addr; - u32 len; - u32 payload; - bool valid; -}; - -struct pkt_stream { - u32 nb_pkts; - u32 rx_pkt_nb; - struct pkt *pkts; - bool use_addr_for_fill; -}; - -struct ifobject; -typedef int (*validation_func_t)(struct ifobject *ifobj); -typedef void *(*thread_func_t)(void *arg); - -struct ifobject { - char ifname[MAX_INTERFACE_NAME_CHARS]; - char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; - struct xsk_socket_info *xsk; - struct xsk_socket_info *xsk_arr; - struct xsk_umem_info *umem; - thread_func_t func_ptr; - validation_func_t validation_func; - struct pkt_stream *pkt_stream; - int ns_fd; - int xsk_map_fd; - u32 dst_ip; - u32 src_ip; - u32 xdp_flags; - u32 bind_flags; - u16 src_port; - u16 dst_port; - bool tx_on; - bool rx_on; - bool use_poll; - bool busy_poll; - bool use_fill_ring; - bool release_rx; - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; -}; - -struct test_spec { - struct ifobject *ifobj_tx; - struct ifobject *ifobj_rx; - struct pkt_stream *pkt_stream_default; - u16 total_steps; - u16 current_step; - u16 nb_sockets; - bool fail; - char name[MAX_TEST_NAME_SIZE]; -}; - -pthread_barrier_t barr; -pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_cond_t pacing_cond = PTHREAD_COND_INITIALIZER; - -int pkts_in_flight; - -#endif /* XDPXCEIVER_H */ diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index 684e813803ec..a0b71723a818 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -8,7 +8,7 @@ ksft_xfail=2 ksft_xpass=3 ksft_skip=4 -XSKOBJ=xdpxceiver +XSKOBJ=xskxceiver validate_root_exec() { @@ -77,7 +77,7 @@ validate_ip_utility() [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip; } } -execxdpxceiver() +exec_xskxceiver() { if [[ $busy_poll -eq 1 ]]; then ARGS+="-b " diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c new file mode 100644 index 000000000000..74d56d971baf --- /dev/null +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -0,0 +1,1682 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. */ + +/* + * Some functions in this program are taken from + * Linux kernel samples/bpf/xdpsock* and modified + * for use. + * + * See test_xsk.sh for detailed information on test topology + * and prerequisite network setup. 
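 *
 * Illustrative invocation (editorial sketch, not part of this patch; the
 * interface and namespace names are placeholders, but the flags match this
 * file's parse_command_line(), which takes "-i <ifname>,<netns>" first for
 * the Tx and then for the Rx side):
 *
 *	./xskxceiver -i veth-tx,ns-tx -i veth-rx,ns-rx -v
 *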
+ * + * This test program contains two threads; each thread drives a single socket with + * a unique UMEM. The threads validate in-order packet delivery and packet content + * by sending packets to each other. + * + * Test Information: + * ------------------ + * These selftests test AF_XDP SKB and Native/DRV modes using veth + * Virtual Ethernet interfaces. + * + * For each mode, the following tests are run: + * a. nopoll - soft-irq processing in run-to-completion mode + * b. poll - using poll() syscall + * c. Socket Teardown + * Create a Tx and a Rx socket, Tx from one socket, Rx on another. Destroy + * both sockets, then repeat multiple times. Only nopoll mode is used. + * d. Bi-directional sockets + * Configure sockets as bi-directional tx/rx sockets, set up fill and + * completion rings on each socket, then tx/rx in both directions. Only nopoll + * mode is used. + * e. Statistics + * Trigger some error conditions and ensure that the appropriate statistics + * are incremented. Within this test, the following statistics are tested: + * i. rx dropped + * Increase the UMEM frame headroom to a value which results in + * insufficient space in the rx buffer for both the packet and the headroom. + * ii. tx invalid + * Set the 'len' field of tx descriptors to an invalid value (umem frame + * size + 1). + * iii. rx ring full + * Reduce the size of the RX ring to a fraction of the fill ring size. + * iv. fill queue empty + * Do not populate the fill queue and then try to receive pkts. + * f. bpf_link resource persistence + * Configure sockets at indexes 0 and 1, run traffic on queue id 0, + * then remove the xsk sockets from queue 0 on both veth interfaces and + * finally run traffic on queue id 1. + * g. unaligned mode + * h. Tests for invalid and corner-case Tx descriptors, checking that the + * invalid ones are discarded and the correct ones let through, respectively. + * i. 2K frame size tests + * + * Total tests: 12 + * + * Flow: + * ----- + * - Single process spawns two threads: Tx and Rx + * - Each of these two threads attaches to a veth interface within its assigned + * namespace + * - Each thread creates one AF_XDP socket connected to a unique umem for each + * veth interface + * - Tx thread transmits 10k packets from veth to veth + * - Rx thread verifies that all 10k packets were received in order + * and have the right content + * + * Enable/disable packet dump mode: + * -------------------------------- + * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add + * parameter -D to params array in test_xsk.sh, i.e. params=("-S" "-D") + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xsk.h" +#include "xskxceiver.h" +#include "../kselftest.h" + +/* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf. 
+ * Until xskxceiver is either moved or rewritten into libxdp, suppress + * deprecation warnings in this file + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; +static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; +static const char *IP1 = "192.168.100.162"; +static const char *IP2 = "192.168.100.161"; +static const u16 UDP_PORT1 = 2020; +static const u16 UDP_PORT2 = 2121; + +static void __exit_with_error(int error, const char *file, const char *func, int line) +{ + ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, + strerror(error)); + ksft_exit_xfail(); +} + +#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) + +#define mode_string(test) (test)->ifobj_tx->xdp_flags & XDP_FLAGS_SKB_MODE ? "SKB" : "DRV" +#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "" + +static void report_failure(struct test_spec *test) +{ + if (test->fail) + return; + + ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test), + test->name); + test->fail = true; +} + +static void memset32_htonl(void *dest, u32 val, u32 size) +{ + u32 *ptr = (u32 *)dest; + int i; + + val = htonl(val); + + for (i = 0; i < (size & (~0x3)); i += 4) + ptr[i >> 2] = val; +} + +/* + * Fold a partial checksum + * This function code has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static __u16 csum_fold(__u32 csum) +{ + u32 sum = (__force u32)csum; + + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __u16)~sum; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. 
*/ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +{ + unsigned long long s = (__force u32)sum; + + s += (__force u32)saddr; + s += (__force u32)daddr; +#ifdef __BIG_ENDIAN__ + s += proto + len; +#else + s += (proto + len) << 8; +#endif + return (__force __u32)from64to32(s); +} + +/* + * This function has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +static u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) +{ + u32 csum = 0; + u32 cnt = 0; + + /* udp hdr and data */ + for (; cnt < len; cnt += 2) + csum += udp_pkt[cnt >> 1]; + + return csum_tcpudp_magic(saddr, daddr, len, proto, csum); +} + +static void gen_eth_hdr(struct ifobject *ifobject, struct ethhdr *eth_hdr) +{ + memcpy(eth_hdr->h_dest, ifobject->dst_mac, ETH_ALEN); + memcpy(eth_hdr->h_source, ifobject->src_mac, ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); +} + +static void gen_ip_hdr(struct ifobject *ifobject, struct iphdr *ip_hdr) +{ + ip_hdr->version = IP_PKT_VER; + ip_hdr->ihl = 0x5; + ip_hdr->tos = IP_PKT_TOS; + ip_hdr->tot_len = htons(IP_PKT_SIZE); + ip_hdr->id = 0; + ip_hdr->frag_off = 0; + ip_hdr->ttl = IPDEFTTL; + ip_hdr->protocol = IPPROTO_UDP; + ip_hdr->saddr = ifobject->src_ip; + ip_hdr->daddr = ifobject->dst_ip; + ip_hdr->check = 0; +} + +static void gen_udp_hdr(u32 payload, void *pkt, struct ifobject *ifobject, + struct udphdr *udp_hdr) +{ + udp_hdr->source = htons(ifobject->src_port); + udp_hdr->dest = htons(ifobject->dst_port); + udp_hdr->len = htons(UDP_PKT_SIZE); + memset32_htonl(pkt + PKT_HDR_SIZE, payload, UDP_PKT_DATA_SIZE); +} + +static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) +{ + udp_hdr->check = 0; + udp_hdr->check = + udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr); +} + +static int xsk_configure_umem(struct xsk_umem_info *umem, void *buffer, u64 size) +{ + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = umem->frame_size, + .frame_headroom = umem->frame_headroom, + .flags = XSK_UMEM__DEFAULT_FLAGS + }; + int ret; + + if (umem->unaligned_mode) + cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; + + ret = xsk_umem__create(&umem->umem, buffer, size, + &umem->fq, &umem->cq, &cfg); + if (ret) + return ret; + + umem->buffer = buffer; + return 0; +} + +static void enable_busy_poll(struct xsk_socket_info *xsk) +{ + int sock_opt; + + sock_opt = 1; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + exit_with_error(errno); + + sock_opt = 20; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + exit_with_error(errno); + + sock_opt = BATCH_SIZE; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + exit_with_error(errno); +} + +static int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, + struct ifobject *ifobject, bool shared) +{ + struct xsk_socket_config cfg = {}; + struct xsk_ring_cons *rxr; + struct xsk_ring_prod *txr; + + xsk->umem = umem; + 
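/*
 * Aside (editorial illustration, not part of this patch): cfg.libbpf_flags
 * just below sets XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD, so
 * xsk_socket__create() does not attach an XDP program itself; the Rx side
 * attaches one later in thread_common_ops(). A condensed sketch of that
 * two-step flow, using the calls as they appear in this file:
 *
 *	// step 1: create the socket without loading an XDP program
 *	xsk_socket__create(&xsk->xsk, ifname, 0, umem->umem, rxr, txr, &cfg);
 *	// step 2 (Rx side): attach the default program, then register the
 *	// socket in the XSKMAP so the program can redirect to it
 *	xsk_setup_xdp_prog_xsk(xsk->xsk, &xsk_map_fd);
 *	xsk_socket__update_xskmap(xsk->xsk, xsk_map_fd);
 */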
cfg.rx_size = xsk->rxqsize; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; + cfg.xdp_flags = ifobject->xdp_flags; + cfg.bind_flags = ifobject->bind_flags; + if (shared) + cfg.bind_flags |= XDP_SHARED_UMEM; + + txr = ifobject->tx_on ? &xsk->tx : NULL; + rxr = ifobject->rx_on ? &xsk->rx : NULL; + return xsk_socket__create(&xsk->xsk, ifobject->ifname, 0, umem->umem, rxr, txr, &cfg); +} + +static struct option long_options[] = { + {"interface", required_argument, 0, 'i'}, + {"busy-poll", no_argument, 0, 'b'}, + {"dump-pkts", no_argument, 0, 'D'}, + {"verbose", no_argument, 0, 'v'}, + {0, 0, 0, 0} +}; + +static void usage(const char *prog) +{ + const char *str = + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -i, --interface Use interface\n" + " -D, --dump-pkts Dump packets L2 - L5\n" + " -v, --verbose Verbose output\n" + " -b, --busy-poll Enable busy poll\n"; + + ksft_print_msg(str, prog); +} + +static int switch_namespace(const char *nsname) +{ + char fqns[26] = "/var/run/netns/"; + int nsfd; + + if (!nsname || strlen(nsname) == 0) + return -1; + + strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1); + nsfd = open(fqns, O_RDONLY); + + if (nsfd == -1) + exit_with_error(errno); + + if (setns(nsfd, 0) == -1) + exit_with_error(errno); + + print_verbose("NS switched: %s\n", nsname); + + return nsfd; +} + +static bool validate_interface(struct ifobject *ifobj) +{ + if (!strcmp(ifobj->ifname, "")) + return false; + return true; +} + +static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj_rx, int argc, + char **argv) +{ + struct ifobject *ifobj; + u32 interface_nb = 0; + int option_index, c; + + opterr = 0; + + for (;;) { + char *sptr, *token; + + c = getopt_long(argc, argv, "i:Dvb", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'i': + if (interface_nb == 0) + ifobj = ifobj_tx; + else if (interface_nb == 1) + ifobj = ifobj_rx; + else + break; + + sptr = strndupa(optarg, strlen(optarg)); + memcpy(ifobj->ifname, strsep(&sptr, ","), MAX_INTERFACE_NAME_CHARS); + token = strsep(&sptr, ","); + if (token) + memcpy(ifobj->nsname, token, MAX_INTERFACES_NAMESPACE_CHARS); + interface_nb++; + break; + case 'D': + opt_pkt_dump = true; + break; + case 'v': + opt_verbose = true; + break; + case 'b': + ifobj_tx->busy_poll = true; + ifobj_rx->busy_poll = true; + break; + default: + usage(basename(argv[0])); + ksft_exit_xfail(); + } + } +} + +static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx) +{ + u32 i, j; + + for (i = 0; i < MAX_INTERFACES; i++) { + struct ifobject *ifobj = i ? 
ifobj_rx : ifobj_tx; + + ifobj->xsk = &ifobj->xsk_arr[0]; + ifobj->use_poll = false; + ifobj->use_fill_ring = true; + ifobj->release_rx = true; + ifobj->pkt_stream = test->pkt_stream_default; + ifobj->validation_func = NULL; + + if (i == 0) { + ifobj->rx_on = false; + ifobj->tx_on = true; + } else { + ifobj->rx_on = true; + ifobj->tx_on = false; + } + + memset(ifobj->umem, 0, sizeof(*ifobj->umem)); + ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; + ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + + for (j = 0; j < MAX_SOCKETS; j++) { + memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); + ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + } + } + + test->ifobj_tx = ifobj_tx; + test->ifobj_rx = ifobj_rx; + test->current_step = 0; + test->total_steps = 1; + test->nb_sockets = 1; + test->fail = false; +} + +static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx, enum test_mode mode) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = test->pkt_stream_default; + memset(test, 0, sizeof(*test)); + test->pkt_stream_default = pkt_stream; + + for (i = 0; i < MAX_INTERFACES; i++) { + struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; + + ifobj->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + if (mode == TEST_MODE_SKB) + ifobj->xdp_flags |= XDP_FLAGS_SKB_MODE; + else + ifobj->xdp_flags |= XDP_FLAGS_DRV_MODE; + + ifobj->bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; + } + + __test_spec_init(test, ifobj_tx, ifobj_rx); +} + +static void test_spec_reset(struct test_spec *test) +{ + __test_spec_init(test, test->ifobj_tx, test->ifobj_rx); +} + +static void test_spec_set_name(struct test_spec *test, const char *name) +{ + strncpy(test->name, name, MAX_TEST_NAME_SIZE); +} + +static void pkt_stream_reset(struct pkt_stream *pkt_stream) +{ + if (pkt_stream) + pkt_stream->rx_pkt_nb = 0; +} + +static struct pkt *pkt_stream_get_pkt(struct pkt_stream *pkt_stream, u32 pkt_nb) +{ + if (pkt_nb >= pkt_stream->nb_pkts) + return NULL; + + return &pkt_stream->pkts[pkt_nb]; +} + +static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent) +{ + while (pkt_stream->rx_pkt_nb < pkt_stream->nb_pkts) { + (*pkts_sent)++; + if (pkt_stream->pkts[pkt_stream->rx_pkt_nb].valid) + return &pkt_stream->pkts[pkt_stream->rx_pkt_nb++]; + pkt_stream->rx_pkt_nb++; + } + return NULL; +} + +static void pkt_stream_delete(struct pkt_stream *pkt_stream) +{ + free(pkt_stream->pkts); + free(pkt_stream); +} + +static void pkt_stream_restore_default(struct test_spec *test) +{ + struct pkt_stream *tx_pkt_stream = test->ifobj_tx->pkt_stream; + + if (tx_pkt_stream != test->pkt_stream_default) { + pkt_stream_delete(test->ifobj_tx->pkt_stream); + test->ifobj_tx->pkt_stream = test->pkt_stream_default; + } + + if (test->ifobj_rx->pkt_stream != test->pkt_stream_default && + test->ifobj_rx->pkt_stream != tx_pkt_stream) + pkt_stream_delete(test->ifobj_rx->pkt_stream); + test->ifobj_rx->pkt_stream = test->pkt_stream_default; +} + +static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) +{ + struct pkt_stream *pkt_stream; + + pkt_stream = calloc(1, sizeof(*pkt_stream)); + if (!pkt_stream) + return NULL; + + pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); + if (!pkt_stream->pkts) { + free(pkt_stream); + return NULL; + } + + pkt_stream->nb_pkts = nb_pkts; + return pkt_stream; +} + +static void pkt_set(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr, u32 len) +{ + pkt->addr = addr; + pkt->len = len; + if (len > umem->frame_size - 
XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 2 - umem->frame_headroom) + pkt->valid = false; + else + pkt->valid = true; +} + +static struct pkt_stream *pkt_stream_generate(struct xsk_umem_info *umem, u32 nb_pkts, u32 pkt_len) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = __pkt_stream_alloc(nb_pkts); + if (!pkt_stream) + exit_with_error(ENOMEM); + + pkt_stream->nb_pkts = nb_pkts; + for (i = 0; i < nb_pkts; i++) { + pkt_set(umem, &pkt_stream->pkts[i], (i % umem->num_frames) * umem->frame_size, + pkt_len); + pkt_stream->pkts[i].payload = i; + } + + return pkt_stream; +} + +static struct pkt_stream *pkt_stream_clone(struct xsk_umem_info *umem, + struct pkt_stream *pkt_stream) +{ + return pkt_stream_generate(umem, pkt_stream->nb_pkts, pkt_stream->pkts[0].len); +} + +static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) +{ + struct pkt_stream *pkt_stream; + + pkt_stream = pkt_stream_generate(test->ifobj_tx->umem, nb_pkts, pkt_len); + test->ifobj_tx->pkt_stream = pkt_stream; + test->ifobj_rx->pkt_stream = pkt_stream; +} + +static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset) +{ + struct xsk_umem_info *umem = test->ifobj_tx->umem; + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = pkt_stream_clone(umem, test->pkt_stream_default); + for (i = 1; i < test->pkt_stream_default->nb_pkts; i += 2) + pkt_set(umem, &pkt_stream->pkts[i], + (i % umem->num_frames) * umem->frame_size + offset, pkt_len); + + test->ifobj_tx->pkt_stream = pkt_stream; + test->ifobj_rx->pkt_stream = pkt_stream; +} + +static void pkt_stream_receive_half(struct test_spec *test) +{ + struct xsk_umem_info *umem = test->ifobj_rx->umem; + struct pkt_stream *pkt_stream = test->ifobj_tx->pkt_stream; + u32 i; + + test->ifobj_rx->pkt_stream = pkt_stream_generate(umem, pkt_stream->nb_pkts, + pkt_stream->pkts[0].len); + pkt_stream = test->ifobj_rx->pkt_stream; + for (i = 1; i < pkt_stream->nb_pkts; i += 2) + pkt_stream->pkts[i].valid = false; +} + +static struct pkt *pkt_generate(struct ifobject *ifobject, u32 pkt_nb) +{ + struct pkt *pkt = pkt_stream_get_pkt(ifobject->pkt_stream, pkt_nb); + struct udphdr *udp_hdr; + struct ethhdr *eth_hdr; + struct iphdr *ip_hdr; + void *data; + + if (!pkt) + return NULL; + if (!pkt->valid || pkt->len < MIN_PKT_SIZE) + return pkt; + + data = xsk_umem__get_data(ifobject->umem->buffer, pkt->addr); + udp_hdr = (struct udphdr *)(data + sizeof(struct ethhdr) + sizeof(struct iphdr)); + ip_hdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + eth_hdr = (struct ethhdr *)data; + + gen_udp_hdr(pkt_nb, data, ifobject, udp_hdr); + gen_ip_hdr(ifobject, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr(ifobject, eth_hdr); + + return pkt; +} + +static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = __pkt_stream_alloc(nb_pkts); + if (!pkt_stream) + exit_with_error(ENOMEM); + + test->ifobj_tx->pkt_stream = pkt_stream; + test->ifobj_rx->pkt_stream = pkt_stream; + + for (i = 0; i < nb_pkts; i++) { + pkt_stream->pkts[i].addr = pkts[i].addr; + pkt_stream->pkts[i].len = pkts[i].len; + pkt_stream->pkts[i].payload = i; + pkt_stream->pkts[i].valid = pkts[i].valid; + } +} + +static void pkt_dump(void *pkt, u32 len) +{ + char s[INET_ADDRSTRLEN]; + struct ethhdr *ethhdr; + struct udphdr *udphdr; + struct iphdr *iphdr; + int payload, i; + + ethhdr = pkt; + iphdr = pkt + sizeof(*ethhdr); + udphdr = pkt + sizeof(*ethhdr) + sizeof(*iphdr); + + /*extract L2 
frame */ + fprintf(stdout, "DEBUG>> L2: dst mac: "); + for (i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ethhdr->h_dest[i]); + + fprintf(stdout, "\nDEBUG>> L2: src mac: "); + for (i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ethhdr->h_source[i]); + + /*extract L3 frame */ + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", + inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", + inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); + /*extract L4 frame */ + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); + /*extract L5 frame */ + payload = *((uint32_t *)(pkt + PKT_HDR_SIZE)); + + fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); + fprintf(stdout, "---------------------------------------\n"); +} + +static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream, u64 addr, + u64 pkt_stream_addr) +{ + u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom; + u32 offset = addr % umem->frame_size, expected_offset = 0; + + if (!pkt_stream->use_addr_for_fill) + pkt_stream_addr = 0; + + expected_offset += (pkt_stream_addr + headroom + XDP_PACKET_HEADROOM) % umem->frame_size; + + if (offset == expected_offset) + return true; + + ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, expected_offset, offset); + return false; +} + +static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) +{ + void *data = xsk_umem__get_data(buffer, addr); + struct iphdr *iphdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + + if (!pkt) { + ksft_print_msg("[%s] too many packets received\n", __func__); + return false; + } + + if (len < MIN_PKT_SIZE || pkt->len < MIN_PKT_SIZE) { + /* Do not try to verify packets that are smaller than minimum size. 
*/ + return true; + } + + if (pkt->len != len) { + ksft_print_msg("[%s] expected length [%d], got length [%d]\n", + __func__, pkt->len, len); + return false; + } + + if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { + u32 seqnum = ntohl(*((u32 *)(data + PKT_HDR_SIZE))); + + if (opt_pkt_dump) + pkt_dump(data, PKT_SIZE); + + if (pkt->payload != seqnum) { + ksft_print_msg("[%s] expected seqnum [%d], got seqnum [%d]\n", + __func__, pkt->payload, seqnum); + return false; + } + } else { + ksft_print_msg("Invalid frame received: "); + ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, + iphdr->tos); + return false; + } + + return true; +} + +static void kick_tx(struct xsk_socket_info *xsk) +{ + int ret; + + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + if (ret >= 0) + return; + if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) { + usleep(100); + return; + } + exit_with_error(errno); +} + +static void kick_rx(struct xsk_socket_info *xsk) +{ + int ret; + + ret = recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); + if (ret < 0) + exit_with_error(errno); +} + +static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) +{ + unsigned int rcvd; + u32 idx; + + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); + if (rcvd) { + if (rcvd > xsk->outstanding_tx) { + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); + ksft_print_msg("Last completion address: %llx\n", addr); + return TEST_FAILURE; + } + + xsk_ring_cons__release(&xsk->umem->cq, rcvd); + xsk->outstanding_tx -= rcvd; + } + + return TEST_PASS; +} + +static int receive_pkts(struct ifobject *ifobj, struct pollfd *fds) +{ + struct timeval tv_end, tv_now, tv_timeout = {RECV_TMOUT, 0}; + u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkts_sent = 0; + struct pkt_stream *pkt_stream = ifobj->pkt_stream; + struct xsk_socket_info *xsk = ifobj->xsk; + struct xsk_umem_info *umem = xsk->umem; + struct pkt *pkt; + int ret; + + ret = gettimeofday(&tv_now, NULL); + if (ret) + exit_with_error(errno); + timeradd(&tv_now, &tv_timeout, &tv_end); + + pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); + while (pkt) { + ret = gettimeofday(&tv_now, NULL); + if (ret) + exit_with_error(errno); + if (timercmp(&tv_now, &tv_end, >)) { + ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__); + return TEST_FAILURE; + } + + kick_rx(xsk); + + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(-ret); + } + continue; + } + + if (ifobj->use_fill_ring) { + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(-ret); + } + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + } + } + + for (i = 0; i < rcvd; i++) { + const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + u64 addr = desc->addr, orig; + + orig = xsk_umem__extract_addr(addr); + addr = xsk_umem__add_offset_to_addr(addr); + + if (!is_pkt_valid(pkt, umem->buffer, addr, desc->len) || + !is_offset_correct(umem, pkt_stream, addr, pkt->addr)) + return TEST_FAILURE; + + if (ifobj->use_fill_ring) + 
*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig; + pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); + } + + if (ifobj->use_fill_ring) + xsk_ring_prod__submit(&umem->fq, rcvd); + if (ifobj->release_rx) + xsk_ring_cons__release(&xsk->rx, rcvd); + + pthread_mutex_lock(&pacing_mutex); + pkts_in_flight -= pkts_sent; + if (pkts_in_flight < umem->num_frames) + pthread_cond_signal(&pacing_cond); + pthread_mutex_unlock(&pacing_mutex); + pkts_sent = 0; + } + + return TEST_PASS; +} + +static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb) +{ + struct xsk_socket_info *xsk = ifobject->xsk; + u32 i, idx, valid_pkts = 0; + + while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) + complete_pkts(xsk, BATCH_SIZE); + + for (i = 0; i < BATCH_SIZE; i++) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + struct pkt *pkt = pkt_generate(ifobject, *pkt_nb); + + if (!pkt) + break; + + tx_desc->addr = pkt->addr; + tx_desc->len = pkt->len; + (*pkt_nb)++; + if (pkt->valid) + valid_pkts++; + } + + pthread_mutex_lock(&pacing_mutex); + pkts_in_flight += valid_pkts; + /* pkts_in_flight might be negative if many invalid packets are sent */ + if (pkts_in_flight >= (int)(ifobject->umem->num_frames - BATCH_SIZE)) { + kick_tx(xsk); + pthread_cond_wait(&pacing_cond, &pacing_mutex); + } + pthread_mutex_unlock(&pacing_mutex); + + xsk_ring_prod__submit(&xsk->tx, i); + xsk->outstanding_tx += valid_pkts; + if (complete_pkts(xsk, i)) + return TEST_FAILURE; + + usleep(10); + return TEST_PASS; +} + +static void wait_for_tx_completion(struct xsk_socket_info *xsk) +{ + while (xsk->outstanding_tx) + complete_pkts(xsk, BATCH_SIZE); +} + +static int send_pkts(struct test_spec *test, struct ifobject *ifobject) +{ + struct pollfd fds = { }; + u32 pkt_cnt = 0; + + fds.fd = xsk_socket__fd(ifobject->xsk->xsk); + fds.events = POLLOUT; + + while (pkt_cnt < ifobject->pkt_stream->nb_pkts) { + int err; + + if (ifobject->use_poll) { + int ret; + + ret = poll(&fds, 1, POLL_TMOUT); + if (ret <= 0) + continue; + + if (!(fds.revents & POLLOUT)) + continue; + } + + err = __send_pkts(ifobject, &pkt_cnt); + if (err || test->fail) + return TEST_FAILURE; + } + + wait_for_tx_completion(ifobject->xsk); + return TEST_PASS; +} + +static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats) +{ + int fd = xsk_socket__fd(xsk), err; + socklen_t optlen, expected_len; + + optlen = sizeof(*stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen); + if (err) { + ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return TEST_FAILURE; + } + + expected_len = sizeof(struct xdp_statistics); + if (optlen != expected_len) { + ksft_print_msg("[%s] getsockopt optlen error. 
Expected: %u got: %u\n", + __func__, expected_len, optlen); + return TEST_FAILURE; + } + + return TEST_PASS; +} + +static int validate_rx_dropped(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + kick_rx(ifobject->xsk); + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + if (stats.rx_dropped == ifobject->pkt_stream->nb_pkts / 2) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_rx_full(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + usleep(1000); + kick_rx(ifobject->xsk); + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + if (stats.rx_ring_full) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_fill_empty(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + usleep(1000); + kick_rx(ifobject->xsk); + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + if (stats.rx_fill_ring_empty_descs) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_tx_invalid_descs(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + struct xdp_statistics stats; + socklen_t optlen; + int err; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) { + ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return TEST_FAILURE; + } + + if (stats.tx_invalid_descs != ifobject->pkt_stream->nb_pkts / 2) { + ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", + __func__, stats.tx_invalid_descs, ifobject->pkt_stream->nb_pkts); + return TEST_FAILURE; + } + + return TEST_PASS; +} + +static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) +{ + u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; + int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + int ret, ifindex; + void *bufs; + u32 i; + + ifobject->ns_fd = switch_namespace(ifobject->nsname); + + if (ifobject->umem->unaligned_mode) + mmap_flags |= MAP_HUGETLB; + + bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + if (bufs == MAP_FAILED) + exit_with_error(errno); + + ret = xsk_configure_umem(ifobject->umem, bufs, umem_sz); + if (ret) + exit_with_error(-ret); + + for (i = 0; i < test->nb_sockets; i++) { + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = xsk_configure_socket(&ifobject->xsk_arr[i], ifobject->umem, + ifobject, !!i); + if (!ret) + break; + + /* Retry if it fails as xsk_socket__create() is asynchronous */ + if (ctr >= SOCK_RECONF_CTR) + exit_with_error(-ret); + usleep(USLEEP_MAX); + } + + if (ifobject->busy_poll) + enable_busy_poll(&ifobject->xsk_arr[i]); + } + + ifobject->xsk = &ifobject->xsk_arr[0]; + + if (!ifobject->rx_on) + return; + + ifindex = if_nametoindex(ifobject->ifname); + if (!ifindex) + exit_with_error(errno); + + ret = xsk_setup_xdp_prog_xsk(ifobject->xsk->xsk, &ifobject->xsk_map_fd); + if (ret) + exit_with_error(-ret); + + ret = bpf_xdp_query(ifindex, ifobject->xdp_flags, &opts); + if (ret) + exit_with_error(-ret); + + if (ifobject->xdp_flags & XDP_FLAGS_SKB_MODE) { + if (opts.attach_mode != XDP_ATTACHED_SKB) { + ksft_print_msg("ERROR: [%s] XDP prog not in SKB mode\n"); + exit_with_error(-EINVAL); + } + } 
else if (ifobject->xdp_flags & XDP_FLAGS_DRV_MODE) { + if (opts.attach_mode != XDP_ATTACHED_DRV) { + ksft_print_msg("ERROR: [%s] XDP prog not in DRV mode\n"); + exit_with_error(-EINVAL); + } + } + + ret = xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd); + if (ret) + exit_with_error(-ret); +} + +static void testapp_cleanup_xsk_res(struct ifobject *ifobj) +{ + print_verbose("Destroying socket\n"); + xsk_socket__delete(ifobj->xsk->xsk); + munmap(ifobj->umem->buffer, ifobj->umem->num_frames * ifobj->umem->frame_size); + xsk_umem__delete(ifobj->umem->umem); +} + +static void *worker_testapp_validate_tx(void *arg) +{ + struct test_spec *test = (struct test_spec *)arg; + struct ifobject *ifobject = test->ifobj_tx; + int err; + + if (test->current_step == 1) + thread_common_ops(test, ifobject); + + print_verbose("Sending %d packets on interface %s\n", ifobject->pkt_stream->nb_pkts, + ifobject->ifname); + err = send_pkts(test, ifobject); + + if (!err && ifobject->validation_func) + err = ifobject->validation_func(ifobject); + if (err) + report_failure(test); + + if (test->total_steps == test->current_step || err) + testapp_cleanup_xsk_res(ifobject); + pthread_exit(NULL); +} + +static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream) +{ + u32 idx = 0, i, buffers_to_fill; + int ret; + + if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) + buffers_to_fill = umem->num_frames; + else + buffers_to_fill = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); + if (ret != buffers_to_fill) + exit_with_error(ENOSPC); + for (i = 0; i < buffers_to_fill; i++) { + u64 addr; + + if (pkt_stream->use_addr_for_fill) { + struct pkt *pkt = pkt_stream_get_pkt(pkt_stream, i); + + if (!pkt) + break; + addr = pkt->addr; + } else { + addr = i * umem->frame_size; + } + + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; + } + xsk_ring_prod__submit(&umem->fq, buffers_to_fill); +} + +static void *worker_testapp_validate_rx(void *arg) +{ + struct test_spec *test = (struct test_spec *)arg; + struct ifobject *ifobject = test->ifobj_rx; + struct pollfd fds = { }; + int err; + + if (test->current_step == 1) + thread_common_ops(test, ifobject); + + xsk_populate_fill_ring(ifobject->umem, ifobject->pkt_stream); + + fds.fd = xsk_socket__fd(ifobject->xsk->xsk); + fds.events = POLLIN; + + pthread_barrier_wait(&barr); + + err = receive_pkts(ifobject, &fds); + + if (!err && ifobject->validation_func) + err = ifobject->validation_func(ifobject); + if (err) { + report_failure(test); + pthread_mutex_lock(&pacing_mutex); + pthread_cond_signal(&pacing_cond); + pthread_mutex_unlock(&pacing_mutex); + } + + if (test->total_steps == test->current_step || err) + testapp_cleanup_xsk_res(ifobject); + pthread_exit(NULL); +} + +static int testapp_validate_traffic(struct test_spec *test) +{ + struct ifobject *ifobj_tx = test->ifobj_tx; + struct ifobject *ifobj_rx = test->ifobj_rx; + pthread_t t0, t1; + + if (pthread_barrier_init(&barr, NULL, 2)) + exit_with_error(errno); + + test->current_step++; + pkt_stream_reset(ifobj_rx->pkt_stream); + pkts_in_flight = 0; + + /*Spawn RX thread */ + pthread_create(&t0, NULL, ifobj_rx->func_ptr, test); + + pthread_barrier_wait(&barr); + if (pthread_barrier_destroy(&barr)) + exit_with_error(errno); + + /*Spawn TX thread */ + pthread_create(&t1, NULL, ifobj_tx->func_ptr, test); + + pthread_join(t1, NULL); + pthread_join(t0, NULL); + + return !!test->fail; +} + +static void testapp_teardown(struct test_spec 
*test) +{ + int i; + + test_spec_set_name(test, "TEARDOWN"); + for (i = 0; i < MAX_TEARDOWN_ITER; i++) { + if (testapp_validate_traffic(test)) + return; + test_spec_reset(test); + } +} + +static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2) +{ + thread_func_t tmp_func_ptr = (*ifobj1)->func_ptr; + struct ifobject *tmp_ifobj = (*ifobj1); + + (*ifobj1)->func_ptr = (*ifobj2)->func_ptr; + (*ifobj2)->func_ptr = tmp_func_ptr; + + *ifobj1 = *ifobj2; + *ifobj2 = tmp_ifobj; +} + +static void testapp_bidi(struct test_spec *test) +{ + test_spec_set_name(test, "BIDIRECTIONAL"); + test->ifobj_tx->rx_on = true; + test->ifobj_rx->tx_on = true; + test->total_steps = 2; + if (testapp_validate_traffic(test)) + return; + + print_verbose("Switching Tx/Rx vectors\n"); + swap_directions(&test->ifobj_rx, &test->ifobj_tx); + testapp_validate_traffic(test); + + swap_directions(&test->ifobj_rx, &test->ifobj_tx); +} + +static void swap_xsk_resources(struct ifobject *ifobj_tx, struct ifobject *ifobj_rx) +{ + int ret; + + xsk_socket__delete(ifobj_tx->xsk->xsk); + xsk_socket__delete(ifobj_rx->xsk->xsk); + ifobj_tx->xsk = &ifobj_tx->xsk_arr[1]; + ifobj_rx->xsk = &ifobj_rx->xsk_arr[1]; + + ret = xsk_socket__update_xskmap(ifobj_rx->xsk->xsk, ifobj_rx->xsk_map_fd); + if (ret) + exit_with_error(-ret); +} + +static void testapp_bpf_res(struct test_spec *test) +{ + test_spec_set_name(test, "BPF_RES"); + test->total_steps = 2; + test->nb_sockets = 2; + if (testapp_validate_traffic(test)) + return; + + swap_xsk_resources(test->ifobj_tx, test->ifobj_rx); + testapp_validate_traffic(test); +} + +static void testapp_headroom(struct test_spec *test) +{ + test_spec_set_name(test, "UMEM_HEADROOM"); + test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; + testapp_validate_traffic(test); +} + +static void testapp_stats_rx_dropped(struct test_spec *test) +{ + test_spec_set_name(test, "STAT_RX_DROPPED"); + test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - + XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3; + pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0); + pkt_stream_receive_half(test); + test->ifobj_rx->validation_func = validate_rx_dropped; + testapp_validate_traffic(test); +} + +static void testapp_stats_tx_invalid_descs(struct test_spec *test) +{ + test_spec_set_name(test, "STAT_TX_INVALID"); + pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0); + test->ifobj_tx->validation_func = validate_tx_invalid_descs; + testapp_validate_traffic(test); + + pkt_stream_restore_default(test); +} + +static void testapp_stats_rx_full(struct test_spec *test) +{ + test_spec_set_name(test, "STAT_RX_FULL"); + pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, PKT_SIZE); + test->ifobj_rx->pkt_stream = pkt_stream_generate(test->ifobj_rx->umem, + DEFAULT_UMEM_BUFFERS, PKT_SIZE); + if (!test->ifobj_rx->pkt_stream) + exit_with_error(ENOMEM); + + test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS; + test->ifobj_rx->release_rx = false; + test->ifobj_rx->validation_func = validate_rx_full; + testapp_validate_traffic(test); + + pkt_stream_restore_default(test); +} + +static void testapp_stats_fill_empty(struct test_spec *test) +{ + test_spec_set_name(test, "STAT_RX_FILL_EMPTY"); + pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, PKT_SIZE); + test->ifobj_rx->pkt_stream = pkt_stream_generate(test->ifobj_rx->umem, + DEFAULT_UMEM_BUFFERS, PKT_SIZE); + if (!test->ifobj_rx->pkt_stream) + exit_with_error(ENOMEM); + + test->ifobj_rx->use_fill_ring = false; 
+ test->ifobj_rx->validation_func = validate_fill_empty; + testapp_validate_traffic(test); + + pkt_stream_restore_default(test); +} + +/* Simple test */ +static bool hugepages_present(struct ifobject *ifobject) +{ + const size_t mmap_sz = 2 * ifobject->umem->num_frames * ifobject->umem->frame_size; + void *bufs; + + bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + if (bufs == MAP_FAILED) + return false; + + munmap(bufs, mmap_sz); + return true; +} + +static bool testapp_unaligned(struct test_spec *test) +{ + if (!hugepages_present(test->ifobj_tx)) { + ksft_test_result_skip("No 2M huge pages present.\n"); + return false; + } + + test_spec_set_name(test, "UNALIGNED_MODE"); + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + /* Let half of the packets straddle a buffer boundary */ + pkt_stream_replace_half(test, PKT_SIZE, -PKT_SIZE / 2); + test->ifobj_rx->pkt_stream->use_addr_for_fill = true; + testapp_validate_traffic(test); + + pkt_stream_restore_default(test); + return true; +} + +static void testapp_single_pkt(struct test_spec *test) +{ + struct pkt pkts[] = {{0x1000, PKT_SIZE, 0, true}}; + + pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); + testapp_validate_traffic(test); + pkt_stream_restore_default(test); +} + +static void testapp_invalid_desc(struct test_spec *test) +{ + struct pkt pkts[] = { + /* Zero packet address allowed */ + {0, PKT_SIZE, 0, true}, + /* Allowed packet */ + {0x1000, PKT_SIZE, 0, true}, + /* Straddling the start of umem */ + {-2, PKT_SIZE, 0, false}, + /* Packet too large */ + {0x2000, XSK_UMEM__INVALID_FRAME_SIZE, 0, false}, + /* After umem ends */ + {UMEM_SIZE, PKT_SIZE, 0, false}, + /* Straddle the end of umem */ + {UMEM_SIZE - PKT_SIZE / 2, PKT_SIZE, 0, false}, + /* Straddle a page boundary */ + {0x3000 - PKT_SIZE / 2, PKT_SIZE, 0, false}, + /* Straddle a 2K boundary */ + {0x3800 - PKT_SIZE / 2, PKT_SIZE, 0, true}, + /* Valid packet for sync so that something is received */ + {0x4000, PKT_SIZE, 0, true}}; + + if (test->ifobj_tx->umem->unaligned_mode) { + /* Crossing a page boundary allowed */ + pkts[6].valid = true; + } + if (test->ifobj_tx->umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) { + /* Crossing a 2K frame size boundary not allowed */ + pkts[7].valid = false; + } + + pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); + testapp_validate_traffic(test); + pkt_stream_restore_default(test); +} + +static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac, + const char *dst_ip, const char *src_ip, const u16 dst_port, + const u16 src_port, thread_func_t func_ptr) +{ + struct in_addr ip; + + memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN); + memcpy(ifobj->src_mac, src_mac, ETH_ALEN); + + inet_aton(dst_ip, &ip); + ifobj->dst_ip = ip.s_addr; + + inet_aton(src_ip, &ip); + ifobj->src_ip = ip.s_addr; + + ifobj->dst_port = dst_port; + ifobj->src_port = src_port; + + ifobj->func_ptr = func_ptr; +} + +static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_type type) +{ + switch (type) { + case TEST_TYPE_STATS_RX_DROPPED: + testapp_stats_rx_dropped(test); + break; + case TEST_TYPE_STATS_TX_INVALID_DESCS: + testapp_stats_tx_invalid_descs(test); + break; + case TEST_TYPE_STATS_RX_FULL: + testapp_stats_rx_full(test); + break; + case TEST_TYPE_STATS_FILL_EMPTY: + testapp_stats_fill_empty(test); + break; + case TEST_TYPE_TEARDOWN: + testapp_teardown(test); + break; + case TEST_TYPE_BIDI: +
testapp_bidi(test); + break; + case TEST_TYPE_BPF_RES: + testapp_bpf_res(test); + break; + case TEST_TYPE_RUN_TO_COMPLETION: + test_spec_set_name(test, "RUN_TO_COMPLETION"); + testapp_validate_traffic(test); + break; + case TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT: + test_spec_set_name(test, "RUN_TO_COMPLETION_SINGLE_PKT"); + testapp_single_pkt(test); + break; + case TEST_TYPE_RUN_TO_COMPLETION_2K_FRAME: + test_spec_set_name(test, "RUN_TO_COMPLETION_2K_FRAME_SIZE"); + test->ifobj_tx->umem->frame_size = 2048; + test->ifobj_rx->umem->frame_size = 2048; + pkt_stream_replace(test, DEFAULT_PKT_CNT, PKT_SIZE); + testapp_validate_traffic(test); + + pkt_stream_restore_default(test); + break; + case TEST_TYPE_POLL: + test->ifobj_tx->use_poll = true; + test->ifobj_rx->use_poll = true; + test_spec_set_name(test, "POLL"); + testapp_validate_traffic(test); + break; + case TEST_TYPE_ALIGNED_INV_DESC: + test_spec_set_name(test, "ALIGNED_INV_DESC"); + testapp_invalid_desc(test); + break; + case TEST_TYPE_ALIGNED_INV_DESC_2K_FRAME: + test_spec_set_name(test, "ALIGNED_INV_DESC_2K_FRAME_SIZE"); + test->ifobj_tx->umem->frame_size = 2048; + test->ifobj_rx->umem->frame_size = 2048; + testapp_invalid_desc(test); + break; + case TEST_TYPE_UNALIGNED_INV_DESC: + if (!hugepages_present(test->ifobj_tx)) { + ksft_test_result_skip("No 2M huge pages present.\n"); + return; + } + test_spec_set_name(test, "UNALIGNED_INV_DESC"); + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + testapp_invalid_desc(test); + break; + case TEST_TYPE_UNALIGNED: + if (!testapp_unaligned(test)) + return; + break; + case TEST_TYPE_HEADROOM: + testapp_headroom(test); + break; + default: + break; + } + + if (!test->fail) + ksft_test_result_pass("PASS: %s %s%s\n", mode_string(test), busy_poll_string(test), + test->name); +} + +static struct ifobject *ifobject_create(void) +{ + struct ifobject *ifobj; + + ifobj = calloc(1, sizeof(struct ifobject)); + if (!ifobj) + return NULL; + + ifobj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->xsk_arr)); + if (!ifobj->xsk_arr) + goto out_xsk_arr; + + ifobj->umem = calloc(1, sizeof(*ifobj->umem)); + if (!ifobj->umem) + goto out_umem; + + return ifobj; + +out_umem: + free(ifobj->xsk_arr); +out_xsk_arr: + free(ifobj); + return NULL; +} + +static void ifobject_delete(struct ifobject *ifobj) +{ + free(ifobj->umem); + free(ifobj->xsk_arr); + free(ifobj); +} + +int main(int argc, char **argv) +{ + struct pkt_stream *pkt_stream_default; + struct ifobject *ifobj_tx, *ifobj_rx; + u32 i, j, failed_tests = 0; + struct test_spec test; + + /* Use libbpf 1.0 API mode */ + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + ifobj_tx = ifobject_create(); + if (!ifobj_tx) + exit_with_error(ENOMEM); + ifobj_rx = ifobject_create(); + if (!ifobj_rx) + exit_with_error(ENOMEM); + + setlocale(LC_ALL, ""); + + parse_command_line(ifobj_tx, ifobj_rx, argc, argv); + + if (!validate_interface(ifobj_tx) || !validate_interface(ifobj_rx)) { + usage(basename(argv[0])); + ksft_exit_xfail(); + } + + init_iface(ifobj_tx, MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, + worker_testapp_validate_tx); + init_iface(ifobj_rx, MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, + worker_testapp_validate_rx); + + test_spec_init(&test, ifobj_tx, ifobj_rx, 0); + pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE); + if (!pkt_stream_default) + exit_with_error(ENOMEM); + test.pkt_stream_default = pkt_stream_default; + + ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); + + for (i = 0; i < TEST_MODE_MAX; 
i++) + for (j = 0; j < TEST_TYPE_MAX; j++) { + test_spec_init(&test, ifobj_tx, ifobj_rx, i); + run_pkt_test(&test, i, j); + usleep(USLEEP_MAX); + + if (test.fail) + failed_tests++; + } + + pkt_stream_delete(pkt_stream_default); + ifobject_delete(ifobj_tx); + ifobject_delete(ifobj_rx); + + if (failed_tests) + ksft_exit_fail(); + else + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h new file mode 100644 index 000000000000..3d17053f98e5 --- /dev/null +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2020 Intel Corporation. + */ + +#ifndef XSKXCEIVER_H_ +#define XSKXCEIVER_H_ + +#ifndef SOL_XDP +#define SOL_XDP 283 +#endif + +#ifndef AF_XDP +#define AF_XDP 44 +#endif + +#ifndef PF_XDP +#define PF_XDP AF_XDP +#endif + +#ifndef SO_BUSY_POLL_BUDGET +#define SO_BUSY_POLL_BUDGET 70 +#endif + +#ifndef SO_PREFER_BUSY_POLL +#define SO_PREFER_BUSY_POLL 69 +#endif + +#define TEST_PASS 0 +#define TEST_FAILURE -1 +#define MAX_INTERFACES 2 +#define MAX_INTERFACE_NAME_CHARS 7 +#define MAX_INTERFACES_NAMESPACE_CHARS 10 +#define MAX_SOCKETS 2 +#define MAX_TEST_NAME_SIZE 32 +#define MAX_TEARDOWN_ITER 10 +#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct udphdr)) +#define MIN_ETH_PKT_SIZE 64 +#define ETH_FCS_SIZE 4 +#define MIN_PKT_SIZE (MIN_ETH_PKT_SIZE - ETH_FCS_SIZE) +#define PKT_SIZE (MIN_PKT_SIZE) +#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) +#define IP_PKT_VER 0x4 +#define IP_PKT_TOS 0x9 +#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) +#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) +#define USLEEP_MAX 10000 +#define SOCK_RECONF_CTR 10 +#define BATCH_SIZE 64 +#define POLL_TMOUT 1000 +#define RECV_TMOUT 3 +#define DEFAULT_PKT_CNT (4 * 1024) +#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) +#define UMEM_SIZE (DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE) +#define RX_FULL_RXQSIZE 32 +#define UMEM_HEADROOM_TEST_SIZE 128 +#define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1) + +#define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) + +enum test_mode { + TEST_MODE_SKB, + TEST_MODE_DRV, + TEST_MODE_MAX +}; + +enum test_type { + TEST_TYPE_RUN_TO_COMPLETION, + TEST_TYPE_RUN_TO_COMPLETION_2K_FRAME, + TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT, + TEST_TYPE_POLL, + TEST_TYPE_UNALIGNED, + TEST_TYPE_ALIGNED_INV_DESC, + TEST_TYPE_ALIGNED_INV_DESC_2K_FRAME, + TEST_TYPE_UNALIGNED_INV_DESC, + TEST_TYPE_HEADROOM, + TEST_TYPE_TEARDOWN, + TEST_TYPE_BIDI, + TEST_TYPE_STATS_RX_DROPPED, + TEST_TYPE_STATS_TX_INVALID_DESCS, + TEST_TYPE_STATS_RX_FULL, + TEST_TYPE_STATS_FILL_EMPTY, + TEST_TYPE_BPF_RES, + TEST_TYPE_MAX +}; + +static bool opt_pkt_dump; +static bool opt_verbose; + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + u32 num_frames; + u32 frame_headroom; + void *buffer; + u32 frame_size; + bool unaligned_mode; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + u32 outstanding_tx; + u32 rxqsize; +}; + +struct pkt { + u64 addr; + u32 len; + u32 payload; + bool valid; +}; + +struct pkt_stream { + u32 nb_pkts; + u32 rx_pkt_nb; + struct pkt *pkts; + bool use_addr_for_fill; +}; + +struct ifobject; +typedef int (*validation_func_t)(struct ifobject *ifobj); +typedef void *(*thread_func_t)(void *arg); + +struct ifobject { + char ifname[MAX_INTERFACE_NAME_CHARS]; + char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; + struct xsk_socket_info *xsk; + struct xsk_socket_info *xsk_arr; + struct xsk_umem_info *umem; + thread_func_t func_ptr; + validation_func_t validation_func; + struct pkt_stream *pkt_stream; + int ns_fd; + int xsk_map_fd; + u32 dst_ip; + u32 src_ip; + u32 xdp_flags; + u32 bind_flags; + u16 src_port; + u16 dst_port; + bool tx_on; + bool rx_on; + bool use_poll; + bool busy_poll; + bool use_fill_ring; + bool release_rx; + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; +}; + +struct test_spec { + struct ifobject *ifobj_tx; + struct ifobject *ifobj_rx; + struct pkt_stream *pkt_stream_default; + u16 total_steps; + u16 current_step; + u16 nb_sockets; + bool fail; + char name[MAX_TEST_NAME_SIZE]; +}; + +pthread_barrier_t barr; +pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t pacing_cond = PTHREAD_COND_INITIALIZER; + +int pkts_in_flight; + +#endif /* XSKXCEIVER_H_ */ -- cgit v1.2.3 From aad53f17f0ad7485872d66fbcb53cc0c60e811f2 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Wed, 6 Jul 2022 21:28:54 +0000 Subject: bpftool: Add support for KIND_RESTRICT to gen min_core_btf command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adjusts bpftool's type marking logic, as used in conjunction with TYPE_EXISTS relocations, to correctly recognize and handle the RESTRICT BTF kind. 
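For illustration, the follow-up selftest patch in this series exercises exactly this case with a restrict-qualified pointer typedef; a minimal sketch of such a declaration is shown below (assuming the usual BTF encoding, where resolving the typedef walks through a KIND_RESTRICT node before reaching the pointer type):

  /* The restrict qualifier applies to the pointer itself, so the BTF
   * chain for this typedef is assumed to contain a KIND_RESTRICT entry,
   * which btfgen_mark_type() previously did not handle when marking
   * types for TYPE_EXISTS relocations.
   */
  typedef int *restrict restrict_ptr_typedef;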
Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Müller Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220623212205.2805002-1-deso@posteo.net/T/#m4c75205145701762a4b398e0cdb911d5b5305ffc Link: https://lore.kernel.org/bpf/20220706212855.1700615-2-deso@posteo.net --- tools/bpf/bpftool/gen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 3d35fbc5fe16..1cf53bb01936 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1762,6 +1762,7 @@ btfgen_mark_type(struct btfgen_info *info, unsigned int type_id, bool follow_poi } break; case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: case BTF_KIND_VOLATILE: case BTF_KIND_TYPEDEF: err = btfgen_mark_type(info, btf_type->type, follow_pointers); -- cgit v1.2.3 From 32e0d9b3104845e0b3f24d89033a17a317ba37f9 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Wed, 6 Jul 2022 21:28:55 +0000 Subject: selftests/bpf: Add test involving restrict type qualifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a type based test involving the restrict type qualifier to the BPF selftests. On the btfgen path, this will verify that bpftool correctly handles the corresponding RESTRICT BTF kind. Signed-off-by: Daniel Müller Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220706212855.1700615-3-deso@posteo.net --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 2 ++ tools/testing/selftests/bpf/progs/core_reloc_types.h | 8 ++++++-- tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index a6f65e2236f4..c8655ba9a88f 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -764,6 +764,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_int_exists = 1, .typedef_enum_exists = 1, .typedef_void_ptr_exists = 1, + .typedef_restrict_ptr_exists = 1, .typedef_func_proto_exists = 1, .typedef_arr_exists = 1, @@ -777,6 +778,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_int_matches = 1, .typedef_enum_matches = 1, .typedef_void_ptr_matches = 1, + .typedef_restrict_ptr_matches = 1, .typedef_func_proto_matches = 1, .typedef_arr_matches = 1, diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 7ef91d19c66e..fd8e1b4c6762 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -874,6 +874,7 @@ struct core_reloc_type_based_output { bool typedef_int_exists; bool typedef_enum_exists; bool typedef_void_ptr_exists; + bool typedef_restrict_ptr_exists; bool typedef_func_proto_exists; bool typedef_arr_exists; @@ -887,6 +888,7 @@ struct core_reloc_type_based_output { bool typedef_int_matches; bool typedef_enum_matches; bool typedef_void_ptr_matches; + bool typedef_restrict_ptr_matches; bool typedef_func_proto_matches; bool typedef_arr_matches; @@ -939,6 +941,7 @@ typedef int int_typedef; typedef enum { TYPEDEF_ENUM_VAL1, TYPEDEF_ENUM_VAL2 } enum_typedef; typedef void *void_ptr_typedef; +typedef int *restrict restrict_ptr_typedef; typedef int (*func_proto_typedef)(long); @@ -955,8 +958,9 @@ struct 
core_reloc_type_based { int_typedef f8; enum_typedef f9; void_ptr_typedef f10; - func_proto_typedef f11; - arr_typedef f12; + restrict_ptr_typedef f11; + func_proto_typedef f12; + arr_typedef f13; }; /* no types in target */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c index d95bc08b75c1..2edb4df35e6e 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c @@ -51,6 +51,7 @@ typedef int int_typedef; typedef enum { TYPEDEF_ENUM_VAL1, TYPEDEF_ENUM_VAL2 } enum_typedef; typedef void *void_ptr_typedef; +typedef int *restrict restrict_ptr_typedef; typedef int (*func_proto_typedef)(long); @@ -67,6 +68,7 @@ struct core_reloc_type_based_output { bool typedef_int_exists; bool typedef_enum_exists; bool typedef_void_ptr_exists; + bool typedef_restrict_ptr_exists; bool typedef_func_proto_exists; bool typedef_arr_exists; @@ -80,6 +82,7 @@ struct core_reloc_type_based_output { bool typedef_int_matches; bool typedef_enum_matches; bool typedef_void_ptr_matches; + bool typedef_restrict_ptr_matches; bool typedef_func_proto_matches; bool typedef_arr_matches; @@ -118,6 +121,7 @@ int test_core_type_based(void *ctx) out->typedef_int_exists = bpf_core_type_exists(int_typedef); out->typedef_enum_exists = bpf_core_type_exists(enum_typedef); out->typedef_void_ptr_exists = bpf_core_type_exists(void_ptr_typedef); + out->typedef_restrict_ptr_exists = bpf_core_type_exists(restrict_ptr_typedef); out->typedef_func_proto_exists = bpf_core_type_exists(func_proto_typedef); out->typedef_arr_exists = bpf_core_type_exists(arr_typedef); @@ -131,6 +135,7 @@ int test_core_type_based(void *ctx) out->typedef_int_matches = bpf_core_type_matches(int_typedef); out->typedef_enum_matches = bpf_core_type_matches(enum_typedef); out->typedef_void_ptr_matches = bpf_core_type_matches(void_ptr_typedef); + out->typedef_restrict_ptr_matches = bpf_core_type_matches(restrict_ptr_typedef); out->typedef_func_proto_matches = bpf_core_type_matches(func_proto_typedef); out->typedef_arr_matches = bpf_core_type_matches(arr_typedef); -- cgit v1.2.3 From b067fc284667ccbcfdb6c990568a3cfd8b73bb8a Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 8 Jul 2022 12:58:57 +0100 Subject: tools: usb: testusb: Add wireless speed reporting Add the ability to detect and print the USB speed as "wireless" if/when the kernel reports that speed. 
Signed-off-by: Bryan O'Donoghue Link: https://lore.kernel.org/r/20220708115859.2095714-2-bryan.odonoghue@linaro.org Signed-off-by: Greg Kroah-Hartman --- tools/usb/testusb.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/usb/testusb.c b/tools/usb/testusb.c index 474bae868b35..6f428f925384 100644 --- a/tools/usb/testusb.c +++ b/tools/usb/testusb.c @@ -96,7 +96,8 @@ struct usb_interface_descriptor { enum usb_device_speed { USB_SPEED_UNKNOWN = 0, /* enumerating */ USB_SPEED_LOW, USB_SPEED_FULL, /* usb 1.1 */ - USB_SPEED_HIGH /* usb 2.0 */ + USB_SPEED_HIGH, /* usb 2.0 */ + USB_SPEED_WIRELESS, /* wireless (usb 2.5) */ }; /*-------------------------------------------------------------------------*/ @@ -104,11 +105,12 @@ enum usb_device_speed { static char *speed (enum usb_device_speed s) { switch (s) { - case USB_SPEED_UNKNOWN: return "unknown"; - case USB_SPEED_LOW: return "low"; - case USB_SPEED_FULL: return "full"; - case USB_SPEED_HIGH: return "high"; - default: return "??"; + case USB_SPEED_UNKNOWN: return "unknown"; + case USB_SPEED_LOW: return "low"; + case USB_SPEED_FULL: return "full"; + case USB_SPEED_HIGH: return "high"; + case USB_SPEED_WIRELESS: return "wireless"; + default: return "??"; } } -- cgit v1.2.3 From 7fbcd99ebc0b9df7f4a9bbcfbf3e71c6bfddf9cc Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 8 Jul 2022 12:58:58 +0100 Subject: tools: usb: testusb: Add super speed reporting Add the ability to detect and print the USB speed as "super" if/when the kernel reports that speed. Signed-off-by: Bryan O'Donoghue Link: https://lore.kernel.org/r/20220708115859.2095714-3-bryan.odonoghue@linaro.org Signed-off-by: Greg Kroah-Hartman --- tools/usb/testusb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/usb/testusb.c b/tools/usb/testusb.c index 6f428f925384..d996a3819322 100644 --- a/tools/usb/testusb.c +++ b/tools/usb/testusb.c @@ -98,6 +98,7 @@ enum usb_device_speed { USB_SPEED_LOW, USB_SPEED_FULL, /* usb 1.1 */ USB_SPEED_HIGH, /* usb 2.0 */ USB_SPEED_WIRELESS, /* wireless (usb 2.5) */ + USB_SPEED_SUPER, /* usb 3.0 */ }; /*-------------------------------------------------------------------------*/ @@ -110,6 +111,7 @@ static char *speed (enum usb_device_speed s) case USB_SPEED_FULL: return "full"; case USB_SPEED_HIGH: return "high"; case USB_SPEED_WIRELESS: return "wireless"; + case USB_SPEED_SUPER: return "super"; default: return "??"; } } -- cgit v1.2.3 From 5ea5746dfa051eaca6c79a598ff7dc4ec911cada Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 8 Jul 2022 12:58:59 +0100 Subject: tools: usb: testusb: Add super-plus speed reporting Add the ability to detect and print the USB speed as "super-plus" if/when the kernel reports that speed. 
Signed-off-by: Bryan O'Donoghue Link: https://lore.kernel.org/r/20220708115859.2095714-4-bryan.odonoghue@linaro.org Signed-off-by: Greg Kroah-Hartman --- tools/usb/testusb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/usb/testusb.c b/tools/usb/testusb.c index d996a3819322..cbaa1b9fdeac 100644 --- a/tools/usb/testusb.c +++ b/tools/usb/testusb.c @@ -99,6 +99,7 @@ enum usb_device_speed { USB_SPEED_HIGH, /* usb 2.0 */ USB_SPEED_WIRELESS, /* wireless (usb 2.5) */ USB_SPEED_SUPER, /* usb 3.0 */ + USB_SPEED_SUPER_PLUS, /* usb 3.1 */ }; /*-------------------------------------------------------------------------*/ @@ -112,6 +113,7 @@ static char *speed (enum usb_device_speed s) case USB_SPEED_HIGH: return "high"; case USB_SPEED_WIRELESS: return "wireless"; case USB_SPEED_SUPER: return "super"; + case USB_SPEED_SUPER_PLUS: return "super-plus"; default: return "??"; } } -- cgit v1.2.3 From ff682226a353d88ffa5db9c2a9b945066776311e Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Thu, 30 Jun 2022 00:58:22 +0530 Subject: selftests/kcmp: Make the test output consistent and clear Make the output format of this test consistent. Currently the output is as follows: +TAP version 13 +1..1 +# selftests: kcmp: kcmp_test +# pid1: 45814 pid2: 45815 FD: 1 FILES: 1 VM: 2 FS: 1 SIGHAND: 2 + IO: 0 SYSVSEM: 0 INV: -1 +# PASS: 0 returned as expected +# PASS: 0 returned as expected +# PASS: 0 returned as expected +# # Planned tests != run tests (0 != 3) +# # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 +# # Planned tests != run tests (0 != 3) +# # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 +# # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0 +ok 1 selftests: kcmp: kcmp_test With this patch applied the output is as follows: +TAP version 13 +1..1 +# selftests: kcmp: kcmp_test +# TAP version 13 +# 1..3 +# pid1: 46330 pid2: 46331 FD: 1 FILES: 2 VM: 2 FS: 2 SIGHAND: 1 + IO: 0 SYSVSEM: 0 INV: -1 +# PASS: 0 returned as expected +# PASS: 0 returned as expected +# PASS: 0 returned as expected +# # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 +ok 1 selftests: kcmp: kcmp_test Signed-off-by: Gautam Menghani Signed-off-by: Shuah Khan --- tools/testing/selftests/kcmp/kcmp_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c index 6ea7b9f37a41..25110c7c0b3e 100644 --- a/tools/testing/selftests/kcmp/kcmp_test.c +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -88,6 +88,9 @@ int main(int argc, char **argv) int pid2 = getpid(); int ret; + ksft_print_header(); + ksft_set_plan(3); + fd2 = open(kpath, O_RDWR, 0644); if (fd2 < 0) { perror("Can't open file"); @@ -152,7 +155,6 @@ int main(int argc, char **argv) ksft_inc_pass_cnt(); } - ksft_print_cnts(); if (ret) ksft_exit_fail(); @@ -162,5 +164,5 @@ int main(int argc, char **argv) waitpid(pid2, &status, P_ALL); - return ksft_exit_pass(); + return 0; } -- cgit v1.2.3 From dbeb232726871352fc3e688ff5b02897f8cb0dc7 Mon Sep 17 00:00:00 2001 From: Soumya Negi Date: Fri, 1 Jul 2022 05:50:52 -0700 Subject: selftests: drivers/dma-buf: Improve message in selftest summary Selftest udmabuf for the dma-buf driver is skipped when the device file (e.g. /dev/udmabuf) for the DMA buffer cannot be opened i.e. no DMA buffer has been allocated. This patch adds clarity to the SKIP message. 
Signed-off-by: Soumya Negi Signed-off-by: Shuah Khan --- tools/testing/selftests/drivers/dma-buf/udmabuf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c index de1c4e6de0b2..c812080e304e 100644 --- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c +++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c @@ -32,7 +32,8 @@ int main(int argc, char *argv[]) devfd = open("/dev/udmabuf", O_RDWR); if (devfd < 0) { - printf("%s: [skip,no-udmabuf]\n", TEST_PREFIX); + printf("%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", + TEST_PREFIX); exit(77); } -- cgit v1.2.3 From 53b466219f89782b5c3d96d21f8765d1eadcce4e Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Fri, 8 Jul 2022 01:36:32 +0000 Subject: kunit: tool: make --kunitconfig repeatable, blindly concat It's come up a few times that it would be useful to have --kunitconfig be repeatable [1][2]. This could be done before with a bit of shell-fu, e.g. $ find fs/ -name '.kunitconfig' -exec cat {} + | \ ./tools/testing/kunit/kunit.py run --kunitconfig=/dev/stdin or equivalently: $ cat fs/ext4/.kunitconfig fs/fat/.kunitconfig | \ ./tools/testing/kunit/kunit.py run --kunitconfig=/dev/stdin But this can be fairly clunky to use in practice. And having explicit support in kunit.py opens the door to having more config fragments of interest, e.g. options for PCI on UML [1], UML coverage [2], variants of tests [3]. There's another argument to be made that users can just use multiple --kconfig_add's, but this gets very clunky very fast (e.g. [2]). Note: there's a big caveat here that some kconfig options might be incompatible. We try to give a clearish error message in the simple case where the same option appears multiple times with conflicting values, but more subtle ones (e.g. mutually exclusive options) will be potentially very confusing for the user. I don't know that we can do better. Note 2: if you want to combine a --kunitconfig with the default, you either have to specify the current build_dir > --kunitconfig=.kunit --kunitconfig=additional.config or > --kunitconfig=tools/testing/kunit/configs/default.config --kunitconfig=additional.config each of which has its downsides (the former depends on --build_dir, and doesn't work if you don't have a .kunitconfig yet), etc. Example with conflicting values: > $ ./tools/testing/kunit/kunit.py config --kunitconfig=lib/kunit --kunitconfig=/dev/stdin <<EOF > CONFIG_KUNIT_TEST=n > CONFIG_KUNIT=m > EOF > ...
> kunit_kernel.ConfigError: Multiple values specified for 2 options in kunitconfig: > CONFIG_KUNIT=y > vs from /dev/stdin > CONFIG_KUNIT=m > > CONFIG_KUNIT_TEST=y > vs from /dev/stdin > # CONFIG_KUNIT_TEST is not set [1] https://lists.freedesktop.org/archives/dri-devel/2022-June/357616.html [2] https://lore.kernel.org/linux-kselftest/CAFd5g45f3X3xF2vz2BkTHRqOC4uW6GZxtUUMaP5mwwbK8uNVtA@mail.gmail.com/ [3] https://lore.kernel.org/linux-kselftest/CANpmjNOdSy6DuO6CYZ4UxhGxqhjzx4tn0sJMbRqo2xRFv9kX6Q@mail.gmail.com/ Signed-off-by: Daniel Latypov Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 7 +++-- tools/testing/kunit/kunit_config.py | 11 ++++++- tools/testing/kunit/kunit_kernel.py | 38 +++++++++++++++-------- tools/testing/kunit/kunit_tool_test.py | 56 +++++++++++++++++++++++++++++----- 4 files changed, 89 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index b686126afb40..e132b0654029 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -293,8 +293,9 @@ def add_common_opts(parser) -> None: parser.add_argument('--kunitconfig', help='Path to Kconfig fragment that enables KUnit tests.' ' If given a directory, (e.g. lib/kunit), "/.kunitconfig" ' - 'will get automatically appended.', - metavar='PATH') + 'will get automatically appended. If repeated, the files ' + 'are blindly concatenated, which might not work in all cases.', + action='append', metavar='PATHS') parser.add_argument('--kconfig_add', help='Additional Kconfig options to append to the ' '.kunitconfig, e.g. CONFIG_KASAN=y. Can be repeated.', @@ -381,7 +382,7 @@ def tree_from_args(cli_args: argparse.Namespace) -> kunit_kernel.LinuxSourceTree qemu_args.extend(shlex.split(arg)) return kunit_kernel.LinuxSourceTree(cli_args.build_dir, - kunitconfig_path=cli_args.kunitconfig, + kunitconfig_paths=cli_args.kunitconfig, kconfig_add=cli_args.kconfig_add, arch=cli_args.arch, cross_compile=cli_args.cross_compile, diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py index 898b2a35eb29..48b5f34b2e5d 100644 --- a/tools/testing/kunit/kunit_config.py +++ b/tools/testing/kunit/kunit_config.py @@ -8,7 +8,7 @@ from dataclasses import dataclass import re -from typing import Dict, Iterable, Set +from typing import Dict, Iterable, List, Set, Tuple CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$' CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+|".*")$' @@ -60,6 +60,15 @@ class Kconfig: return False return True + def conflicting_options(self, other: 'Kconfig') -> List[Tuple[KconfigEntry, KconfigEntry]]: + diff = [] # type: List[Tuple[KconfigEntry, KconfigEntry]] + for name, value in self._entries.items(): + b = other._entries.get(name) + if b and value != b: + pair = (KconfigEntry(name, value), KconfigEntry(name, b)) + diff.append(pair) + return diff + def merge_in_entries(self, other: 'Kconfig') -> None: for name, value in other._entries.items(): self._entries[name] = value diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 94ec9f65ef19..56492090e28e 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -177,6 +177,30 @@ def get_kunitconfig_path(build_dir: str) -> str: def get_old_kunitconfig_path(build_dir: str) -> str: return os.path.join(build_dir, OLD_KUNITCONFIG_PATH) +def get_parsed_kunitconfig(build_dir: str, + kunitconfig_paths: Optional[List[str]]=None) -> kunit_config.Kconfig: + if not
kunitconfig_paths: + path = get_kunitconfig_path(build_dir) + if not os.path.exists(path): + shutil.copyfile(DEFAULT_KUNITCONFIG_PATH, path) + return kunit_config.parse_file(path) + + merged = kunit_config.Kconfig() + + for path in kunitconfig_paths: + if os.path.isdir(path): + path = os.path.join(path, KUNITCONFIG_PATH) + if not os.path.exists(path): + raise ConfigError(f'Specified kunitconfig ({path}) does not exist') + + partial = kunit_config.parse_file(path) + diff = merged.conflicting_options(partial) + if diff: + diff_str = '\n\n'.join(f'{a}\n vs from {path}\n{b}' for a, b in diff) + raise ConfigError(f'Multiple values specified for {len(diff)} options in kunitconfig:\n{diff_str}') + merged.merge_in_entries(partial) + return merged + def get_outfile_path(build_dir: str) -> str: return os.path.join(build_dir, OUTFILE_PATH) @@ -221,7 +245,7 @@ class LinuxSourceTree: def __init__( self, build_dir: str, - kunitconfig_path='', + kunitconfig_paths: Optional[List[str]]=None, kconfig_add: Optional[List[str]]=None, arch=None, cross_compile=None, @@ -238,17 +262,7 @@ class LinuxSourceTree: qemu_config_path = _default_qemu_config_path(self._arch) _, self._ops = _get_qemu_ops(qemu_config_path, extra_qemu_args, cross_compile) - if kunitconfig_path: - if os.path.isdir(kunitconfig_path): - kunitconfig_path = os.path.join(kunitconfig_path, KUNITCONFIG_PATH) - if not os.path.exists(kunitconfig_path): - raise ConfigError(f'Specified kunitconfig ({kunitconfig_path}) does not exist') - else: - kunitconfig_path = get_kunitconfig_path(build_dir) - if not os.path.exists(kunitconfig_path): - shutil.copyfile(DEFAULT_KUNITCONFIG_PATH, kunitconfig_path) - - self._kconfig = kunit_config.parse_file(kunitconfig_path) + self._kconfig = get_parsed_kunitconfig(build_dir, kunitconfig_paths) if kconfig_add: kconfig = kunit_config.parse_from_string('\n'.join(kconfig_add)) self._kconfig.merge_in_entries(kconfig) diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index e56544d58147..ad63d0d34f3f 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -356,17 +356,46 @@ class LinuxSourceTreeTest(unittest.TestCase): def test_invalid_kunitconfig(self): with self.assertRaisesRegex(kunit_kernel.ConfigError, 'nonexistent.* does not exist'): - kunit_kernel.LinuxSourceTree('', kunitconfig_path='/nonexistent_file') + kunit_kernel.LinuxSourceTree('', kunitconfig_paths=['/nonexistent_file']) def test_valid_kunitconfig(self): with tempfile.NamedTemporaryFile('wt') as kunitconfig: - kunit_kernel.LinuxSourceTree('', kunitconfig_path=kunitconfig.name) + kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[kunitconfig.name]) def test_dir_kunitconfig(self): with tempfile.TemporaryDirectory('') as dir: with open(os.path.join(dir, '.kunitconfig'), 'w'): pass - kunit_kernel.LinuxSourceTree('', kunitconfig_path=dir) + kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[dir]) + + def test_multiple_kunitconfig(self): + want_kconfig = kunit_config.Kconfig() + want_kconfig.add_entry('KUNIT', 'y') + want_kconfig.add_entry('KUNIT_TEST', 'm') + + with tempfile.TemporaryDirectory('') as dir: + other = os.path.join(dir, 'otherkunitconfig') + with open(os.path.join(dir, '.kunitconfig'), 'w') as f: + f.write('CONFIG_KUNIT=y') + with open(other, 'w') as f: + f.write('CONFIG_KUNIT_TEST=m') + pass + + tree = kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[dir, other]) + self.assertTrue(want_kconfig.is_subset_of(tree._kconfig), msg=tree._kconfig) + + + def 
test_multiple_kunitconfig_invalid(self): + with tempfile.TemporaryDirectory('') as dir: + other = os.path.join(dir, 'otherkunitconfig') + with open(os.path.join(dir, '.kunitconfig'), 'w') as f: + f.write('CONFIG_KUNIT=y') + with open(other, 'w') as f: + f.write('CONFIG_KUNIT=m') + + with self.assertRaisesRegex(kunit_kernel.ConfigError, '(?s)Multiple values.*CONFIG_KUNIT'): + kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[dir, other]) + def test_kconfig_add(self): want_kconfig = kunit_config.Kconfig() @@ -636,7 +665,7 @@ class KUnitMainTest(unittest.TestCase): kunit.main(['run', '--kunitconfig=mykunitconfig']) # Just verify that we parsed and initialized it correctly here. self.mock_linux_init.assert_called_once_with('.kunit', - kunitconfig_path='mykunitconfig', + kunitconfig_paths=['mykunitconfig'], kconfig_add=None, arch='um', cross_compile=None, @@ -647,18 +676,31 @@ class KUnitMainTest(unittest.TestCase): kunit.main(['config', '--kunitconfig=mykunitconfig']) # Just verify that we parsed and initialized it correctly here. self.mock_linux_init.assert_called_once_with('.kunit', - kunitconfig_path='mykunitconfig', + kunitconfig_paths=['mykunitconfig'], kconfig_add=None, arch='um', cross_compile=None, qemu_config_path=None, extra_qemu_args=[]) + @mock.patch.object(kunit_kernel, 'LinuxSourceTree') + def test_run_multiple_kunitconfig(self, mock_linux_init): + mock_linux_init.return_value = self.linux_source_mock + kunit.main(['run', '--kunitconfig=mykunitconfig', '--kunitconfig=other']) + # Just verify that we parsed and initialized it correctly here. + mock_linux_init.assert_called_once_with('.kunit', + kunitconfig_paths=['mykunitconfig', 'other'], + kconfig_add=None, + arch='um', + cross_compile=None, + qemu_config_path=None, + extra_qemu_args=[]) + def test_run_kconfig_add(self): kunit.main(['run', '--kconfig_add=CONFIG_KASAN=y', '--kconfig_add=CONFIG_KCSAN=y']) # Just verify that we parsed and initialized it correctly here. self.mock_linux_init.assert_called_once_with('.kunit', - kunitconfig_path=None, + kunitconfig_paths=None, kconfig_add=['CONFIG_KASAN=y', 'CONFIG_KCSAN=y'], arch='um', cross_compile=None, @@ -669,7 +711,7 @@ class KUnitMainTest(unittest.TestCase): kunit.main(['run', '--arch=x86_64', '--qemu_args', '-m 2048']) # Just verify that we parsed and initialized it correctly here. self.mock_linux_init.assert_called_once_with('.kunit', - kunitconfig_path=None, + kunitconfig_paths=None, kconfig_add=None, arch='x86_64', cross_compile=None, -- cgit v1.2.3 From 6fc3a8636a7b0f7dbd6d0a4e450e765dc17518d4 Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 8 Jul 2022 16:27:11 +0000 Subject: kunit: tool: Enable virtio/PCI by default on UML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several tests which depend on PCI, and hence need a bunch of extra options to run under UML. This makes it awkward to give configuration instructions (whether in documentation, or as part of a .kunitconfig file), as two separate, incompatible sets of config options are required for UML and "most other architectures". For non-UML architectures, it's possible to add default kconfig options via the qemu_config python files, but there's no equivalent for UML. Add a new tools/testing/kunit/configs/arch_uml.config file containing extra kconfig options to use on UML. 
Tested-by: José Expósito Reviewed-by: Daniel Latypov Signed-off-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Daniel Latypov Signed-off-by: Shuah Khan --- tools/testing/kunit/configs/arch_uml.config | 5 +++++ tools/testing/kunit/kunit_kernel.py | 14 ++++++++++---- tools/testing/kunit/kunit_tool_test.py | 12 ++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 tools/testing/kunit/configs/arch_uml.config (limited to 'tools') diff --git a/tools/testing/kunit/configs/arch_uml.config b/tools/testing/kunit/configs/arch_uml.config new file mode 100644 index 000000000000..e824ce43b05a --- /dev/null +++ b/tools/testing/kunit/configs/arch_uml.config @@ -0,0 +1,5 @@ +# Config options which are added to UML builds by default + +# Enable virtio/pci, as a lot of tests require it. +CONFIG_VIRTIO_UML=y +CONFIG_UML_PCI_OVER_VIRTIO=y diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 56492090e28e..f5c26ea89714 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -26,6 +26,7 @@ KUNITCONFIG_PATH = '.kunitconfig' OLD_KUNITCONFIG_PATH = 'last_used_kunitconfig' DEFAULT_KUNITCONFIG_PATH = 'tools/testing/kunit/configs/default.config' BROKEN_ALLCONFIG_PATH = 'tools/testing/kunit/configs/broken_on_uml.config' +UML_KCONFIG_PATH = 'tools/testing/kunit/configs/arch_uml.config' OUTFILE_PATH = 'test.log' ABS_TOOL_PATH = os.path.abspath(os.path.dirname(__file__)) QEMU_CONFIGS_DIR = os.path.join(ABS_TOOL_PATH, 'qemu_configs') @@ -53,7 +54,7 @@ class LinuxSourceTreeOperations: except subprocess.CalledProcessError as e: raise ConfigError(e.output.decode()) - def make_arch_qemuconfig(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: + def make_arch_config(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: return base_kunitconfig def make_allyesconfig(self, build_dir: str, make_options) -> None: @@ -109,7 +110,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations): self._kernel_command_line = qemu_arch_params.kernel_command_line + ' kunit_shutdown=reboot' self._extra_qemu_params = qemu_arch_params.extra_qemu_params - def make_arch_qemuconfig(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: + def make_arch_config(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: kconfig = kunit_config.parse_from_string(self._kconfig) kconfig.merge_in_entries(base_kunitconfig) return kconfig @@ -138,6 +139,11 @@ class LinuxSourceTreeOperationsUml(LinuxSourceTreeOperations): def __init__(self, cross_compile=None): super().__init__(linux_arch='um', cross_compile=cross_compile) + def make_arch_config(self, base_kunitconfig: kunit_config.Kconfig) -> kunit_config.Kconfig: + kconfig = kunit_config.parse_file(UML_KCONFIG_PATH) + kconfig.merge_in_entries(base_kunitconfig) + return kconfig + def make_allyesconfig(self, build_dir: str, make_options) -> None: stdout.print_with_timestamp( 'Enabling all CONFIGs for UML...') @@ -298,7 +304,7 @@ class LinuxSourceTree: if build_dir and not os.path.exists(build_dir): os.mkdir(build_dir) try: - self._kconfig = self._ops.make_arch_qemuconfig(self._kconfig) + self._kconfig = self._ops.make_arch_config(self._kconfig) self._kconfig.write_to_file(kconfig_path) self._ops.make_olddefconfig(build_dir, make_options) except ConfigError as e: @@ -329,7 +335,7 @@ class LinuxSourceTree: return self.build_config(build_dir, make_options) existing_kconfig = kunit_config.parse_file(kconfig_path) - self._kconfig = 
self._ops.make_arch_qemuconfig(self._kconfig) + self._kconfig = self._ops.make_arch_config(self._kconfig) if self._kconfig.is_subset_of(existing_kconfig) and not self._kunitconfig_changed(build_dir): return True diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index ad63d0d34f3f..446ac432d9a4 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -430,6 +430,10 @@ class LinuxSourceTreeTest(unittest.TestCase): f.write('CONFIG_KUNIT=y') tree = kunit_kernel.LinuxSourceTree(build_dir) + # Stub out the source tree operations, so we don't have + # the defaults for any given architecture get in the + # way. + tree._ops = kunit_kernel.LinuxSourceTreeOperations('none', None) mock_build_config = mock.patch.object(tree, 'build_config').start() # Should generate the .config @@ -447,6 +451,10 @@ class LinuxSourceTreeTest(unittest.TestCase): f.write('CONFIG_KUNIT=y\nCONFIG_KUNIT_TEST=y') tree = kunit_kernel.LinuxSourceTree(build_dir) + # Stub out the source tree operations, so we don't have + # the defaults for any given architecture get in the + # way. + tree._ops = kunit_kernel.LinuxSourceTreeOperations('none', None) mock_build_config = mock.patch.object(tree, 'build_config').start() self.assertTrue(tree.build_reconfig(build_dir, make_options=[])) @@ -463,6 +471,10 @@ class LinuxSourceTreeTest(unittest.TestCase): f.write('CONFIG_KUNIT=y\nCONFIG_KUNIT_TEST=y') tree = kunit_kernel.LinuxSourceTree(build_dir) + # Stub out the source tree operations, so we don't have + # the defaults for any given architecture get in the + # way. + tree._ops = kunit_kernel.LinuxSourceTreeOperations('none', None) mock_build_config = mock.patch.object(tree, 'build_config').start() # ... so we should trigger a call to build_config() -- cgit v1.2.3 From d1a6edecc1fddfb6ef92c8f720631d2c02bf2744 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 Jul 2022 10:50:00 -0700 Subject: bpf: Check attach_func_proto more carefully in check_return_code Syzkaller reports the following crash: RIP: 0010:check_return_code kernel/bpf/verifier.c:10575 [inline] RIP: 0010:do_check kernel/bpf/verifier.c:12346 [inline] RIP: 0010:do_check_common+0xb3d2/0xd250 kernel/bpf/verifier.c:14610 With the following reproducer: bpf$PROG_LOAD_XDP(0x5, &(0x7f00000004c0)={0xd, 0x3, &(0x7f0000000000)=ANY=[@ANYBLOB="1800000000000019000000000000000095"], &(0x7f0000000300)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, 0x2b, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0}, 0x80) Because we don't enforce expected_attach_type for XDP programs, we end up in hitting 'if (prog->expected_attach_type == BPF_LSM_CGROUP' part in check_return_code and follow up with testing `prog->aux->attach_func_proto->type`, but `prog->aux->attach_func_proto` is NULL. Add explicit prog_type check for the "Note, BPF_LSM_CGROUP that attach ..." condition. Also, don't skip return code check for LSM/STRUCT_OPS. The above actually brings an issue with existing selftest which tries to return EPERM from void inet_csk_clone. Fix the test (and move called_socket_clone to make sure it's not incremented in case of an error) and add a new one to explicitly verify this condition. 
Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: syzbot+5cc0730bd4b4d2c5f152@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220708175000.2603078-1-sdf@google.com --- kernel/bpf/verifier.c | 21 ++++++++++++++++----- tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c | 12 ++++++++++++ tools/testing/selftests/bpf/progs/lsm_cgroup.c | 12 ++++++------ .../selftests/bpf/progs/lsm_cgroup_nonvoid.c | 14 ++++++++++++++ 4 files changed, 48 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df3ec6b05f05..e3cf6194c24f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10444,11 +10444,21 @@ static int check_return_code(struct bpf_verifier_env *env) const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog && - (prog_type == BPF_PROG_TYPE_STRUCT_OPS || - prog_type == BPF_PROG_TYPE_LSM) && - !prog->aux->attach_func_proto->type) - return 0; + if (!is_subprog) { + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type == BPF_LSM_CGROUP) + /* See below, can be 0 or 0-1 depending on hook. */ + break; + fallthrough; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return 0; + break; + default: + break; + } + } /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. @@ -10572,6 +10582,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); if (prog->expected_attach_type == BPF_LSM_CGROUP && + prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c index c542d7e80a5b..1102e4f42d2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -6,6 +6,7 @@ #include #include "lsm_cgroup.skel.h" +#include "lsm_cgroup_nonvoid.skel.h" #include "cgroup_helpers.h" #include "network_helpers.h" @@ -293,9 +294,20 @@ close_cgroup: lsm_cgroup__destroy(skel); } +static void test_lsm_cgroup_nonvoid(void) +{ + struct lsm_cgroup_nonvoid *skel = NULL; + + skel = lsm_cgroup_nonvoid__open_and_load(); + ASSERT_NULL(skel, "open succeeds"); + lsm_cgroup_nonvoid__destroy(skel); +} + void test_lsm_cgroup(void) { if (test__start_subtest("functional")) test_lsm_cgroup_functional(); + if (test__start_subtest("nonvoid")) + test_lsm_cgroup_nonvoid(); btf__free(btf); } diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index 89f3b1e961a8..4f2d60b87b75 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -156,25 +156,25 @@ int BPF_PROG(socket_clone, struct sock *newsk, const struct request_sock *req) { int prio = 234; - called_socket_clone++; - if (!newsk) return 1; /* Accepted request sockets get a different priority. */ if (bpf_setsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) - return 0; /* EPERM */ + return 1; /* Make sure bpf_getsockopt is allowed and works. 
*/ prio = 0; if (bpf_getsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) - return 0; /* EPERM */ + return 1; if (prio != 234) - return 0; /* EPERM */ + return 1; /* Can access cgroup local storage. */ if (!test_local_storage()) - return 0; /* EPERM */ + return 1; + + called_socket_clone++; return 1; } diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c b/tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c new file mode 100644 index 000000000000..6cb0f161f417 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +SEC("lsm_cgroup/inet_csk_clone") +int BPF_PROG(nonvoid_socket_clone, struct sock *newsk, const struct request_sock *req) +{ + /* Can not return any errors from void LSM hooks. */ + return 0; +} -- cgit v1.2.3 From 18410251f66aee7e82234073ce6656ca20a732a9 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Wed, 6 Jul 2022 05:18:38 -0600 Subject: libbpf: Disable SEC pragma macro on GCC It seems the gcc preprocessor breaks with pragmas when surrounding __attribute__. Disable these pragmas on GCC due to upstream bugs; see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 Fixes errors like: error: expected identifier or '(' before '#pragma' 106 | SEC("cgroup/bind6") | ^~~ error: expected '=', ',', ';', 'asm' or '__attribute__' before '#pragma' 114 | char _license[] SEC("license") = "GPL"; | ^~~ Signed-off-by: James Hilliard Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220706111839.1247911-1-james.hilliard1@gmail.com --- tools/lib/bpf/bpf_helpers.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index fb04eaf367f1..7349b16b8e2f 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -22,12 +22,25 @@ * To allow use of SEC() with externs (e.g., for extern .maps declarations), * make sure __attribute__((unused)) doesn't trigger compilation warning. */ +#if __GNUC__ && !__clang__ + +/* + * Pragma macros are broken on GCC + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 + */ +#define SEC(name) __attribute__((section(name), used)) + +#else + #define SEC(name) \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ __attribute__((section(name), used)) \ _Pragma("GCC diagnostic pop") \ +#endif + /* Avoid 'linux/stddef.h' definition of '__always_inline'. */ #undef __always_inline #define __always_inline inline __attribute__((always_inline)) -- cgit v1.2.3 From 06cd4e9d5d969d713e6b710f0e5ca0bc8476ae41 Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Thu, 7 Jul 2022 21:19:31 +0000 Subject: bpf: Correctly propagate errors up from bpf_core_composites_match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change addresses a comment made earlier [0] about a missing return of an error when __bpf_core_types_match is invoked from bpf_core_composites_match, which could have led to us erroneously ignoring errors. Regarding the typedef name check pointed out in the same context, it is not actually an issue, because callers of the function perform a name check for the root type anyway.
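In miniature, the propagation fix looks like this hypothetical helper (not the libbpf code): the matcher returns <0 on error, 0 on no-match and >0 on match, and the sign must be checked at every call site or errors silently degrade into mismatches.

/* Three-way return contract, as used by __bpf_core_types_match(). */
static int find_matching_member(int (*match)(int local, int targ),
				int local, const int *targs, int n_targs)
{
	int i;

	for (i = 0; i < n_targs; i++) {
		int err = match(local, targs[i]);

		if (err < 0)	/* propagate, don't swallow */
			return err;
		if (err > 0)	/* found a match */
			return 1;
	}
	return 0;		/* genuinely no match */
}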
To make that more obvious, let's add comments to the function (similar to what we have for bpf_core_types_are_compat, which is called in pretty much the same context). [0]: https://lore.kernel.org/bpf/165708121449.4919.13204634393477172905.git-patchwork-notify@kernel.org/T/#m55141e8f8cfd2e8d97e65328fa04852870d01af6 Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220707211931.3415440-1-deso@posteo.net --- tools/lib/bpf/relo_core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index fe2533022aa9..c4b0e81ae293 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -1500,6 +1500,8 @@ static int bpf_core_composites_match(const struct btf *local_btf, const struct b err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, targ_m->type, behind_ptr, level - 1); + if (err < 0) + return err; if (err > 0) { matched = true; break; @@ -1512,7 +1514,8 @@ static int bpf_core_composites_match(const struct btf *local_btf, const struct b return 1; } -/* Check that two types "match". +/* Check that two types "match". This function assumes that root types were + * already checked for name match. * * The matching relation is defined as follows: * - modifiers and typedefs are stripped (and, hence, effectively ignored) @@ -1561,6 +1564,10 @@ recur: if (!local_t || !targ_t) return -EINVAL; + /* While the name check happens after typedefs are skipped, root-level + * typedefs would still be name-matched as that's the contract with + * callers. + */ if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) return 0; -- cgit v1.2.3 From 24bdfdd2ec343c94adf38fb5bc699f12e543713b Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 8 Jul 2022 16:03:19 +0300 Subject: selftests/bpf: Fix xdp_synproxy build failure if CONFIG_NF_CONNTRACK=m/n When CONFIG_NF_CONNTRACK=m, struct bpf_ct_opts and enum member BPF_F_CURRENT_NETNS are not exposed. This commit allows building the xdp_synproxy selftest in such cases. Note that nf_conntrack must be loaded before running the test if it's compiled as a module. This commit also allows this selftest to be successfully compiled when CONFIG_NF_CONNTRACK is disabled. One unused local variable of type struct bpf_ct_opts is also removed. Fixes: fb5cd0ce70d4 ("selftests/bpf: Add selftests for raw syncookie helpers") Reported-by: Yauheni Kaliuta Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220708130319.1016294-1-maximmi@nvidia.com --- .../selftests/bpf/progs/xdp_synproxy_kern.c | 24 +++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index 9fd62e94b5e6..736686e903f6 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -77,16 +77,30 @@ struct { __uint(max_entries, MAX_ALLOWED_PORTS); } allowed_ports SEC(".maps"); +/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in + * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally. 
+ */ + +struct bpf_ct_opts___local { + s32 netns_id; + s32 error; + u8 l4proto; + u8 dir; + u8 reserved[2]; +} __attribute__((preserve_access_index)); + +#define BPF_F_CURRENT_NETNS (-1) + extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, __u32 len_tuple, - struct bpf_ct_opts *opts, + struct bpf_ct_opts___local *opts, __u32 len_opts) __ksym; extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, u32 len_tuple, - struct bpf_ct_opts *opts, + struct bpf_ct_opts___local *opts, u32 len_opts) __ksym; extern void bpf_ct_release(struct nf_conn *ct) __ksym; @@ -393,7 +407,7 @@ static __always_inline int tcp_dissect(void *data, void *data_end, static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) { - struct bpf_ct_opts ct_lookup_opts = { + struct bpf_ct_opts___local ct_lookup_opts = { .netns_id = BPF_F_CURRENT_NETNS, .l4proto = IPPROTO_TCP, }; @@ -714,10 +728,6 @@ static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, struct header_pointers *hdr, bool xdp) { - struct bpf_ct_opts ct_lookup_opts = { - .netns_id = BPF_F_CURRENT_NETNS, - .l4proto = IPPROTO_TCP, - }; int ret; ret = tcp_dissect(data, data_end, hdr); -- cgit v1.2.3 From 437ac2592c09fcf27430db3ac878d2a566a58692 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Thu, 7 Jul 2022 15:55:31 +0200 Subject: selftests: forwarding: Install local_termination.sh When using the Makefile from tools/testing/selftests/net/forwarding/ all tests should be installed. Add local_termination.sh to the list of "to be installed tests" where it has been missing so far. Fixes: 90b9566aa5cd3f ("selftests: forwarding: add a test for local_termination.sh") Signed-off-by: Martin Blumenstingl Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 8f481218a492..46ca437af95a 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -37,6 +37,7 @@ TEST_PROGS = bridge_igmp.sh \ ipip_hier_gre_key.sh \ ipip_hier_gre_keys.sh \ ipip_hier_gre.sh \ + local_termination.sh \ loopback.sh \ mirror_gre_bound.sh \ mirror_gre_bridge_1d.sh \ -- cgit v1.2.3 From cfbba7b46aef631445909ab4c35b98c16e36074b Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Thu, 7 Jul 2022 15:55:32 +0200 Subject: selftests: forwarding: Install no_forwarding.sh When using the Makefile from tools/testing/selftests/net/forwarding/ all tests should be installed. Add no_forwarding.sh to the list of "to be installed tests" where it has been missing so far. 
Fixes: 476a4f05d9b83f ("selftests: forwarding: add a no_forwarding.sh test") Signed-off-by: Martin Blumenstingl Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 46ca437af95a..57b84e0c879e 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -53,6 +53,7 @@ TEST_PROGS = bridge_igmp.sh \ mirror_gre_vlan_bridge_1q.sh \ mirror_gre_vlan.sh \ mirror_vlan.sh \ + no_forwarding.sh \ pedit_dsfield.sh \ pedit_ip.sh \ pedit_l4port.sh \ -- cgit v1.2.3 From 4ad3278df6fe2b0852b00d5757fc2ccd8e92c26e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 8 Jul 2022 13:36:09 -0700 Subject: x86/speculation: Disable RRSBA behavior Some Intel processors may use alternate predictors for RETs on RSB-underflow. This condition may be vulnerable to Branch History Injection (BHI) and intramode-BTI. The kernel earlier added spectre_v2 mitigation modes (eIBRS+Retpolines, eIBRS+LFENCE, Retpolines) which protect indirect CALLs and JMPs against such attacks. However, on RSB-underflow, RET target prediction may fall back to alternate predictors. As a result, RET's predicted target may get influenced by branch history. A new MSR_IA32_SPEC_CTRL bit (RRSBA_DIS_S) controls this fallback behavior when in kernel mode. When set, RETs will not take predictions from alternate predictors, hence mitigating RETs as well. Support for this is enumerated by CPUID.7.2.EDX[RRSBA_CTRL] (bit 2). For spectre v2 mitigation, when a user selects a mitigation that protects indirect CALLs and JMPs against BHI and intramode-BTI, set RRSBA_DIS_S also to protect RETs for the RSB-underflow case. Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/msr-index.h | 9 +++++++++ arch/x86/kernel/cpu/bugs.c | 26 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/scattered.c | 1 + tools/arch/x86/include/asm/msr-index.h | 9 +++++++++ 5 files changed, 46 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7e8099fd5ec1..00f5227c8459 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -297,7 +297,7 @@ #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ #define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ -/* FREE!
(11*32+11) */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ #define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 7b411d9a6efb..cc615be27a54 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -51,6 +51,8 @@ #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ +#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -141,6 +143,13 @@ * bit available to control VERW * behavior. */ +#define ARCH_CAP_RRSBA BIT(19) /* + * Indicates RET may use predictors + * other than the RSB. With eIBRS + * enabled predictions in kernel mode + * are restricted to targets in + * kernel. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d26c57d98b98..0dd04713434b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1318,6 +1318,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + write_spec_ctrl_current(x86_spec_ctrl_base, true); + } +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1412,6 +1428,16 @@ static void __init spectre_v2_select_mitigation(void) break; } + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. 
+ */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index dbaa8326d6f2..fd44b54c90d5 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index d27e0581b777..2eab6a3a8a8c 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -51,6 +51,8 @@ #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ +#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -140,6 +142,13 @@ * bit available to control VERW * behavior. */ +#define ARCH_CAP_RRSBA BIT(19) /* + * Indicates RET may use predictors + * other than the RSB. With eIBRS + * enabled predictions in kernel mode + * are restricted to targets in + * kernel. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* -- cgit v1.2.3 From d0d9c8f2df60c6d1495201981f4b424628601113 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Jul 2022 10:14:09 -0700 Subject: selftests: mptcp: tweak simult_flows for debug kernels The mentioned test measures the transfer run-time to verify that the user-space program is able to use the full aggregate B/W. Even on (virtual) link-speed-bound tests, a debug kernel can slow down the transfer enough to cause sporadic test failures. Instead of unconditionally raising the maximum allowed run-time, tweak it when the running kernel is a debug one, and use a simple/rough heuristic to guess such scenarios. Note: this intentionally avoids looking for /boot/config- as the latter file is not always available in our reference CI environments. Signed-off-by: Paolo Abeni Co-developed-by: Mat Martineau Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- tools/testing/selftests/net/mptcp/simult_flows.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index f441ff7904fc..ffa13a957a36 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -12,6 +12,7 @@ timeout_test=$((timeout_poll * 2 + 1)) test_cnt=1 ret=0 bail=0 +slack=50 usage() { echo "Usage: $0 [ -b ] [ -c ] [ -d ]" @@ -52,6 +53,7 @@ setup() cout=$(mktemp) capout=$(mktemp) size=$((2 * 2048 * 4096)) + dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1 dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1 @@ -104,6 +106,16 @@ setup() ip -net "$ns3" route add default via dead:beef:3::2 ip netns exec "$ns3" ./pm_nl_ctl limits 1 1 + + # debug build can slow down measurably the test program + # we use quite tight time limit on the run-time, to ensure + # maximum B/W usage. + # Use kmemleak/lockdep/kasan/prove_locking presence as a rough + # estimate for this being a debug kernel and increase the + # maximum run-time accordingly. Observed run times for CI builds + # running selftests, including kbuild, were used to determine the + # amount of time to add. + grep -q ' kmemleak_init$\| lockdep_init$\| kasan_init$\| prove_locking$' /proc/kallsyms && slack=$((slack+550)) } # $1: ns, $2: port @@ -241,7 +253,7 @@ run_test() # mptcp_connect will do some sleeps to allow the mp_join handshake # completion (see mptcp_connect): 200ms on each side, add some slack - time=$((time + 450)) + time=$((time + 400 + slack)) printf "%-60s" "$msg" do_transfer $small $large $time -- cgit v1.2.3 From 97040cf9806e1163af29a3e24f2b8c5584aa44dd Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 8 Jul 2022 10:14:10 -0700 Subject: selftests: mptcp: userspace pm address tests This patch adds userspace pm tests support for mptcp_join.sh script. Add userspace pm add_addr and rm_addr test cases in userspace_tests(). Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 49 ++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a4406b7a8064..d889e7507cd9 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -455,6 +455,12 @@ wait_mpj() done } +kill_wait() +{ + kill $1 > /dev/null 2>&1 + wait $1 2>/dev/null +} + pm_nl_set_limits() { local ns=$1 @@ -654,6 +660,9 @@ do_transfer() local port=$((10000 + TEST_COUNT - 1)) local cappid + local userspace_pm=0 + local evts_ns1 + local evts_ns1_pid :> "$cout" :> "$sout" @@ -690,12 +699,24 @@ do_transfer() extra_args="-r ${speed:6}" fi + if [[ "${addr_nr_ns1}" = "userspace_"* ]]; then + userspace_pm=1 + addr_nr_ns1=${addr_nr_ns1:10} + fi + if [[ "${addr_nr_ns2}" = "fastclose_"* ]]; then # disconnect extra_args="$extra_args -I ${addr_nr_ns2:10}" addr_nr_ns2=0 fi + if [ $userspace_pm -eq 1 ]; then + evts_ns1=$(mktemp) + :> "$evts_ns1" + ip netns exec ${listener_ns} ./pm_nl_ctl events >> "$evts_ns1" 2>&1 & + evts_ns1_pid=$! 
+ fi + local local_addr if is_v6 "${connect_addr}"; then local_addr="::" @@ -748,6 +769,8 @@ do_transfer() if [ $addr_nr_ns1 -gt 0 ]; then local counter=2 local add_nr_ns1=${addr_nr_ns1} + local id=10 + local tk while [ $add_nr_ns1 -gt 0 ]; do local addr if is_v6 "${connect_addr}"; then @@ -755,9 +778,18 @@ do_transfer() else addr="10.0.$counter.1" fi - pm_nl_add_endpoint $ns1 $addr flags signal + if [ $userspace_pm -eq 0 ]; then + pm_nl_add_endpoint $ns1 $addr flags signal + else + tk=$(sed -n 's/.*\(token:\)\([[:digit:]]*\).*$/\2/p;q' "$evts_ns1") + ip netns exec ${listener_ns} ./pm_nl_ctl ann $addr token $tk id $id + sleep 1 + ip netns exec ${listener_ns} ./pm_nl_ctl rem token $tk id $id + fi + counter=$((counter + 1)) add_nr_ns1=$((add_nr_ns1 - 1)) + id=$((id + 1)) done elif [ $addr_nr_ns1 -lt 0 ]; then local rm_nr_ns1=$((-addr_nr_ns1)) @@ -890,6 +922,11 @@ do_transfer() kill $cappid fi + if [ $userspace_pm -eq 1 ]; then + kill_wait $evts_ns1_pid + rm -rf $evts_ns1 + fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ nstat | grep Tcp > /tmp/${listener_ns}.out NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ @@ -2810,6 +2847,16 @@ userspace_tests() chk_join_nr 0 0 0 chk_rm_nr 0 0 fi + + # userspace pm add & remove address + if reset "userspace pm add & remove address"; then + set_userspace_pm $ns1 + pm_nl_set_limits $ns2 1 1 + run_tests $ns1 $ns2 10.0.1.1 0 userspace_1 0 slow + chk_join_nr 1 1 1 + chk_add_nr 1 1 + chk_rm_nr 1 1 invert + fi } endpoint_tests() -- cgit v1.2.3 From 5e986ec468745e8d4582070d869a10eaae8ba56c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 8 Jul 2022 10:14:11 -0700 Subject: selftests: mptcp: userspace pm subflow tests This patch adds userspace pm subflow tests support for mptcp_join.sh script. Add userspace pm create subflow and destroy test cases in userspace_tests(). Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 39 +++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index d889e7507cd9..55efe2aafb84 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -663,6 +663,8 @@ do_transfer() local userspace_pm=0 local evts_ns1 local evts_ns1_pid + local evts_ns2 + local evts_ns2_pid :> "$cout" :> "$sout" @@ -708,13 +710,20 @@ do_transfer() # disconnect extra_args="$extra_args -I ${addr_nr_ns2:10}" addr_nr_ns2=0 + elif [[ "${addr_nr_ns2}" = "userspace_"* ]]; then + userspace_pm=1 + addr_nr_ns2=${addr_nr_ns2:10} fi if [ $userspace_pm -eq 1 ]; then evts_ns1=$(mktemp) + evts_ns2=$(mktemp) :> "$evts_ns1" + :> "$evts_ns2" ip netns exec ${listener_ns} ./pm_nl_ctl events >> "$evts_ns1" 2>&1 & evts_ns1_pid=$! + ip netns exec ${connector_ns} ./pm_nl_ctl events >> "$evts_ns2" 2>&1 & + evts_ns2_pid=$! 
fi local local_addr @@ -836,6 +845,8 @@ do_transfer() if [ $addr_nr_ns2 -gt 0 ]; then local add_nr_ns2=${addr_nr_ns2} local counter=3 + local id=20 + local tk da dp sp while [ $add_nr_ns2 -gt 0 ]; do local addr if is_v6 "${connect_addr}"; then @@ -843,9 +854,23 @@ do_transfer() else addr="10.0.$counter.2" fi - pm_nl_add_endpoint $ns2 $addr flags $flags + if [ $userspace_pm -eq 0 ]; then + pm_nl_add_endpoint $ns2 $addr flags $flags + else + tk=$(sed -n 's/.*\(token:\)\([[:digit:]]*\).*$/\2/p;q' "$evts_ns2") + da=$(sed -n 's/.*\(daddr4:\)\([0-9.]*\).*$/\2/p;q' "$evts_ns2") + dp=$(sed -n 's/.*\(dport:\)\([[:digit:]]*\).*$/\2/p;q' "$evts_ns2") + ip netns exec ${connector_ns} ./pm_nl_ctl csf lip $addr lid $id \ + rip $da rport $dp token $tk + sleep 1 + sp=$(grep "type:10" "$evts_ns2" | + sed -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q') + ip netns exec ${connector_ns} ./pm_nl_ctl dsf lip $addr lport $sp \ + rip $da rport $dp token $tk + fi counter=$((counter + 1)) add_nr_ns2=$((add_nr_ns2 - 1)) + id=$((id + 1)) done elif [ $addr_nr_ns2 -lt 0 ]; then local rm_nr_ns2=$((-addr_nr_ns2)) @@ -924,7 +949,8 @@ do_transfer() if [ $userspace_pm -eq 1 ]; then kill_wait $evts_ns1_pid - rm -rf $evts_ns1 + kill_wait $evts_ns2_pid + rm -rf $evts_ns1 $evts_ns2 fi NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ @@ -2857,6 +2883,15 @@ userspace_tests() chk_add_nr 1 1 chk_rm_nr 1 1 invert fi + + # userspace pm create destroy subflow + if reset "userspace pm create destroy subflow"; then + set_userspace_pm $ns2 + pm_nl_set_limits $ns1 0 1 + run_tests $ns1 $ns2 10.0.1.1 0 0 userspace_1 slow + chk_join_nr 1 1 1 + chk_rm_nr 0 1 + fi } endpoint_tests() -- cgit v1.2.3 From 507719cd7c0f0251fb2b772e73e4c35b7429587b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 8 Jul 2022 10:14:12 -0700 Subject: selftests: mptcp: avoid Terminated messages in userspace_pm There're some 'Terminated' messages in the output of userspace pm tests script after killing './pm_nl_ctl events' processes: Created network namespaces ns1, ns2 [OK] ./userspace_pm.sh: line 166: 13735 Terminated ip netns exec "$ns2" ./pm_nl_ctl events >> "$client_evts" 2>&1 ./userspace_pm.sh: line 172: 13737 Terminated ip netns exec "$ns1" ./pm_nl_ctl events >> "$server_evts" 2>&1 Established IPv4 MPTCP Connection ns2 => ns1 [OK] ./userspace_pm.sh: line 166: 13753 Terminated ip netns exec "$ns2" ./pm_nl_ctl events >> "$client_evts" 2>&1 ./userspace_pm.sh: line 172: 13755 Terminated ip netns exec "$ns1" ./pm_nl_ctl events >> "$server_evts" 2>&1 Established IPv6 MPTCP Connection ns2 => ns1 [OK] ADD_ADDR 10.0.2.2 (ns2) => ns1, invalid token [OK] This patch adds a helper kill_wait(), in it using 'wait $pid 2>/dev/null' commands after 'kill $pid' to avoid printing out these Terminated messages. Use this helper instead of using 'kill $pid'. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/mptcp/userspace_pm.sh | 40 +++++++++++++---------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index abe3d4ebe554..3229725b64b0 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -37,6 +37,12 @@ rndh=$(stdbuf -o0 -e0 printf %x "$sec")-$(mktemp -u XXXXXX) ns1="ns1-$rndh" ns2="ns2-$rndh" +kill_wait() +{ + kill $1 > /dev/null 2>&1 + wait $1 2>/dev/null +} + cleanup() { echo "cleanup" @@ -48,16 +54,16 @@ cleanup() kill -SIGUSR1 $client4_pid > /dev/null 2>&1 fi if [ $server4_pid -ne 0 ]; then - kill $server4_pid > /dev/null 2>&1 + kill_wait $server4_pid fi if [ $client6_pid -ne 0 ]; then kill -SIGUSR1 $client6_pid > /dev/null 2>&1 fi if [ $server6_pid -ne 0 ]; then - kill $server6_pid > /dev/null 2>&1 + kill_wait $server6_pid fi if [ $evts_pid -ne 0 ]; then - kill $evts_pid > /dev/null 2>&1 + kill_wait $evts_pid fi local netns for netns in "$ns1" "$ns2" ;do @@ -153,7 +159,7 @@ make_connection() sleep 1 # Capture client/server attributes from MPTCP connection netlink events - kill $client_evts_pid + kill_wait $client_evts_pid local client_token local client_port @@ -165,7 +171,7 @@ make_connection() client_port=$(sed --unbuffered -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q' "$client_evts") client_serverside=$(sed --unbuffered -n 's/.*\(server_side:\)\([[:digit:]]*\).*$/\2/p;q'\ "$client_evts") - kill $server_evts_pid + kill_wait $server_evts_pid server_token=$(sed --unbuffered -n 's/.*\(token:\)\([[:digit:]]*\).*$/\2/p;q' "$server_evts") server_serverside=$(sed --unbuffered -n 's/.*\(server_side:\)\([[:digit:]]*\).*$/\2/p;q'\ "$server_evts") @@ -286,7 +292,7 @@ test_announce() verify_announce_event "$evts" "$ANNOUNCED" "$server4_token" "10.0.2.2"\ "$client_addr_id" "$new4_port" - kill $evts_pid + kill_wait $evts_pid # Capture events on the network namespace running the client :>"$evts" @@ -321,7 +327,7 @@ test_announce() verify_announce_event "$evts" "$ANNOUNCED" "$client4_token" "10.0.2.1"\ "$server_addr_id" "$new4_port" - kill $evts_pid + kill_wait $evts_pid rm -f "$evts" } @@ -416,7 +422,7 @@ test_remove() sleep 0.5 verify_remove_event "$evts" "$REMOVED" "$server6_token" "$client_addr_id" - kill $evts_pid + kill_wait $evts_pid # Capture events on the network namespace running the client :>"$evts" @@ -449,7 +455,7 @@ test_remove() sleep 0.5 verify_remove_event "$evts" "$REMOVED" "$client6_token" "$server_addr_id" - kill $evts_pid + kill_wait $evts_pid rm -f "$evts" } @@ -553,7 +559,7 @@ test_subflows() "10.0.2.2" "$client4_port" "23" "$client_addr_id" "ns1" "ns2" # Delete the listener from the client ns, if one was created - kill $listener_pid > /dev/null 2>&1 + kill_wait $listener_pid local sport sport=$(sed --unbuffered -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q' "$evts") @@ -592,7 +598,7 @@ test_subflows() "$client_addr_id" "ns1" "ns2" # Delete the listener from the client ns, if one was created - kill $listener_pid > /dev/null 2>&1 + kill_wait $listener_pid sport=$(sed --unbuffered -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q' "$evts") @@ -631,7 +637,7 @@ test_subflows() "$client_addr_id" "ns1" "ns2" # Delete the listener from the client ns, if one was created - kill $listener_pid > /dev/null 2>&1 + kill_wait $listener_pid sport=$(sed --unbuffered -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q' "$evts") @@ -647,7 +653,7 @@ test_subflows() ip 
netns exec "$ns2" ./pm_nl_ctl rem id $client_addr_id token\ "$client4_token" > /dev/null 2>&1 - kill $evts_pid + kill_wait $evts_pid # Capture events on the network namespace running the client :>"$evts" @@ -674,7 +680,7 @@ test_subflows() "10.0.2.1" "$app4_port" "23" "$server_addr_id" "ns2" "ns1" # Delete the listener from the server ns, if one was created - kill $listener_pid> /dev/null 2>&1 + kill_wait $listener_pid sport=$(sed --unbuffered -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q' "$evts") @@ -713,7 +719,7 @@ test_subflows() "$server_addr_id" "ns2" "ns1" # Delete the listener from the server ns, if one was created - kill $listener_pid > /dev/null 2>&1 + kill_wait $listener_pid sport=$(sed --unbuffered -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q' "$evts") @@ -750,7 +756,7 @@ test_subflows() "10.0.2.2" "10.0.2.1" "$new4_port" "23" "$server_addr_id" "ns2" "ns1" # Delete the listener from the server ns, if one was created - kill $listener_pid > /dev/null 2>&1 + kill_wait $listener_pid sport=$(sed --unbuffered -n 's/.*\(sport:\)\([[:digit:]]*\).*$/\2/p;q' "$evts") @@ -766,7 +772,7 @@ test_subflows() ip netns exec "$ns1" ./pm_nl_ctl rem id $server_addr_id token\ "$server4_token" > /dev/null 2>&1 - kill $evts_pid + kill_wait $evts_pid rm -f "$evts" } -- cgit v1.2.3 From 65ebc6676d17847dde26dcbe50627a2fe198ca4b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 8 Jul 2022 10:14:13 -0700 Subject: selftests: mptcp: update pm_nl_ctl usage header The usage header of pm_nl_ctl command doesn't match with the context. So this patch adds the missing userspace PM keywords 'ann', 'rem', 'csf', 'dsf', 'events' and 'listen' in it. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index cb79f0719e3b..abddf4c63e79 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -31,7 +31,7 @@ static void syntax(char *argv[]) { - fprintf(stderr, "%s add|get|set|del|flush|dump|accept []\n", argv[0]); + fprintf(stderr, "%s add|ann|rem|csf|dsf|get|set|del|flush|dump|events|listen|accept []\n", argv[0]); fprintf(stderr, "\tadd [flags signal|subflow|backup|fullmesh] [id ] [dev ] \n"); fprintf(stderr, "\tann id token [port ] [dev ]\n"); fprintf(stderr, "\trem id token \n"); -- cgit v1.2.3 From e8b7ea58abbd2335734e67cccbd992e4735366bd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 18 May 2022 17:55:13 -0700 Subject: cxl/core: Rename ->decoder_range ->hpa_range In preparation for growing a ->dpa_range attribute for endpoint decoders, rename the current ->decoder_range to the more descriptive ->hpa_range. 
Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Reviewed-by: Adam Manzanares Link: https://lore.kernel.org/r/165603872867.551046.2170426227407458814.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- drivers/cxl/core/hdm.c | 2 +- drivers/cxl/core/port.c | 4 ++-- drivers/cxl/cxl.h | 4 ++-- tools/testing/cxl/test/cxl.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index ba3d2d959c71..5c070c93b07f 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -172,7 +172,7 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, return -ENXIO; } - cxld->decoder_range = (struct range) { + cxld->hpa_range = (struct range) { .start = base, .end = base + size - 1, }; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 7810d1a8369b..98bcbbd59a75 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -78,7 +78,7 @@ static ssize_t start_show(struct device *dev, struct device_attribute *attr, if (is_root_decoder(dev)) start = cxld->platform_res.start; else - start = cxld->decoder_range.start; + start = cxld->hpa_range.start; return sysfs_emit(buf, "%#llx\n", start); } @@ -93,7 +93,7 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, if (is_root_decoder(dev)) size = resource_size(&cxld->platform_res); else - size = range_len(&cxld->decoder_range); + size = range_len(&cxld->hpa_range); return sysfs_emit(buf, "%#llx\n", size); } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 6799b27c7db2..8256728cea8d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -198,7 +198,7 @@ enum cxl_decoder_type { * @dev: this decoder's device * @id: kernel device name id * @platform_res: address space resources considered by root decoder - * @decoder_range: address space resources considered by midlevel decoder + * @hpa_range: Host physical address range mapped by this decoder * @interleave_ways: number of cxl_dports in this decode * @interleave_granularity: data stride per dport * @target_type: accelerator vs expander (type2 vs type3) selector @@ -212,7 +212,7 @@ struct cxl_decoder { int id; union { struct resource platform_res; - struct range decoder_range; + struct range hpa_range; }; int interleave_ways; int interleave_granularity; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 431f2bddf6c8..7a08b025f2de 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -461,7 +461,7 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) return PTR_ERR(cxld); } - cxld->decoder_range = (struct range) { + cxld->hpa_range = (struct range) { .start = 0, .end = -1, }; -- cgit v1.2.3 From d3b75029f353c64e1e0e45ba5083cf8679d17f0a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 21 May 2022 15:35:29 -0700 Subject: cxl/mem: Convert partition-info to resources To date the per-device-partition DPA range information has only been used for enumeration purposes. In preparation for allocating regions from available DPA capacity, convert those ranges into DPA-type resource trees. With resources and the new add_dpa_res() helper some open coded end address calculations and debug prints can be cleaned. The 'cxlds->pmem_res' and 'cxlds->ram_res' resources are child resources of the total-device DPA space and they in turn will host DPA allocations from cxl_endpoint_decoder instances (tracked by cxled->dpa_res). 
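To make the parent/child layout concrete, here is a minimal sketch against the same kernel resource API; the 2G device size and the even ram/pmem split are made-up illustration values, not what the driver computes:

#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/sizes.h>

/* Whole-device DPA space with ram and pmem as child resources. */
static struct resource dpa_res;
static struct resource ram_res;
static struct resource pmem_res;

static int sketch_dpa_tree(void)
{
	dpa_res  = (struct resource)DEFINE_RES_MEM(0, SZ_2G);
	ram_res  = (struct resource)DEFINE_RES_MEM(0, SZ_1G);
	pmem_res = (struct resource)DEFINE_RES_MEM(SZ_1G, SZ_1G);

	/* request_resource() inserts a child into the parent range and
	 * fails on overlap, which is exactly the bookkeeping wanted for
	 * later per-decoder DPA allocations. */
	if (request_resource(&dpa_res, &ram_res))
		return -EBUSY;
	return request_resource(&dpa_res, &pmem_res);
}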
Cc: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165603878921.551046.8127845916514734142.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 78 +++++++++++++++++++++++++------------------- drivers/cxl/core/memdev.c | 4 +-- drivers/cxl/cxlmem.h | 10 +++--- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 55 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index cbf23beebebe..aeb833857412 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -773,15 +773,6 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) cxlds->partition_align_bytes = le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER; - dev_dbg(cxlds->dev, - "Identify Memory Device\n" - " total_bytes = %#llx\n" - " volatile_only_bytes = %#llx\n" - " persistent_only_bytes = %#llx\n" - " partition_align_bytes = %#llx\n", - cxlds->total_bytes, cxlds->volatile_only_bytes, - cxlds->persistent_only_bytes, cxlds->partition_align_bytes); - cxlds->lsa_size = le32_to_cpu(id.lsa_size); memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision)); @@ -789,42 +780,63 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL); -int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) +static int add_dpa_res(struct device *dev, struct resource *parent, + struct resource *res, resource_size_t start, + resource_size_t size, const char *type) { int rc; - if (cxlds->partition_align_bytes == 0) { - cxlds->ram_range.start = 0; - cxlds->ram_range.end = cxlds->volatile_only_bytes - 1; - cxlds->pmem_range.start = cxlds->volatile_only_bytes; - cxlds->pmem_range.end = cxlds->volatile_only_bytes + - cxlds->persistent_only_bytes - 1; + res->name = type; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM; + if (resource_size(res) == 0) { + dev_dbg(dev, "DPA(%s): no capacity\n", res->name); return 0; } - - rc = cxl_mem_get_partition_info(cxlds); + rc = request_resource(parent, res); if (rc) { - dev_err(cxlds->dev, "Failed to query partition information\n"); + dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name, + res, rc); return rc; } - dev_dbg(cxlds->dev, - "Get Partition Info\n" - " active_volatile_bytes = %#llx\n" - " active_persistent_bytes = %#llx\n" - " next_volatile_bytes = %#llx\n" - " next_persistent_bytes = %#llx\n", - cxlds->active_volatile_bytes, cxlds->active_persistent_bytes, - cxlds->next_volatile_bytes, cxlds->next_persistent_bytes); + dev_dbg(dev, "DPA(%s): %pr\n", res->name, res); - cxlds->ram_range.start = 0; - cxlds->ram_range.end = cxlds->active_volatile_bytes - 1; + return 0; +} - cxlds->pmem_range.start = cxlds->active_volatile_bytes; - cxlds->pmem_range.end = - cxlds->active_volatile_bytes + cxlds->active_persistent_bytes - 1; +int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) +{ + struct device *dev = cxlds->dev; + int rc; - return 0; + cxlds->dpa_res = + (struct resource)DEFINE_RES_MEM(0, cxlds->total_bytes); + + if (cxlds->partition_align_bytes == 0) { + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, + cxlds->volatile_only_bytes, "ram"); + if (rc) + return rc; + return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, + cxlds->volatile_only_bytes, + cxlds->persistent_only_bytes, "pmem"); + } + + rc = cxl_mem_get_partition_info(cxlds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; + } + + rc = 
add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, + cxlds->active_volatile_bytes, "ram"); + if (rc) + return rc; + return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, + cxlds->active_volatile_bytes, + cxlds->active_persistent_bytes, "pmem"); } EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL); diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index f7cdcd33504a..20ce488a7754 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -68,7 +68,7 @@ static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = range_len(&cxlds->ram_range); + unsigned long long len = resource_size(&cxlds->ram_res); return sysfs_emit(buf, "%#llx\n", len); } @@ -81,7 +81,7 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = range_len(&cxlds->pmem_range); + unsigned long long len = resource_size(&cxlds->pmem_res); return sysfs_emit(buf, "%#llx\n", len); } diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 7df0b053373a..a9609d40643f 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -178,8 +178,9 @@ struct cxl_endpoint_dvsec_info { * @firmware_version: Firmware version for the memory device. * @enabled_cmds: Hardware commands found enabled in CEL. * @exclusive_cmds: Commands that are kernel-internal only - * @pmem_range: Active Persistent memory capacity configuration - * @ram_range: Active Volatile memory capacity configuration + * @dpa_res: Overall DPA resource tree for the device + * @pmem_res: Active Persistent memory capacity configuration + * @ram_res: Active Volatile memory capacity configuration * @total_bytes: sum of all possible capacities * @volatile_only_bytes: hard volatile capacity * @persistent_only_bytes: hard persistent capacity @@ -209,8 +210,9 @@ struct cxl_dev_state { DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); - struct range pmem_range; - struct range ram_range; + struct resource dpa_res; + struct resource pmem_res; + struct resource ram_res; u64 total_bytes; u64 volatile_only_bytes; u64 persistent_only_bytes; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 5a0ae46d4989..eeff9599acda 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -454,7 +454,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); - if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM)) + if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd); return rc; diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 6b9239b2afd4..b81c90715fe8 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -282,7 +282,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); - if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM)) + if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) rc = devm_cxl_add_nvdimm(dev, cxlmd); return 0; -- cgit v1.2.3 From b2f3b74e1072ab7c03833f265bdb26dafa92e078 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 22 Jun 2022 18:02:37 -0700 Subject: tools/testing/cxl: Move cxl_test resources to the top of memory A 
recent QEMU upgrade resulted in collisions between QEMU's chosen location for PCI MMIO and cxl_test's fake address location for emulated CXL purposes. This was great for testing resource collisions, but not so great for continuing to test the nominal cases. Move cxl_test to the top-of-memory where it is less likely to collide with other resources. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165603886021.551046.12395967874222763381.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 7a08b025f2de..27dba24b5c99 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -619,7 +619,8 @@ static __init int cxl_test_init(void) goto err_gen_pool_create; } - rc = gen_pool_add(cxl_mock_pool, SZ_512G, SZ_64G, NUMA_NO_NODE); + rc = gen_pool_add(cxl_mock_pool, iomem_resource.end + 1 - SZ_64G, + SZ_64G, NUMA_NO_NODE); if (rc) goto err_gen_pool_add; -- cgit v1.2.3 From 855c90d30575f95c5a1fb72f9294a9f75dae20c2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 May 2022 08:56:58 -0700 Subject: tools/testing/cxl: Expand CFMWS windows For the x2 host-bridge interleave windows, allow for a x8-endpoint-interleave configuration per memory-type with each device contributing the minimum 256MB extent. Similarly, for the x1 host-bridge interleave windows, allow for a x4-endpoint-interleave configuration per memory-type. Bump up the number of decoders per-port to support hosting 8 regions. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165603886721.551046.8682583835505795210.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 27dba24b5c99..b38a3485505c 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -14,7 +14,7 @@ #define NR_CXL_HOST_BRIDGES 2 #define NR_CXL_ROOT_PORTS 2 #define NR_CXL_SWITCH_PORTS 2 -#define NR_CXL_PORT_DECODERS 2 +#define NR_CXL_PORT_DECODERS 8 static struct platform_device *cxl_acpi; static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES]; @@ -118,7 +118,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = 0, - .window_size = SZ_256M, + .window_size = SZ_256M * 4UL, }, .target = { 0 }, }, @@ -133,7 +133,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = 1, - .window_size = SZ_256M * 2, + .window_size = SZ_256M * 8UL, }, .target = { 0, 1, }, }, @@ -148,7 +148,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = 2, - .window_size = SZ_256M, + .window_size = SZ_256M * 4UL, }, .target = { 0 }, }, @@ -163,7 +163,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = 3, - .window_size = SZ_256M * 2, + .window_size = SZ_256M * 8UL, }, .target = { 0, 1, }, }, -- cgit v1.2.3 From e7ad1bf683295024e7a4e09e41015989a004a0f5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 May 2022 23:26:11 -0700 Subject: tools/testing/cxl: Add partition support In support of testing DPA allocation mechanisms in the CXL core, the cxl_test environment needs to support establishing and retrieving the 'pmem 
partition boundary. Replace the platform_device_add_resources() method for delineating DPA within an endpoint with an emulated DEV_SIZE amount of partitionable capacity. Set DEV_SIZE such that an endpoint has enough capacity to simultaneously participate in 8 distinct regions. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165603887411.551046.13234212587991192347.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 7 +----- drivers/cxl/cxlmem.h | 7 ++++++ tools/testing/cxl/test/cxl.c | 40 +--------------------------------- tools/testing/cxl/test/mem.c | 51 +++++++++++++++++++++++--------------------- 4 files changed, 36 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 32a75092e8a3..16176b9278b4 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -718,12 +718,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); */ static int cxl_mem_get_partition_info(struct cxl_dev_state *cxlds) { - struct cxl_mbox_get_partition_info { - __le64 active_volatile_cap; - __le64 active_persistent_cap; - __le64 next_volatile_cap; - __le64 next_persistent_cap; - } __packed pi; + struct cxl_mbox_get_partition_info pi; int rc; rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_PARTITION_INFO, NULL, 0, diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 4fa8f9391b3a..c6d6f57856cc 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -301,6 +301,13 @@ struct cxl_mbox_identify { u8 qos_telemetry_caps; } __packed; +struct cxl_mbox_get_partition_info { + __le64 active_volatile_cap; + __le64 active_persistent_cap; + __le64 next_volatile_cap; + __le64 next_persistent_cap; +} __packed; + struct cxl_mbox_get_lsa { __le32 offset; __le32 length; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index b38a3485505c..91444279f9a2 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -569,44 +569,6 @@ static void mock_companion(struct acpi_device *adev, struct device *dev) #define SZ_512G (SZ_64G * 8) #endif -static struct platform_device *alloc_memdev(int id) -{ - struct resource res[] = { - [0] = { - .flags = IORESOURCE_MEM, - }, - [1] = { - .flags = IORESOURCE_MEM, - .desc = IORES_DESC_PERSISTENT_MEMORY, - }, - }; - struct platform_device *pdev; - int i, rc; - - for (i = 0; i < ARRAY_SIZE(res); i++) { - struct cxl_mock_res *r = alloc_mock_res(SZ_256M); - - if (!r) - return NULL; - res[i].start = r->range.start; - res[i].end = r->range.end; - } - - pdev = platform_device_alloc("cxl_mem", id); - if (!pdev) - return NULL; - - rc = platform_device_add_resources(pdev, res, ARRAY_SIZE(res)); - if (rc) - goto err; - - return pdev; - -err: - platform_device_put(pdev); - return NULL; -} - static __init int cxl_test_init(void) { int rc, i; @@ -709,7 +671,7 @@ static __init int cxl_test_init(void) struct platform_device *dport = cxl_switch_dport[i]; struct platform_device *pdev; - pdev = alloc_memdev(i); + pdev = platform_device_alloc("cxl_mem", i); if (!pdev) goto err_mem; pdev->dev.parent = &dport->dev; diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index b81c90715fe8..aa2df3a15051 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -10,6 +10,7 @@ #include #define LSA_SIZE SZ_128K +#define DEV_SIZE SZ_2G #define EFFECT(x) (1U << x) static struct cxl_cel_entry mock_cel[] = { @@ -25,6 +26,10 @@ static struct cxl_cel_entry mock_cel[] = { .opcode = cpu_to_le16(CXL_MBOX_OP_GET_LSA), .effect = 
cpu_to_le16(0), }, + { + .opcode = cpu_to_le16(CXL_MBOX_OP_GET_PARTITION_INFO), + .effect = cpu_to_le16(0), + }, { .opcode = cpu_to_le16(CXL_MBOX_OP_SET_LSA), .effect = cpu_to_le16(EFFECT(1) | EFFECT(2)), @@ -97,42 +102,37 @@ static int mock_get_log(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd) static int mock_id(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd) { - struct platform_device *pdev = to_platform_device(cxlds->dev); struct cxl_mbox_identify id = { .fw_revision = { "mock fw v1 " }, .lsa_size = cpu_to_le32(LSA_SIZE), - /* FIXME: Add partition support */ - .partition_align = cpu_to_le64(0), + .partition_align = + cpu_to_le64(SZ_256M / CXL_CAPACITY_MULTIPLIER), + .total_capacity = + cpu_to_le64(DEV_SIZE / CXL_CAPACITY_MULTIPLIER), }; - u64 capacity = 0; - int i; if (cmd->size_out < sizeof(id)) return -EINVAL; - for (i = 0; i < 2; i++) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, i); - if (!res) - break; - - capacity += resource_size(res) / CXL_CAPACITY_MULTIPLIER; + memcpy(cmd->payload_out, &id, sizeof(id)); - if (le64_to_cpu(id.partition_align)) - continue; + return 0; +} - if (res->desc == IORES_DESC_PERSISTENT_MEMORY) - id.persistent_capacity = cpu_to_le64( - resource_size(res) / CXL_CAPACITY_MULTIPLIER); - else - id.volatile_capacity = cpu_to_le64( - resource_size(res) / CXL_CAPACITY_MULTIPLIER); - } +static int mock_partition_info(struct cxl_dev_state *cxlds, + struct cxl_mbox_cmd *cmd) +{ + struct cxl_mbox_get_partition_info pi = { + .active_volatile_cap = + cpu_to_le64(DEV_SIZE / 2 / CXL_CAPACITY_MULTIPLIER), + .active_persistent_cap = + cpu_to_le64(DEV_SIZE / 2 / CXL_CAPACITY_MULTIPLIER), + }; - id.total_capacity = cpu_to_le64(capacity); + if (cmd->size_out < sizeof(pi)) + return -EINVAL; - memcpy(cmd->payload_out, &id, sizeof(id)); + memcpy(cmd->payload_out, &pi, sizeof(pi)); return 0; } @@ -221,6 +221,9 @@ static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd * case CXL_MBOX_OP_GET_LSA: rc = mock_get_lsa(cxlds, cmd); break; + case CXL_MBOX_OP_GET_PARTITION_INFO: + rc = mock_partition_info(cxlds, cmd); + break; case CXL_MBOX_OP_SET_LSA: rc = mock_set_lsa(cxlds, cmd); break; -- cgit v1.2.3 From 08f8d040a11d539481b9aee7b482430561281a28 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 May 2022 10:48:59 -0700 Subject: tools/testing/cxl: Fix decoder default state The 'enabled' state is reserved for committed decoders. By default, cxl_test decoders are uncommitted at init time. 
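Spelled out, the invariant being restored looks like the sketch below; the flag bit position is assumed for illustration and the real definition lives in drivers/cxl/cxl.h:

#include <stdbool.h>

#define CXL_DECODER_F_ENABLE	(1 << 5)	/* assumed bit position */

struct decoder_stub { unsigned long flags; };

/* Only a decoder whose HDM decode has been committed to hardware may
 * advertise ENABLE; a freshly constructed mock decoder starts clear. */
static bool decoder_committed(const struct decoder_stub *cxld)
{
	return cxld->flags & CXL_DECODER_F_ENABLE;
}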
Fixes: 7c7d68db0254 ("tools/testing/cxl: Enumerate mock decoders") Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165603888091.551046.6312322707378021172.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 91444279f9a2..6e086fbc5c5b 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -466,7 +466,6 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) .end = -1, }; - cxld->flags = CXL_DECODER_F_ENABLE; cxld->interleave_ways = min_not_zero(target_count, 1); cxld->interleave_granularity = SZ_4K; cxld->target_type = CXL_DECODER_EXPANDER; -- cgit v1.2.3 From 1dd685c414a7b9fdb3d23aca3aedae84f0b998ae Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 6 Jul 2022 14:51:00 -0400 Subject: XArray: Add calls to might_alloc() Catch bogus GFP flags deterministically, instead of occasionally when we actually have to allocate memory. Reported-by: Nikolay Borisov Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 15 +++++++++++++++ tools/include/linux/sched/mm.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'tools') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index c29e11b2c073..44dd6d6e01bc 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -586,6 +587,7 @@ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, { void *curr; + might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); @@ -612,6 +614,7 @@ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, { void *curr; + might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); @@ -687,6 +690,7 @@ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, { void *curr; + might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); @@ -714,6 +718,7 @@ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, { void *curr; + might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); @@ -741,6 +746,7 @@ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, { void *curr; + might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); @@ -770,6 +776,7 @@ static inline int __must_check xa_insert(struct xarray *xa, { int err; + might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); @@ -799,6 +806,7 @@ static inline int __must_check xa_insert_bh(struct xarray *xa, { int err; + might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); @@ -828,6 +836,7 @@ static inline int __must_check xa_insert_irq(struct xarray *xa, { int err; + might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); @@ -857,6 +866,7 @@ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, { int err; + might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); @@ -886,6 +896,7 @@ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, { int err; + might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); @@ -915,6 +926,7 @@ static inline int __must_check 
xa_alloc_irq(struct xarray *xa, u32 *id, { int err; + might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); @@ -948,6 +960,7 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, { int err; + might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); @@ -981,6 +994,7 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, { int err; + might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); @@ -1014,6 +1028,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, { int err; + might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); diff --git a/tools/include/linux/sched/mm.h b/tools/include/linux/sched/mm.h index c8d9f19c1f35..967294b8edcf 100644 --- a/tools/include/linux/sched/mm.h +++ b/tools/include/linux/sched/mm.h @@ -1,4 +1,6 @@ #ifndef _TOOLS_PERF_LINUX_SCHED_MM_H #define _TOOLS_PERF_LINUX_SCHED_MM_H +#define might_alloc(gfp) do { } while (0) + #endif /* _TOOLS_PERF_LINUX_SCHED_MM_H */ -- cgit v1.2.3 From 3ddabc433670292492d217e0f3b5ce017c42da2c Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 8 Jul 2022 16:36:10 -0700 Subject: selftests: mptcp: validate userspace PM tests by default The new script was not listed in the programs to test. By consequence, some CIs running MPTCP selftests were not validating these new tests. Note that MPTCP CI was validating it as it executes all .sh scripts from 'tools/testing/selftests/net/mptcp' directory. Fixes: 259a834fadda ("selftests: mptcp: functional tests for the userspace PM type") Reported-by: Jakub Kicinski Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index f905d5358e68..48a99e1453e1 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -6,7 +6,7 @@ KSFT_KHDR_INSTALL := 1 CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ - simult_flows.sh mptcp_sockopt.sh + simult_flows.sh mptcp_sockopt.sh userspace_pm.sh TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq -- cgit v1.2.3 From 8370b400f5abad168bcc541fa2574e7bd6b3bf2c Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 8 Jul 2022 12:48:47 +0800 Subject: selftest: Taint kernel when test module loaded Make any kselftest test module (using the kselftest_module framework) taint the kernel with TAINT_TEST on module load. Also mark the module as a test module using MODULE_INFO(test, "Y") so that other tools can tell this is a test module. We can't rely solely on this, though, as these test modules are also often built-in. Finally, update the kselftest documentation to mention that the kernel should be tainted, and how to do so manually (as below). Note that several selftests use kernel modules which are not based on the kselftest_module framework, and so will not automatically taint the kernel. This can be done in two ways: - Moving the module to the tools/testing directory. All modules under this directory will taint the kernel. 
- Adding the 'test' module property with: MODULE_INFO(test, "Y") Similarly, selftests which do not load modules into the kernel generally should not taint the kernel (or possibly should only do so on failure), as it's assumed that testing from user-space should be safe. Regardless, they can write to /proc/sys/kernel/tainted if required. Reviewed-by: Luis Chamberlain Acked-by: Brendan Higgins Signed-off-by: David Gow Signed-off-by: Shuah Khan --- Documentation/dev-tools/kselftest.rst | 9 +++++++++ tools/testing/selftests/kselftest_module.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'tools') diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index a833ecf12fbc..1096a9833550 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -250,6 +250,14 @@ assist writing kernel modules that are for use with kselftest: - ``tools/testing/selftests/kselftest_module.h`` - ``tools/testing/selftests/kselftest/module.sh`` +Note that test modules should taint the kernel with TAINT_TEST. This will +happen automatically for modules which are in the ``tools/testing/`` +directory, or for modules which use the ``kselftest_module.h`` header above. +Otherwise, you'll need to add ``MODULE_INFO(test, "Y")`` to your module +source. selftests which do not load modules typically should not taint the +kernel, but in cases where a non-test module is loaded, TAINT_TEST can be +applied from userspace by writing to ``/proc/sys/kernel/tainted``. + How to use ---------- @@ -308,6 +316,7 @@ A bare bones test module might look like this: KSTM_MODULE_LOADERS(test_foo); MODULE_AUTHOR("John Developer "); MODULE_LICENSE("GPL"); + MODULE_INFO(test, "Y"); Example test script ------------------- diff --git a/tools/testing/selftests/kselftest_module.h b/tools/testing/selftests/kselftest_module.h index e2ea41de3f35..63cd7487373f 100644 --- a/tools/testing/selftests/kselftest_module.h +++ b/tools/testing/selftests/kselftest_module.h @@ -3,6 +3,7 @@ #define __KSELFTEST_MODULE_H #include +#include /* * Test framework for writing test modules to be loaded by kselftest. */ @@ -41,6 +42,7 @@ static inline int kstm_report(unsigned int total_tests, unsigned int failed_test static int __init __module##_init(void) \ { \ pr_info("loaded.\n"); \ + add_taint(TAINT_TEST, LOCKDEP_STILL_OK); \ selftest(); \ return kstm_report(total_tests, failed_tests, skipped_tests); \ } \ @@ -51,4 +53,6 @@ static void __exit __module##_exit(void) \ module_init(__module##_init); \ module_exit(__module##_exit) +MODULE_INFO(test, "Y"); + #endif /* __KSELFTEST_MODULE_H */ -- cgit v1.2.3 From 3bb267a36185b64949ea9c8bbfe93eb01986f6cb Mon Sep 17 00:00:00 2001 From: Guillaume Tucker Date: Fri, 8 Jul 2022 17:23:27 +0100 Subject: selftests: drop khdr make target Drop the "khdr" make target as it fails when the build directory is a sub-directory of the source tree. Rely on the "headers_install" target to have been run first instead. For example, here's a typical error this patch is addressing: $ make O=build -j32 kselftest-gen_tar make[1]: Entering directory '/home/kernelci/linux/build' make --no-builtin-rules INSTALL_HDR_PATH=/home/kernelci/linux/build/usr \ ARCH=x86 -C ../../.. headers_install make[3]: Entering directory '/home/kernelci/linux' Makefile:1022: ../scripts/Makefile.extrawarn: No such file or directory The source directory is determined in the top-level Makefile as ".." relative to the "build" directory, but then the kselftest Makefile switches to "-C ../../.."
so "../scripts" then points one level higher than the source tree e.g. "linux/../scripts" - which fails obviously. There is no other use-case in the kernel tree where a sub-directory Makefile tries to call a top-level make target, and it appears this isn't really a valid thing to do. Signed-off-by: Guillaume Tucker Tested-by: Anders Roxell Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 52e31437f1a3..bf792130052d 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -151,30 +151,7 @@ export KHDR_INCLUDES # all isn't the first target in the file. .DEFAULT_GOAL := all -# Install headers here once for all tests. KSFT_KHDR_INSTALL_DONE -# is used to avoid running headers_install from lib.mk. -# Invoke headers install with --no-builtin-rules to avoid circular -# dependency in "make kselftest" case. In this case, second level -# make inherits builtin-rules which will use the rule generate -# Makefile.o and runs into -# "Circular Makefile.o <- prepare dependency dropped." -# and headers_install fails and test compile fails. -# -# O= KBUILD_OUTPUT cases don't run into this error, since main Makefile -# invokes them as sub-makes and --no-builtin-rules is not necessary, -# but doesn't cause any failures. Keep it simple and use the same -# flags in both cases. -# Local build cases: "make kselftest", "make -C" - headers are installed -# in the default INSTALL_HDR_PATH usr/include. -khdr: -ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) - $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install -else - $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$(abs_objtree)/usr \ - ARCH=$(ARCH) -C $(top_srcdir) headers_install -endif - -all: khdr +all: @ret=1; \ for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ @@ -274,4 +251,4 @@ clean: $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; -.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean gen_tar +.PHONY: all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean gen_tar -- cgit v1.2.3 From f2745dc0ba3dadd8fa2b2c33f48253d78e133a12 Mon Sep 17 00:00:00 2001 From: Guillaume Tucker Date: Fri, 8 Jul 2022 17:23:28 +0100 Subject: selftests: stop using KSFT_KHDR_INSTALL Stop using the KSFT_KHDR_INSTALL flag as installing the kernel headers from the kselftest Makefile is causing some issues. Instead, rely on the headers to be installed directly by the top-level Makefile "headers_install" make target prior to building kselftest. 
Signed-off-by: Guillaume Tucker Tested-by: Anders Roxell Signed-off-by: Shuah Khan --- tools/testing/selftests/arm64/mte/Makefile | 1 - tools/testing/selftests/arm64/signal/Makefile | 1 - tools/testing/selftests/arm64/signal/test_signals.h | 4 +--- tools/testing/selftests/drivers/s390x/uvdevice/Makefile | 1 - tools/testing/selftests/futex/functional/Makefile | 1 - tools/testing/selftests/kvm/Makefile | 1 - tools/testing/selftests/landlock/Makefile | 1 - tools/testing/selftests/net/Makefile | 1 - tools/testing/selftests/net/mptcp/Makefile | 1 - tools/testing/selftests/tc-testing/Makefile | 1 - tools/testing/selftests/vm/Makefile | 1 - 11 files changed, 1 insertion(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile index 409e3e53d00a..a5a0744423d8 100644 --- a/tools/testing/selftests/arm64/mte/Makefile +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -22,7 +22,6 @@ ifeq ($(mte_cc_support),1) TEST_GEN_PROGS := $(PROGS) # Get Kernel headers installed and use them. -KSFT_KHDR_INSTALL := 1 else $(warning compiler "$(CC)" does not support the ARMv8.5 MTE extension.) $(warning test program "mte" will not be created.) diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile index ac4ad0005715..be7520a863b0 100644 --- a/tools/testing/selftests/arm64/signal/Makefile +++ b/tools/testing/selftests/arm64/signal/Makefile @@ -11,7 +11,6 @@ PROGS := $(patsubst %.c,%,$(SRCS)) TEST_GEN_PROGS := $(notdir $(PROGS)) # Get Kernel headers installed and use them. -KSFT_KHDR_INSTALL := 1 # Including KSFT lib.mk here will also mangle the TEST_GEN_PROGS list # to account for any OUTPUT target-dirs optionally provided by diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index c70fdec7d7c4..0c645834ddc3 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -9,9 +9,7 @@ #include /* - * Using ARCH specific and sanitized Kernel headers installed by KSFT - * framework since we asked for it by setting flag KSFT_KHDR_INSTALL - * in our Makefile. + * Using ARCH specific and sanitized Kernel headers from the tree. */ #include #include diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/Makefile b/tools/testing/selftests/drivers/s390x/uvdevice/Makefile index 5e701d2708d4..891215a7dc8a 100644 --- a/tools/testing/selftests/drivers/s390x/uvdevice/Makefile +++ b/tools/testing/selftests/drivers/s390x/uvdevice/Makefile @@ -11,7 +11,6 @@ else TEST_GEN_PROGS := test_uvdevice top_srcdir ?= ../../../../../.. -KSFT_KHDR_INSTALL := 1 khdr_dir = $(top_srcdir)/usr/include LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index b8152c573e8a..732149011692 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -22,7 +22,6 @@ TEST_GEN_FILES := \ TEST_PROGS := run.sh top_srcdir = ../../../../.. -KSFT_KHDR_INSTALL := 1 DEFAULT_INSTALL_HDR_PATH := 1 include ../../lib.mk diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 22423c871ed6..120951fc304a 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -4,7 +4,6 @@ include ../../../build/Build.include all: top_srcdir = ../../../.. 
-KSFT_KHDR_INSTALL := 1 # For cross-builds to work, UNAME_M has to map to ARCH and arch specific # directories and targets in this Makefile. "uname -m" doesn't map to diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile index 0b0049e133bb..1313e44e8fb9 100644 --- a/tools/testing/selftests/landlock/Makefile +++ b/tools/testing/selftests/landlock/Makefile @@ -8,7 +8,6 @@ TEST_GEN_PROGS := $(src_test:.c=) TEST_GEN_PROGS_EXTENDED := true -KSFT_KHDR_INSTALL := 1 OVERRIDE_TARGETS := 1 include ../lib.mk diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7ea54af55490..bc006f2693da 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -62,7 +62,6 @@ TEST_PROGS += test_vxlan_vnifiltering.sh TEST_FILES := settings -KSFT_KHDR_INSTALL := 1 include ../lib.mk include bpf/Makefile diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index f905d5358e68..1af2f66fb59a 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 top_srcdir = ../../../../.. -KSFT_KHDR_INSTALL := 1 CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/tc-testing/Makefile b/tools/testing/selftests/tc-testing/Makefile index 4d639279f41e..cb553eac9f41 100644 --- a/tools/testing/selftests/tc-testing/Makefile +++ b/tools/testing/selftests/tc-testing/Makefile @@ -5,7 +5,6 @@ top_srcdir = $(abspath ../../../..) APIDIR := $(top_scrdir)/include/uapi TEST_GEN_FILES = action.o -KSFT_KHDR_INSTALL := 1 include ../lib.mk PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 44f25acfbeca..108587cb327a 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -94,7 +94,6 @@ TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh TEST_FILES += test_hmm.sh -KSFT_KHDR_INSTALL := 1 include ../lib.mk $(OUTPUT)/madv_populate: vm_util.c -- cgit v1.2.3 From 49de12ba06efcba76332054379830f9d04541492 Mon Sep 17 00:00:00 2001 From: Guillaume Tucker Date: Fri, 8 Jul 2022 17:23:29 +0100 Subject: selftests: drop KSFT_KHDR_INSTALL make target Drop the KSFT_KHDR_INSTALL make target now that all use-cases have been removed from the other kselftest Makefiles. 
Signed-off-by: Guillaume Tucker Tested-by: Anders Roxell Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 1 - tools/testing/selftests/lib.mk | 38 -------------------------------------- 2 files changed, 39 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index bf792130052d..5047d8eef53e 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -143,7 +143,6 @@ endif # Prepare for headers install include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) -export KSFT_KHDR_INSTALL_DONE := 1 export BUILD export KHDR_INCLUDES diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 1a5cc3cd97ec..947fc72413e9 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -51,45 +51,7 @@ TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) -ifdef KSFT_KHDR_INSTALL -top_srcdir ?= ../../../.. -include $(top_srcdir)/scripts/subarch.include -ARCH ?= $(SUBARCH) - -# set default goal to all, so make without a target runs all, even when -# all isn't the first target in the file. -.DEFAULT_GOAL := all - -# Invoke headers install with --no-builtin-rules to avoid circular -# dependency in "make kselftest" case. In this case, second level -# make inherits builtin-rules which will use the rule generate -# Makefile.o and runs into -# "Circular Makefile.o <- prepare dependency dropped." -# and headers_install fails and test compile fails. -# O= KBUILD_OUTPUT cases don't run into this error, since main Makefile -# invokes them as sub-makes and --no-builtin-rules is not necessary, -# but doesn't cause any failures. Keep it simple and use the same -# flags in both cases. -# Note that the support to install headers from lib.mk is necessary -# when test Makefile is run directly with "make -C". -# When local build is done, headers are installed in the default -# INSTALL_HDR_PATH usr/include. -.PHONY: khdr -.NOTPARALLEL: -khdr: -ifndef KSFT_KHDR_INSTALL_DONE -ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) - $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install -else - $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$OUTPUT/usr \ - ARCH=$(ARCH) -C $(top_srcdir) headers_install -endif -endif - -all: khdr $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) -else all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) -endif define RUN_TESTS BASE_DIR="$(selfdir)"; \ -- cgit v1.2.3 From b5f37a0b6f667f5c72340ca9dcd7703f261cb981 Mon Sep 17 00:00:00 2001 From: jianchunfu Date: Wed, 15 Jun 2022 15:33:48 +0800 Subject: rtla/utils: Use calloc and check the potential memory allocation failure Replace malloc with calloc and add a memory allocation check of mon_cpus before it is used.
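A self-contained sketch of the pattern the fix adopts (the helper name is invented for illustration): calloc() allocates and zeroes in one step, which makes the old memset() redundant, and its NULL return is handled before first use.

#include <stdlib.h>
#include <unistd.h>

/* Hypothetical helper mirroring the fixed code path. */
static char *alloc_cpu_mask(void)
{
	long nr_cpus = sysconf(_SC_NPROCESSORS_CONF);

	if (nr_cpus < 1)
		return NULL;

	/* zeroed on success, NULL on failure (caller takes the error path) */
	return calloc(nr_cpus, sizeof(char));
}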
Link: https://lkml.kernel.org/r/20220615073348.6891-1-jianchunfu@cmss.chinamobile.com Fixes: 7d0dc9576dc3 ("rtla/timerlat: Add --dma-latency option") Signed-off-by: jianchunfu Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/utils.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 5352167a1e75..5ae2fa96fde1 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -106,8 +106,9 @@ int parse_cpu_list(char *cpu_list, char **monitored_cpus) nr_cpus = sysconf(_SC_NPROCESSORS_CONF); - mon_cpus = malloc(nr_cpus * sizeof(char)); - memset(mon_cpus, 0, (nr_cpus * sizeof(char))); + mon_cpus = calloc(nr_cpus, sizeof(char)); + if (!mon_cpus) + goto err; for (p = cpu_list; *p; ) { cpu = atoi(p); -- cgit v1.2.3 From 4a46de446d3fb9ae304dd0f4b4fceb551b152498 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Jul 2022 19:41:41 -0700 Subject: selftest: net: add tun to .gitignore Add missing .gitignore entry. Fixes: 839b92fede7b ("selftest: tun: add test for NAPI dismantle") Link: https://lore.kernel.org/r/20220709024141.321683-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index a29f79618934..ffc35a22e914 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -36,4 +36,5 @@ test_unix_oob gro ioam6_parser toeplitz +tun cmsg_sender -- cgit v1.2.3 From 1d55f20313853b09cd111b04bb264ca22e1d6046 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Jul 2022 19:52:55 -0700 Subject: selftests: tls: add test for NoPad getsockopt Make sure setsockopt / getsockopt behave as expected. 
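A condensed sketch of the round-trip the new test performs, assuming an established kTLS socket cfd with TLS_RX configured and headers recent enough to define TLS_RX_EXPECT_NO_PAD; the helper name is illustrative:

#include <sys/socket.h>
#include <linux/tls.h>

#ifndef SOL_TLS
#define SOL_TLS 282
#endif

static int no_pad_roundtrip(int cfd, int on)
{
	socklen_t len = sizeof(int);
	int val = -1;

	if (setsockopt(cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, &on, sizeof(on)))
		return -1;
	/* the kernel should report back exactly what was set */
	if (getsockopt(cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, &val, &len))
		return -1;
	return (val == on && len == sizeof(val)) ? 0 : -1;
}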
Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index e71ec5846be9..dc26aae0feb0 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -1674,6 +1674,57 @@ TEST(keysizes) { close(cfd); } +TEST(no_pad) { + struct tls12_crypto_info_aes_gcm_256 tls12; + int ret, fd, cfd, val; + socklen_t len; + bool notls; + + memset(&tls12, 0, sizeof(tls12)); + tls12.info.version = TLS_1_3_VERSION; + tls12.info.cipher_type = TLS_CIPHER_AES_GCM_256; + + ulp_sock_pair(_metadata, &fd, &cfd, &notls); + + if (notls) + exit(KSFT_SKIP); + + ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, sizeof(tls12)); + EXPECT_EQ(ret, 0); + + ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12, sizeof(tls12)); + EXPECT_EQ(ret, 0); + + val = 1; + ret = setsockopt(cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, + (void *)&val, sizeof(val)); + EXPECT_EQ(ret, 0); + + len = sizeof(val); + val = 2; + ret = getsockopt(cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, + (void *)&val, &len); + EXPECT_EQ(ret, 0); + EXPECT_EQ(val, 1); + EXPECT_EQ(len, 4); + + val = 0; + ret = setsockopt(cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, + (void *)&val, sizeof(val)); + EXPECT_EQ(ret, 0); + + len = sizeof(val); + val = 2; + ret = getsockopt(cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, + (void *)&val, &len); + EXPECT_EQ(ret, 0); + EXPECT_EQ(val, 0); + EXPECT_EQ(len, 4); + + close(fd); + close(cfd); +} + TEST(tls_v6ops) { struct tls_crypto_info_keys tls12; struct sockaddr_in6 addr, addr2; -- cgit v1.2.3 From c7a774d78111d55f409d9ee0a5db2c4eb39d6657 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Jul 2022 08:34:48 -0700 Subject: perf test: Add debug line to diagnose broken metrics Printing out the metric name and architecture makes finding the source of a failure easier. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220707153449.202409-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu-events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index f13368569d8b..478b33825790 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -1115,6 +1115,7 @@ static int test__parsing_fake(struct test_suite *test __maybe_unused, break; if (!pe->metric_expr) continue; + pr_debug("Found metric '%s' for '%s'\n", pe->metric_name, map->cpuid); err = metric_parse_fake(pe->metric_expr); if (err) return err; -- cgit v1.2.3 From 29d97deed6424d8dfac6797910d6920fb8aad33c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Jul 2022 08:34:49 -0700 Subject: perf test: Make all metrics test more tolerant Metric names are truncated so don't try to match all of one. Allow AMX metrics to skip as floating point ones do. Metrics for optane memory can also skip rather than fail. Add a system wide check for uncore metrics. Restructure code to avoid extensive nesting.
Some impetus for this in: https://lore.kernel.org/lkml/d32376b5-5538-ff00-6620-e74ad4b4abf2@huawei.com/ Suggested-by: John Garry Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20220707153449.202409-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_all_metrics.sh | 47 ++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat_all_metrics.sh b/tools/perf/tests/shell/stat_all_metrics.sh index e7c59e5a7a98..6e79349e42be 100755 --- a/tools/perf/tests/shell/stat_all_metrics.sh +++ b/tools/perf/tests/shell/stat_all_metrics.sh @@ -1,26 +1,41 @@ -#!/bin/sh +#!/bin/bash # perf all metrics test # SPDX-License-Identifier: GPL-2.0 -set -e - err=0 for m in $(perf list --raw-dump metrics); do echo "Testing $m" result=$(perf stat -M "$m" true 2>&1) - if [[ ! "$result" =~ "$m" ]] && [[ ! "$result" =~ "<not supported>" ]]; then - # We failed to see the metric and the events are support. Possibly the - # workload was too small so retry with something longer. - result=$(perf stat -M "$m" perf bench internals synthesize 2>&1) - if [[ ! "$result" =~ "$m" ]]; then - echo "Metric '$m' not printed in:" - echo "$result" - if [[ "$result" =~ "FP_ARITH" && "$err" != "1" ]]; then - echo "Skip, not fail, for FP issues" - err=2 - else - err=1 - fi + if [[ "$result" =~ "${m:0:50}" ]] || [[ "$result" =~ "<not supported>" ]] + then + continue + fi + # Failed so try system wide. + result=$(perf stat -M "$m" -a true 2>&1) + if [[ "$result" =~ "${m:0:50}" ]] + then + continue + fi + # Failed again, possibly the workload was too small so retry with something + # longer. + result=$(perf stat -M "$m" perf bench internals synthesize 2>&1) + if [[ "$result" =~ "${m:0:50}" ]] + then + continue + fi + echo "Metric '$m' not printed in:" + echo "$result" + if [[ "$err" != "1" ]] + then + err=2 + if [[ "$result" =~ "FP_ARITH" || "$result" =~ "AMX" ]] + then + echo "Skip, not fail, for FP issues" + elif [[ "$result" =~ "PMM" ]] + then + echo "Skip, not fail, for Optane memory issues" + else + err=1 fi fi done -- cgit v1.2.3 From b55878c90ab92a244b7b044a9e64aef5b75783e7 Mon Sep 17 00:00:00 2001 From: German Gomez Date: Tue, 5 Jul 2022 16:05:11 +0100 Subject: perf test: Add test for branch stack sampling Add a self test for branch stack sampling, to check that we get the expected branch types, and filters behave as expected. Suggested-by: Anshuman Khandual Signed-off-by: German Gomez Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Ian Rogers Cc: Mark Rutland Link: https://lore.kernel.org/r/20220705150511.473919-2-german.gomez@arm.com Tested-by: Jiri Olsa Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_brstack.sh | 114 +++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100755 tools/perf/tests/shell/test_brstack.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/test_brstack.sh b/tools/perf/tests/shell/test_brstack.sh new file mode 100755 index 000000000000..113ccd17bf03 --- /dev/null +++ b/tools/perf/tests/shell/test_brstack.sh @@ -0,0 +1,114 @@ +#!/bin/sh +# Check branch stack sampling + +# SPDX-License-Identifier: GPL-2.0 +# German Gomez , 2022 + +# we need a C compiler to build the test programs +# so bail if none is found +if !
[ -x "$(command -v cc)" ]; then + echo "failed: no compiler, install gcc" + exit 2 +fi + +# skip the test if the hardware doesn't support branch stack sampling +perf record -b -o- -e dummy -B true > /dev/null 2>&1 || exit 2 + +TMPDIR=$(mktemp -d /tmp/__perf_test.program.XXXXX) + +cleanup() { + rm -rf $TMPDIR +} + +trap cleanup exit term int + +gen_test_program() { + # generate test program + cat << EOF > $1 +#define BENCH_RUNS 999999 +int cnt; +void bar(void) { +} /* return */ +void foo(void) { + bar(); /* call */ +} /* return */ +void bench(void) { + void (*foo_ind)(void) = foo; + if ((cnt++) % 3) /* branch (cond) */ + foo(); /* call */ + bar(); /* call */ + foo_ind(); /* call (ind) */ +} +int main(void) +{ + int cnt = 0; + while (1) { + if ((cnt++) > BENCH_RUNS) + break; + bench(); /* call */ + } /* branch (uncond) */ + return 0; +} +EOF +} + +test_user_branches() { + echo "Testing user branch stack sampling" + + gen_test_program "$TEMPDIR/program.c" + cc -fno-inline -g "$TEMPDIR/program.c" -o $TMPDIR/a.out + + perf record -o $TMPDIR/perf.data --branch-filter any,save_type,u -- $TMPDIR/a.out > /dev/null 2>&1 + perf script -i $TMPDIR/perf.data --fields brstacksym | xargs -n1 > $TMPDIR/perf.script + + # example of branch entries: + # foo+0x14/bar+0x40/P/-/-/0/CALL + + set -x + egrep -m1 "^bench\+[^ ]*/foo\+[^ ]*/IND_CALL$" $TMPDIR/perf.script + egrep -m1 "^foo\+[^ ]*/bar\+[^ ]*/CALL$" $TMPDIR/perf.script + egrep -m1 "^bench\+[^ ]*/foo\+[^ ]*/CALL$" $TMPDIR/perf.script + egrep -m1 "^bench\+[^ ]*/bar\+[^ ]*/CALL$" $TMPDIR/perf.script + egrep -m1 "^bar\+[^ ]*/foo\+[^ ]*/RET$" $TMPDIR/perf.script + egrep -m1 "^foo\+[^ ]*/bench\+[^ ]*/RET$" $TMPDIR/perf.script + egrep -m1 "^bench\+[^ ]*/bench\+[^ ]*/COND$" $TMPDIR/perf.script + egrep -m1 "^main\+[^ ]*/main\+[^ ]*/UNCOND$" $TMPDIR/perf.script + set +x + + # some branch types are still not being tested: + # IND COND_CALL COND_RET SYSCALL SYSRET IRQ SERROR NO_TX +} + +# first argument is the argument passed to "--branch-stack ,save_type,u" +# second argument are the expected branch types for the given filter +test_filter() { + local filter=$1 + local expect=$2 + + echo "Testing branch stack filtering permutation ($filter,$expect)" + + gen_test_program "$TEMPDIR/program.c" + cc -fno-inline -g "$TEMPDIR/program.c" -o $TMPDIR/a.out + + perf record -o $TMPDIR/perf.data --branch-filter $filter,save_type,u -- $TMPDIR/a.out > /dev/null 2>&1 + perf script -i $TMPDIR/perf.data --fields brstack | xargs -n1 > $TMPDIR/perf.script + + # fail if we find any branch type that doesn't match any of the expected ones + # also consider UNKNOWN branch types (-) + if egrep -vm1 "^[^ ]*/($expect|-|( *))$" $TMPDIR/perf.script; then + return 1 + fi +} + +set -e + +test_user_branches + +test_filter "any_call" "CALL|IND_CALL|COND_CALL|SYSCALL|IRQ" +test_filter "call" "CALL|SYSCALL" +test_filter "cond" "COND" +test_filter "any_ret" "RET|COND_RET|SYSRET|ERET" + +test_filter "call,cond" "CALL|SYSCALL|COND" +test_filter "any_call,cond" "CALL|IND_CALL|COND_CALL|IRQ|SYSCALL|COND" +test_filter "cond,any_call,any_ret" "COND|CALL|IND_CALL|COND_CALL|SYSCALL|IRQ|RET|COND_RET|SYSRET|ERET" -- cgit v1.2.3 From ab0101768f639801b726f8be50e2ff7366ade056 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Jun 2022 09:32:16 -0700 Subject: perf lock: Print wait times with unit Currently it only prints the time in nsec but it's a bit hard to read and takes longer in the screen. We can change it to use different units and keep the number small to save the space. 
Before: $ perf lock report Name acquired contended avg wait (ns) total wait (ns) max wait (ns) min wait (ns) jiffies_lock 433 32 2778 88908 13570 692 &lruvec->lru_lock 747 5 11254 56272 18317 1412 slock-AF_INET6 7 1 23543 23543 23543 23543 &newf->file_lock 706 15 1025 15388 2279 618 slock-AF_INET6 8 1 10379 10379 10379 10379 &rq->__lock 2143 5 2037 10185 3462 939 After: Name acquired contended avg wait total wait max wait min wait jiffies_lock 433 32 2.78 us 88.91 us 13.57 us 692 ns &lruvec->lru_lock 747 5 11.25 us 56.27 us 18.32 us 1.41 us slock-AF_INET6 7 1 23.54 us 23.54 us 23.54 us 23.54 us &newf->file_lock 706 15 1.02 us 15.39 us 2.28 us 618 ns slock-AF_INET6 8 1 10.38 us 10.38 us 10.38 us 10.38 us &rq->__lock 2143 5 2.04 us 10.19 us 3.46 us 939 ns Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Link: https://lore.kernel.org/r/20220615163222.1275500-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 48 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 23a33ac15e68..57e396323d05 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -251,6 +251,31 @@ struct lock_key { struct list_head list; }; +static void lock_stat_key_print_time(unsigned long long nsec, int len) +{ + static const struct { + float base; + const char *unit; + } table[] = { + { 1e9 * 3600, "h " }, + { 1e9 * 60, "m " }, + { 1e9, "s " }, + { 1e6, "ms" }, + { 1e3, "us" }, + { 0, NULL }, + }; + + for (int i = 0; table[i].unit; i++) { + if (nsec < table[i].base) + continue; + + pr_info("%*.2f %s", len - 3, nsec / table[i].base, table[i].unit); + return; + } + + pr_info("%*llu %s", len - 3, nsec, "ns"); +} + #define PRINT_KEY(member) \ static void lock_stat_key_print_ ## member(struct lock_key *key, \ struct lock_stat *ls) \ @@ -258,11 +283,18 @@ static void lock_stat_key_print_ ## member(struct lock_key *key, \ pr_info("%*llu", key->len, (unsigned long long)ls->member); \ } +#define PRINT_TIME(member) \ +static void lock_stat_key_print_ ## member(struct lock_key *key, \ + struct lock_stat *ls) \ +{ \ + lock_stat_key_print_time((unsigned long long)ls->member, key->len); \ +} + PRINT_KEY(nr_acquired) PRINT_KEY(nr_contended) -PRINT_KEY(avg_wait_time) -PRINT_KEY(wait_time_total) -PRINT_KEY(wait_time_max) +PRINT_TIME(avg_wait_time) +PRINT_TIME(wait_time_total) +PRINT_TIME(wait_time_max) static void lock_stat_key_print_wait_time_min(struct lock_key *key, struct lock_stat *ls) @@ -272,7 +304,7 @@ static void lock_stat_key_print_wait_time_min(struct lock_key *key, if (wait_time == ULLONG_MAX) wait_time = 0; - pr_info("%*"PRIu64, key->len, wait_time); + lock_stat_key_print_time(wait_time, key->len); } @@ -291,10 +323,10 @@ static const char *output_fields; struct lock_key keys[] = { DEF_KEY_LOCK(acquired, "acquired", nr_acquired, 10), DEF_KEY_LOCK(contended, "contended", nr_contended, 10), - DEF_KEY_LOCK(avg_wait, "avg wait (ns)", avg_wait_time, 15), - DEF_KEY_LOCK(wait_total, "total wait (ns)", wait_time_total, 15), - DEF_KEY_LOCK(wait_max, "max wait (ns)", wait_time_max, 15), - DEF_KEY_LOCK(wait_min, "min wait (ns)", wait_time_min, 15), + DEF_KEY_LOCK(avg_wait, "avg wait", avg_wait_time, 12), + DEF_KEY_LOCK(wait_total, "total wait", wait_time_total, 12), + DEF_KEY_LOCK(wait_max, "max wait", wait_time_max, 12), + 
DEF_KEY_LOCK(wait_min, "min wait", wait_time_min, 12), /* extra comparisons much complicated should be here */ { } -- cgit v1.2.3 From 309e133dfe2651eedd572b62af2215b94532b712 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Jun 2022 09:32:17 -0700 Subject: perf lock: Allow to use different kernel symbols Add --vmlinux and --kallsyms options to support data file from different kernels. Signed-off-by: Namhyung Kim Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Link: https://lore.kernel.org/r/20220615163222.1275500-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-lock.txt | 7 +++++++ tools/perf/builtin-lock.c | 4 ++++ 2 files changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 656b537b2fba..4b8568f0c53b 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -46,6 +46,13 @@ COMMON OPTIONS --force:: Don't complain, do it. +--vmlinux=:: + vmlinux pathname + +--kallsyms=:: + kallsyms pathname + + REPORT OPTIONS -------------- diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 57e396323d05..118a036a81fb 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1162,6 +1162,10 @@ int cmd_lock(int argc, const char **argv) OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), + OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name, + "file", "vmlinux pathname"), + OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, + "file", "kallsyms pathname"), OPT_END() }; -- cgit v1.2.3 From 9565c9186d17ac55f5dc8a556bece847e49dee35 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Jun 2022 09:32:18 -0700 Subject: perf lock: Skip print_bad_events() if nothing bad The debug output is meaningful when there are bad lock sequences. Skip it unless there's one or -v option is given. Signed-off-by: Namhyung Kim Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Link: https://lore.kernel.org/r/20220615163222.1275500-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 118a036a81fb..2337b09dd2cd 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -858,9 +858,16 @@ static void print_bad_events(int bad, int total) { /* Output for debug, this have to be removed */ int i; + int broken = 0; const char *name[4] = { "acquire", "acquired", "contended", "release" }; + for (i = 0; i < BROKEN_MAX; i++) + broken += bad_hist[i]; + + if (broken == 0 && !verbose) + return; + pr_info("\n=== output for debug===\n\n"); pr_info("bad: %d, total: %d\n", bad, total); pr_info("bad rate: %.2f %%\n", (double)bad / (double)total * 100); -- cgit v1.2.3 From 166a9764a38e37089088eb21f31cfcb6d5a0dde2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Jun 2022 09:32:19 -0700 Subject: perf lock: Add lock contention tracepoints record support When LOCKDEP and LOCK_STAT events are not available, it falls back to record the new lock contention tracepoints. 
Signed-off-by: Namhyung Kim Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Link: https://lore.kernel.org/r/20220615163222.1275500-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 76 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 2337b09dd2cd..9e3b90cac505 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -516,17 +516,29 @@ alloc_failed: } struct trace_lock_handler { + /* it's used on CONFIG_LOCKDEP */ int (*acquire_event)(struct evsel *evsel, struct perf_sample *sample); + /* it's used on CONFIG_LOCKDEP && CONFIG_LOCK_STAT */ int (*acquired_event)(struct evsel *evsel, struct perf_sample *sample); + /* it's used on CONFIG_LOCKDEP && CONFIG_LOCK_STAT */ int (*contended_event)(struct evsel *evsel, struct perf_sample *sample); + /* it's used on CONFIG_LOCKDEP */ int (*release_event)(struct evsel *evsel, struct perf_sample *sample); + + /* it's used when CONFIG_LOCKDEP is off */ + int (*contention_begin_event)(struct evsel *evsel, + struct perf_sample *sample); + + /* it's used when CONFIG_LOCKDEP is off */ + int (*contention_end_event)(struct evsel *evsel, + struct perf_sample *sample); }; static struct lock_seq_stat *get_seq(struct thread_stat *ts, u64 addr) @@ -854,6 +866,20 @@ static int evsel__process_lock_release(struct evsel *evsel, struct perf_sample * return 0; } +static int evsel__process_contention_begin(struct evsel *evsel, struct perf_sample *sample) +{ + if (trace_handler->contention_begin_event) + return trace_handler->contention_begin_event(evsel, sample); + return 0; +} + +static int evsel__process_contention_end(struct evsel *evsel, struct perf_sample *sample) +{ + if (trace_handler->contention_end_event) + return trace_handler->contention_end_event(evsel, sample); + return 0; +} + static void print_bad_events(int bad, int total) { /* Output for debug, this have to be removed */ @@ -1062,6 +1088,11 @@ static const struct evsel_str_handler lock_tracepoints[] = { { "lock:lock_release", evsel__process_lock_release, }, /* CONFIG_LOCKDEP */ }; +static const struct evsel_str_handler contention_tracepoints[] = { + { "lock:contention_begin", evsel__process_contention_begin, }, + { "lock:contention_end", evsel__process_contention_end, }, +}; + static bool force; static int __cmd_report(bool display_info) @@ -1125,20 +1156,41 @@ static int __cmd_record(int argc, const char **argv) "record", "-R", "-m", "1024", "-c", "1", "--synth", "task", }; unsigned int rec_argc, i, j, ret; + unsigned int nr_tracepoints; const char **rec_argv; + bool has_lock_stat = true; for (i = 0; i < ARRAY_SIZE(lock_tracepoints); i++) { if (!is_valid_tracepoint(lock_tracepoints[i].name)) { - pr_err("tracepoint %s is not enabled. " - "Are CONFIG_LOCKDEP and CONFIG_LOCK_STAT enabled?\n", - lock_tracepoints[i].name); - return 1; + pr_debug("tracepoint %s is not enabled. 
" + "Are CONFIG_LOCKDEP and CONFIG_LOCK_STAT enabled?\n", + lock_tracepoints[i].name); + has_lock_stat = false; + break; + } + } + + if (has_lock_stat) + goto setup_args; + + for (i = 0; i < ARRAY_SIZE(contention_tracepoints); i++) { + if (!is_valid_tracepoint(contention_tracepoints[i].name)) { + pr_err("tracepoint %s is not enabled.\n", + contention_tracepoints[i].name); + return 1; } } +setup_args: rec_argc = ARRAY_SIZE(record_args) + argc - 1; + + if (has_lock_stat) + nr_tracepoints = ARRAY_SIZE(lock_tracepoints); + else + nr_tracepoints = ARRAY_SIZE(contention_tracepoints); + /* factor of 2 is for -e in front of each tracepoint */ - rec_argc += 2 * ARRAY_SIZE(lock_tracepoints); + rec_argc += 2 * nr_tracepoints; rec_argv = calloc(rec_argc + 1, sizeof(char *)); if (!rec_argv) @@ -1147,9 +1199,19 @@ static int __cmd_record(int argc, const char **argv) for (i = 0; i < ARRAY_SIZE(record_args); i++) rec_argv[i] = strdup(record_args[i]); - for (j = 0; j < ARRAY_SIZE(lock_tracepoints); j++) { + for (j = 0; j < nr_tracepoints; j++) { + const char *ev_name; + + if (has_lock_stat) + ev_name = strdup(lock_tracepoints[j].name); + else + ev_name = strdup(contention_tracepoints[j].name); + + if (!ev_name) + return -ENOMEM; + rec_argv[i++] = "-e"; - rec_argv[i++] = strdup(lock_tracepoints[j].name); + rec_argv[i++] = ev_name; } for (j = 1; j < (unsigned int)argc; j++, i++) -- cgit v1.2.3 From 3ae03f2650b8d87242906f6c160732bc9d991ca1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Jun 2022 09:32:20 -0700 Subject: perf lock: Handle lock contention tracepoints When the lock contention events are used, there's no tracking of acquire and release. So the state machine is simplified to use UNINITIALIZED -> CONTENDED -> ACQUIRED only. Note that CONTENDED state is re-entrant since mutex locks can hit two or more consecutive contention_begin events for optimistic spinning and sleep. 
Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Link: https://lore.kernel.org/r/20220615163222.1275500-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 9e3b90cac505..546dad1963c8 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -483,6 +483,18 @@ static struct lock_stat *pop_from_result(void) return container_of(node, struct lock_stat, rb); } +static struct lock_stat *lock_stat_find(u64 addr) +{ + struct hlist_head *entry = lockhashentry(addr); + struct lock_stat *ret; + + hlist_for_each_entry(ret, entry, hash_entry) { + if (ret->addr == addr) + return ret; + } + return NULL; +} + static struct lock_stat *lock_stat_findnew(u64 addr, const char *name) { struct hlist_head *entry = lockhashentry(addr); @@ -827,6 +839,124 @@ end: return 0; } +static int report_lock_contention_begin_event(struct evsel *evsel, + struct perf_sample *sample) +{ + struct lock_stat *ls; + struct thread_stat *ts; + struct lock_seq_stat *seq; + u64 addr = evsel__intval(evsel, sample, "lock_addr"); + + if (show_thread_stats) + addr = sample->tid; + + ls = lock_stat_findnew(addr, "No name"); + if (!ls) + return -ENOMEM; + + ts = thread_stat_findnew(sample->tid); + if (!ts) + return -ENOMEM; + + seq = get_seq(ts, addr); + if (!seq) + return -ENOMEM; + + switch (seq->state) { + case SEQ_STATE_UNINITIALIZED: + case SEQ_STATE_ACQUIRED: + break; + case SEQ_STATE_CONTENDED: + /* + * It can have nested contention begin with mutex spinning, + * then we would use the original contention begin event and + * ignore the second one. 
+ */ + goto end; + case SEQ_STATE_ACQUIRING: + case SEQ_STATE_READ_ACQUIRED: + case SEQ_STATE_RELEASED: + /* broken lock sequence */ + if (!ls->broken) { + ls->broken = 1; + bad_hist[BROKEN_CONTENDED]++; + } + list_del_init(&seq->list); + free(seq); + goto end; + default: + BUG_ON("Unknown state of lock sequence found!\n"); + break; + } + + if (seq->state != SEQ_STATE_CONTENDED) { + seq->state = SEQ_STATE_CONTENDED; + seq->prev_event_time = sample->time; + ls->nr_contended++; + } +end: + return 0; +} + +static int report_lock_contention_end_event(struct evsel *evsel, + struct perf_sample *sample) +{ + struct lock_stat *ls; + struct thread_stat *ts; + struct lock_seq_stat *seq; + u64 contended_term; + u64 addr = evsel__intval(evsel, sample, "lock_addr"); + + if (show_thread_stats) + addr = sample->tid; + + ls = lock_stat_find(addr); + if (!ls) + return 0; + + ts = thread_stat_find(sample->tid); + if (!ts) + return 0; + + seq = get_seq(ts, addr); + if (!seq) + return -ENOMEM; + + switch (seq->state) { + case SEQ_STATE_UNINITIALIZED: + goto end; + case SEQ_STATE_CONTENDED: + contended_term = sample->time - seq->prev_event_time; + ls->wait_time_total += contended_term; + if (contended_term < ls->wait_time_min) + ls->wait_time_min = contended_term; + if (ls->wait_time_max < contended_term) + ls->wait_time_max = contended_term; + break; + case SEQ_STATE_ACQUIRING: + case SEQ_STATE_ACQUIRED: + case SEQ_STATE_READ_ACQUIRED: + case SEQ_STATE_RELEASED: + /* broken lock sequence */ + if (!ls->broken) { + ls->broken = 1; + bad_hist[BROKEN_ACQUIRED]++; + } + list_del_init(&seq->list); + free(seq); + goto end; + default: + BUG_ON("Unknown state of lock sequence found!\n"); + break; + } + + seq->state = SEQ_STATE_ACQUIRED; + ls->nr_acquired++; + ls->avg_wait_time = ls->wait_time_total/ls->nr_acquired; +end: + return 0; +} + /* lock oriented handlers */ /* TODO: handlers for CPU oriented, thread oriented */ static struct trace_lock_handler report_lock_ops = { @@ -834,6 +964,8 @@ static struct trace_lock_handler report_lock_ops = { .acquired_event = report_lock_acquired_event, .contended_event = report_lock_contended_event, .release_event = report_lock_release_event, + .contention_begin_event = report_lock_contention_begin_event, + .contention_end_event = report_lock_contention_end_event, }; static struct trace_lock_handler *trace_handler; @@ -1126,6 +1258,11 @@ static int __cmd_report(bool display_info) goto out_delete; } + if (perf_session__set_tracepoints_handlers(session, contention_tracepoints)) { + pr_err("Initializing perf session tracepoint handlers failed\n"); + goto out_delete; + } + if (setup_output_field(output_fields)) goto out_delete; -- cgit v1.2.3 From 7cb2a53f7f413734734bac214c4855e9863b9853 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Jun 2022 09:32:21 -0700 Subject: perf record: Allow to specify max stack depth of fp callchain Currently it has no interface to specify the max stack depth for perf record. Extend the command line parameter to accept a number after 'fp' to specify the depth like '--call-graph fp,32'. 
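A stand-alone sketch of the "fp,<depth>" parsing this adds (names are local to the example, not perf's; 127 stands in for the sysctl limit):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned int parse_fp_depth(const char *arg, unsigned int max_stack)
{
	unsigned int depth = max_stack;	/* default: the sysctl limit */
	const char *comma = strchr(arg, ',');
	unsigned long v;

	if (!strncmp(arg, "fp", 2) && comma) {
		v = strtoul(comma + 1, NULL, 0);
		if (v && v < max_stack)
			depth = v;
	}
	return depth;
}

int main(void)
{
	printf("%u\n", parse_fp_depth("fp,32", 127));	/* prints 32 */
	return 0;
}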
Signed-off-by: Namhyung Kim Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Link: https://lore.kernel.org/r/20220615163222.1275500-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 5 +++++ tools/perf/util/callchain.c | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 6bd6d07021ba..099817ef5150 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -275,6 +275,11 @@ OPTIONS User can change the size by passing the size after comma like "--call-graph dwarf,4096". + When "fp" recording is used, perf tries to save stack entries + up to the number specified in sysctl.kernel.perf_event_max_stack + by default. User can change the number by passing it after comma + like "--call-graph fp,32". + -q:: --quiet:: Don't print any message, useful for scripting. diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 5c27a4b2e7a7..7e663673f79f 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -31,6 +31,7 @@ #include "callchain.h" #include "branch.h" #include "symbol.h" +#include "util.h" #include "../perf.h" #define CALLCHAIN_PARAM_DEFAULT \ @@ -266,12 +267,17 @@ int parse_callchain_record(const char *arg, struct callchain_param *param) do { /* Framepointer style */ if (!strncmp(name, "fp", sizeof("fp"))) { - if (!strtok_r(NULL, ",", &saveptr)) { - param->record_mode = CALLCHAIN_FP; - ret = 0; - } else - pr_err("callchain: No more arguments " "needed for --call-graph fp\n"); + ret = 0; + param->record_mode = CALLCHAIN_FP; + + tok = strtok_r(NULL, ",", &saveptr); + if (tok) { + unsigned long size; + + size = strtoul(tok, &name, 0); + if (size < (unsigned) sysctl__max_stack()) + param->max_stack = size; + } break; /* Dwarf style */ -- cgit v1.2.3 From 0d2997f750d1de394231bc22768dab94a5b5db2f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Jun 2022 09:32:22 -0700 Subject: perf lock: Look up callchain for the contended locks The lock contention tracepoints don't provide lock names. All we can do is to get stack traces and show the caller instead. To minimize the overhead it's limited to up to 8 stack traces and displays the first non-lock function symbol name as a caller. $ perf lock report -F acquired,contended,avg_wait,wait_total Name acquired contended avg wait total wait update_blocked_a... 40 40 3.61 us 144.45 us kernfs_fop_open+... 5 5 3.64 us 18.18 us _nohz_idle_balance 3 3 2.65 us 7.95 us tick_do_update_j... 1 1 6.04 us 6.04 us ep_scan_ready_list 1 1 3.93 us 3.93 us ...
Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Link: https://lore.kernel.org/r/20220615163222.1275500-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 160 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 156 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 546dad1963c8..c5ca34741561 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -9,6 +9,7 @@ #include "util/symbol.h" #include "util/thread.h" #include "util/header.h" +#include "util/callchain.h" #include #include @@ -19,6 +20,7 @@ #include "util/tool.h" #include "util/data.h" #include "util/string2.h" +#include "util/map.h" #include #include @@ -32,6 +34,7 @@ #include #include #include +#include static struct perf_session *session; @@ -120,6 +123,24 @@ static struct rb_root thread_stats; static bool combine_locks; static bool show_thread_stats; +/* + * CONTENTION_STACK_DEPTH + * Number of stack trace entries to find callers + */ +#define CONTENTION_STACK_DEPTH 8 + +/* + * CONTENTION_STACK_SKIP + * Number of stack trace entries to skip when finding callers. + * The first few entries belong to the locking implementation itself. + */ +#define CONTENTION_STACK_SKIP 3 + +static u64 sched_text_start; +static u64 sched_text_end; +static u64 lock_text_start; +static u64 lock_text_end; + static struct thread_stat *thread_stat_find(u32 tid) { struct rb_node *node; @@ -839,6 +860,116 @@ end: return 0; } +static bool is_lock_function(u64 addr) +{ + if (!sched_text_start) { + struct machine *machine = &session->machines.host; + struct map *kmap; + struct symbol *sym; + + sym = machine__find_kernel_symbol_by_name(machine, + "__sched_text_start", + &kmap); + if (!sym) { + /* to avoid retry */ + sched_text_start = 1; + return false; + } + + sched_text_start = kmap->unmap_ip(kmap, sym->start); + + /* should not fail from here */ + sym = machine__find_kernel_symbol_by_name(machine, + "__sched_text_end", + &kmap); + sched_text_end = kmap->unmap_ip(kmap, sym->start); + + sym = machine__find_kernel_symbol_by_name(machine, + "__lock_text_start", + &kmap); + lock_text_start = kmap->unmap_ip(kmap, sym->start); + + sym = machine__find_kernel_symbol_by_name(machine, + "__lock_text_end", + &kmap); + lock_text_end = kmap->unmap_ip(kmap, sym->start); + } + + /* failed to get kernel symbols */ + if (sched_text_start == 1) + return false; + + /* mutex and rwsem functions are in sched text section */ + if (sched_text_start <= addr && addr < sched_text_end) + return true; + + /* spinlock functions are in lock text section */ + if (lock_text_start <= addr && addr < lock_text_end) + return true; + + return false; +} + +static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sample, + char *buf, int size) +{ + struct thread *thread; + struct callchain_cursor *cursor = &callchain_cursor; + struct symbol *sym; + int skip = 0; + int ret; + + /* lock names will be replaced to task name later */ + if (show_thread_stats) + return -1; + + thread = machine__findnew_thread(&session->machines.host, + -1, sample->pid); + if (thread == NULL) + return -1; + + /* use caller function name from the callchain */ + ret = thread__resolve_callchain(thread, cursor, evsel, sample, + NULL, NULL, CONTENTION_STACK_DEPTH); + if (ret != 0) { + thread__put(thread); + return -1; + } + +
callchain_cursor_commit(cursor); + thread__put(thread); + + while (true) { + struct callchain_cursor_node *node; + + node = callchain_cursor_current(cursor); + if (node == NULL) + break; + + /* skip first few entries - for lock functions */ + if (++skip <= CONTENTION_STACK_SKIP) + goto next; + + sym = node->ms.sym; + if (sym && !is_lock_function(node->ip)) { + struct map *map = node->ms.map; + u64 offset; + + offset = map->map_ip(map, node->ip) - sym->start; + + if (offset) + scnprintf(buf, size, "%s+%#lx", sym->name, offset); + else + strlcpy(buf, sym->name, size); + return 0; + } + +next: + callchain_cursor_advance(cursor); + } + return -1; +} + static int report_lock_contention_begin_event(struct evsel *evsel, struct perf_sample *sample) { @@ -850,9 +981,18 @@ static int report_lock_contention_begin_event(struct evsel *evsel, if (show_thread_stats) addr = sample->tid; - ls = lock_stat_findnew(addr, "No name"); - if (!ls) - return -ENOMEM; + ls = lock_stat_find(addr); + if (!ls) { + char buf[128]; + const char *caller = buf; + + if (lock_contention_caller(evsel, sample, buf, sizeof(buf)) < 0) + caller = "Unknown"; + + ls = lock_stat_findnew(addr, caller); + if (!ls) + return -ENOMEM; + } ts = thread_stat_findnew(sample->tid); if (!ts) @@ -1233,6 +1373,7 @@ static int __cmd_report(bool display_info) struct perf_tool eops = { .sample = process_sample_event, .comm = perf_event__process_comm, + .mmap = perf_event__process_mmap, .namespaces = perf_event__process_namespaces, .ordered_events = true, }; @@ -1248,6 +1389,8 @@ static int __cmd_report(bool display_info) return PTR_ERR(session); } + /* for lock function check */ + symbol_conf.sort_by_name = true; symbol__init(&session->header.env); if (!perf_session__has_traces(session, "lock record")) @@ -1292,8 +1435,12 @@ static int __cmd_record(int argc, const char **argv) const char *record_args[] = { "record", "-R", "-m", "1024", "-c", "1", "--synth", "task", }; + const char *callgraph_args[] = { + "--call-graph", "fp," __stringify(CONTENTION_STACK_DEPTH), + }; unsigned int rec_argc, i, j, ret; unsigned int nr_tracepoints; + unsigned int nr_callgraph_args = 0; const char **rec_argv; bool has_lock_stat = true; @@ -1318,8 +1465,10 @@ static int __cmd_record(int argc, const char **argv) } } + nr_callgraph_args = ARRAY_SIZE(callgraph_args); + setup_args: - rec_argc = ARRAY_SIZE(record_args) + argc - 1; + rec_argc = ARRAY_SIZE(record_args) + nr_callgraph_args + argc - 1; if (has_lock_stat) nr_tracepoints = ARRAY_SIZE(lock_tracepoints); @@ -1351,6 +1500,9 @@ setup_args: rec_argv[i++] = ev_name; } + for (j = 0; j < nr_callgraph_args; j++, i++) + rec_argv[i] = callgraph_args[j]; + for (j = 1; j < (unsigned int)argc; j++, i++) rec_argv[i] = argv[j]; -- cgit v1.2.3 From a9d2fae89fa8eb638203d8a4da435c647c12dfa3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 12 Jul 2022 13:31:45 +0100 Subject: selftests/bpf: add a ksym iter subtest add subtest verifying BPF ksym iter behaviour. The BPF ksym iter program shows an example of dumping a format different to /proc/kallsyms. It adds KIND and MAX_SIZE fields which represent the kind of symbol (core kernel, module, ftrace, bpf, or kprobe) and the maximum size the symbol can be. The latter is calculated from the difference between current symbol value and the next symbol value. The key benefit for this iterator will likely be supporting in-kernel data-gathering rather than dumping symbol details to userspace and parsing the results. 
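The MAX_SIZE column comes purely from consecutive symbol addresses. A
minimal sketch of that bookkeeping (illustrative only; prev_max_size()
is a hypothetical helper, the real logic lives in dump_ksym() in the
new BPF program and uses its last_sym_value global):

  static unsigned long last_sym_value;

  /* While on symbol N+1, report how large symbol N can be at most. */
  static unsigned long prev_max_size(unsigned long cur_value)
  {
          unsigned long size;

          size = last_sym_value ? cur_value - last_sym_value : 0;
          last_sym_value = cur_value;
          return size;
  }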
Signed-off-by: Alan Maguire Acked-by: Yonghong Song Link: https://lore.kernel.org/r/1657629105-7812-3-git-send-email-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 16 +++++ tools/testing/selftests/bpf/progs/bpf_iter.h | 7 +++ tools/testing/selftests/bpf/progs/bpf_iter_ksym.c | 74 +++++++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ksym.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 7ff5fa93d056..a33874b081b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -27,6 +27,7 @@ #include "bpf_iter_test_kern5.skel.h" #include "bpf_iter_test_kern6.skel.h" #include "bpf_iter_bpf_link.skel.h" +#include "bpf_iter_ksym.skel.h" static int duration; @@ -1120,6 +1121,19 @@ static void test_link_iter(void) bpf_iter_bpf_link__destroy(skel); } +static void test_ksym_iter(void) +{ + struct bpf_iter_ksym *skel; + + skel = bpf_iter_ksym__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_ksym__open_and_load")) + return; + + do_dummy_read(skel->progs.dump_ksym); + + bpf_iter_ksym__destroy(skel); +} + #define CMP_BUFFER_SIZE 1024 static char task_vma_output[CMP_BUFFER_SIZE]; static char proc_maps_output[CMP_BUFFER_SIZE]; @@ -1267,4 +1281,6 @@ void test_bpf_iter(void) test_buf_neg_offset(); if (test__start_subtest("link-iter")) test_link_iter(); + if (test__start_subtest("ksym")) + test_ksym_iter(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index 97ec8bc76ae6..e9846606690d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -22,6 +22,7 @@ #define BTF_F_NONAME BTF_F_NONAME___not_used #define BTF_F_PTR_RAW BTF_F_PTR_RAW___not_used #define BTF_F_ZERO BTF_F_ZERO___not_used +#define bpf_iter__ksym bpf_iter__ksym___not_used #include "vmlinux.h" #undef bpf_iter_meta #undef bpf_iter__bpf_map @@ -44,6 +45,7 @@ #undef BTF_F_NONAME #undef BTF_F_PTR_RAW #undef BTF_F_ZERO +#undef bpf_iter__ksym struct bpf_iter_meta { struct seq_file *seq; @@ -151,3 +153,8 @@ enum { BTF_F_PTR_RAW = (1ULL << 2), BTF_F_ZERO = (1ULL << 3), }; + +struct bpf_iter__ksym { + struct bpf_iter_meta *meta; + struct kallsym_iter *ksym; +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c b/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c new file mode 100644 index 000000000000..285c008cbf9c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ksym.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022, Oracle and/or its affiliates. */ +#include "bpf_iter.h" +#include + +char _license[] SEC("license") = "GPL"; + +unsigned long last_sym_value = 0; + +static inline char tolower(char c) +{ + if (c >= 'A' && c <= 'Z') + c += ('a' - 'A'); + return c; +} + +static inline char toupper(char c) +{ + if (c >= 'a' && c <= 'z') + c -= ('a' - 'A'); + return c; +} + +/* Dump symbols with max size; the latter is calculated by caching symbol N value + * and when iterating on symbol N+1, we can print max size of symbol N via + * address of N+1 - address of N. 
+ */ +SEC("iter/ksym") +int dump_ksym(struct bpf_iter__ksym *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct kallsym_iter *iter = ctx->ksym; + __u32 seq_num = ctx->meta->seq_num; + unsigned long value; + char type; + int ret; + + if (!iter) + return 0; + + if (seq_num == 0) { + BPF_SEQ_PRINTF(seq, "ADDR TYPE NAME MODULE_NAME KIND MAX_SIZE\n"); + return 0; + } + if (last_sym_value) + BPF_SEQ_PRINTF(seq, "0x%x\n", iter->value - last_sym_value); + else + BPF_SEQ_PRINTF(seq, "\n"); + + value = iter->show_value ? iter->value : 0; + + last_sym_value = value; + + type = iter->type; + + if (iter->module_name[0]) { + type = iter->exported ? toupper(type) : tolower(type); + BPF_SEQ_PRINTF(seq, "0x%llx %c %s [ %s ] ", + value, type, iter->name, iter->module_name); + } else { + BPF_SEQ_PRINTF(seq, "0x%llx %c %s ", value, type, iter->name); + } + if (!iter->pos_arch_end || iter->pos_arch_end > iter->pos) + BPF_SEQ_PRINTF(seq, "CORE "); + else if (!iter->pos_mod_end || iter->pos_mod_end > iter->pos) + BPF_SEQ_PRINTF(seq, "MOD "); + else if (!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > iter->pos) + BPF_SEQ_PRINTF(seq, "FTRACE_MOD "); + else if (!iter->pos_bpf_end || iter->pos_bpf_end > iter->pos) + BPF_SEQ_PRINTF(seq, "BPF "); + else + BPF_SEQ_PRINTF(seq, "KPROBE "); + return 0; +} -- cgit v1.2.3 From 874190fd4ee894bbbcc0d4df4c55ebf93af9c011 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 22:57:51 +0000 Subject: KVM: selftests: Test MONITOR and MWAIT, not just MONITOR for quirk Fix a copy+paste error in monitor_mwait_test by switching one of the two "monitor" instructions to an "mwait". The intent of the test is very much to verify the quirk handles both MONITOR and MWAIT. Fixes: 2325d4dd7321 ("KVM: selftests: Add MONITOR/MWAIT quirk test") Reported-by: Yuan Yao Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220711225753.1073989-2-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 49f2ed1c53fe..f5c09cb528ae 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -34,7 +34,7 @@ static void guest_monitor_wait(int testcase) else GUEST_ASSERT_2(!vector, testcase, vector); - vector = kvm_asm_safe("monitor"); + vector = kvm_asm_safe("mwait"); if (fault_wanted) GUEST_ASSERT_2(vector == UD_VECTOR, testcase, vector); else -- cgit v1.2.3 From b624ae35418ce9424f639f8ffa2568e7674c262b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 22:57:52 +0000 Subject: KVM: selftests: Provide valid inputs for MONITOR/MWAIT regs Provide valid inputs for RAX, RCX, and RDX when testing whether or not KVM injects a #UD on MONITOR/MWAIT. SVM has a virtualization hole and checks for _all_ faults before checking for intercepts, e.g. MONITOR with an unsupported RCX will #GP before KVM gets a chance to intercept and emulate. 
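For background on the register contract (a sketch, not part of the
patch): MONITOR takes a linear address in RAX and expects the extension
and hint words in RCX/RDX, so any mapped address plus zeroed RCX/RDX is
a fault-free input on both VMX and SVM. A hypothetical wrapper around
the selftests' kvm_asm_safe() would look like:

  static inline uint8_t monitor_safe(void *linear_addr)
  {
          /* RAX = monitored address, RCX = extensions, RDX = hints. */
          return kvm_asm_safe("monitor", "a"(linear_addr), "c"(0), "d"(0));
  }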
Fixes: 2325d4dd7321 ("KVM: selftests: Add MONITOR/MWAIT quirk test") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220711225753.1073989-3-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index f5c09cb528ae..6a4ebcdfa374 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -28,13 +28,17 @@ static void guest_monitor_wait(int testcase) GUEST_SYNC(testcase); - vector = kvm_asm_safe("monitor"); + /* + * Arbitrarily MONITOR this function, SVM performs fault checks before + * intercept checks, so the inputs for MONITOR and MWAIT must be valid. + */ + vector = kvm_asm_safe("monitor", "a"(guest_monitor_wait), "c"(0), "d"(0)); if (fault_wanted) GUEST_ASSERT_2(vector == UD_VECTOR, testcase, vector); else GUEST_ASSERT_2(!vector, testcase, vector); - vector = kvm_asm_safe("mwait"); + vector = kvm_asm_safe("mwait", "a"(guest_monitor_wait), "c"(0), "d"(0)); if (fault_wanted) GUEST_ASSERT_2(vector == UD_VECTOR, testcase, vector); else -- cgit v1.2.3 From 914f6a59b10f41a8baf62d625087e6586d4762af Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 11 Jul 2022 12:16:33 -0700 Subject: selftests: mptcp: add MPC backup tests Add a couple of test-cases covering the newly introduced features - priority update for the MPC subflow. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 55efe2aafb84..ff83ef426df5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2428,6 +2428,36 @@ backup_tests() chk_add_nr 1 1 chk_prio_nr 1 1 fi + + if reset "mpc backup"; then + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow + chk_join_nr 0 0 0 + chk_prio_nr 0 1 + fi + + if reset "mpc backup both sides"; then + pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow + chk_join_nr 0 0 0 + chk_prio_nr 1 1 + fi + + if reset "mpc switch to backup"; then + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup + chk_join_nr 0 0 0 + chk_prio_nr 0 1 + fi + + if reset "mpc switch to backup both sides"; then + pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup + chk_join_nr 0 0 0 + chk_prio_nr 1 1 + fi } add_addr_ports_tests() -- cgit v1.2.3 From 8ed2f5a6f385b5fff313208b90ea83f7121bd909 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Tue, 12 Jul 2022 10:57:45 +0800 Subject: libbpf: Error out when binary_path is NULL for uprobe and USDT binary_path is a required non-null parameter for bpf_program__attach_usdt and bpf_program__attach_uprobe_opts. Check it against NULL to prevent coredump on strchr. 
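The shape of the fix, reduced to a standalone sketch
(validate_binary_path() is a hypothetical helper for illustration; the
real guards sit inline at the top of both attach functions in the diff
below):

  #include <errno.h>
  #include <string.h>

  /* Reject NULL before strchr() can dereference it; a '/'-less name
   * still needs to be resolved against the standard search paths.
   */
  static int validate_binary_path(const char *binary_path)
  {
          if (!binary_path)
                  return -EINVAL;

          return strchr(binary_path, '/') ? 0 : 1;
  }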
Signed-off-by: Hengqi Chen
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20220712025745.2703995-1-hengqi.chen@gmail.com
---
 tools/lib/bpf/libbpf.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index cb49408eb298..72548798126b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10545,7 +10545,10 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid,
 	ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0);
 	pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);
 
-	if (binary_path && !strchr(binary_path, '/')) {
+	if (!binary_path)
+		return libbpf_err_ptr(-EINVAL);
+
+	if (!strchr(binary_path, '/')) {
 		err = resolve_full_path(binary_path, full_binary_path,
 					sizeof(full_binary_path));
 		if (err) {
@@ -10559,11 +10562,6 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid,
 	if (func_name) {
 		long sym_off;
 
-		if (!binary_path) {
-			pr_warn("prog '%s': name-based attach requires binary_path\n",
-				prog->name);
-			return libbpf_err_ptr(-EINVAL);
-		}
 		sym_off = elf_find_func_offset(binary_path, func_name);
 		if (sym_off < 0)
 			return libbpf_err_ptr(sym_off);
@@ -10711,6 +10709,9 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog,
 		return libbpf_err_ptr(-EINVAL);
 	}
 
+	if (!binary_path)
+		return libbpf_err_ptr(-EINVAL);
+
 	if (!strchr(binary_path, '/')) {
 		err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path));
 		if (err) {
-- 
cgit v1.2.3


From 14fd95bf145ddb8201406b89c83faf24e7e3d52f Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov
Date: Thu, 14 Jul 2022 01:11:15 +0000
Subject: KVM: selftests: Use "a" and "d" to set EAX/EDX for wrmsr_safe()

Do not use GCC's "A" constraint to load EAX:EDX in wrmsr_safe(). Per
GCC's documentation on x86-specific constraints, "A" will not actually
load a 64-bit value into EAX:EDX on x86-64.

  The a and d registers. This class is used for instructions that
  return double word results in the ax:dx register pair. Single word
  values will be allocated either in ax or dx. For example on i386 the
  following implements rdtsc:

    unsigned long long rdtsc (void)
    {
      unsigned long long tick;
      __asm__ __volatile__("rdtsc":"=A"(tick));
      return tick;
    }

  This is not correct on x86-64 as it would allocate tick in either ax
  or dx. You have to use the following variant instead:

    unsigned long long rdtsc (void)
    {
      unsigned int tickl, tickh;
      __asm__ __volatile__("rdtsc":"=a"(tickl),"=d"(tickh));
      return ((unsigned long long)tickh << 32)|tickl;
    }

Because a u64 fits in a single 64-bit register, using "A" for selftests,
which are 64-bit only, results in GCC loading the value into either RAX
or RDX instead of splitting it across EAX:EDX.

E.g.:

  kvm_exit:             reason MSR_WRITE rip 0x402919 info 0 0
  kvm_msr:              msr_write 40000118 = 0x60000000001 (#GP)
  ...
With "A": 48 8b 43 08 mov 0x8(%rbx),%rax 49 b9 ba da ca ba 0a movabs $0xabacadaba,%r9 00 00 00 4c 8d 15 07 00 00 00 lea 0x7(%rip),%r10 # 402f44 4c 8d 1d 06 00 00 00 lea 0x6(%rip),%r11 # 402f4a 0f 30 wrmsr With "a"/"d": 48 8b 53 08 mov 0x8(%rbx),%rdx 89 d0 mov %edx,%eax 48 c1 ea 20 shr $0x20,%rdx 49 b9 ba da ca ba 0a movabs $0xabacadaba,%r9 00 00 00 4c 8d 15 07 00 00 00 lea 0x7(%rip),%r10 # 402fc3 4c 8d 1d 06 00 00 00 lea 0x6(%rip),%r11 # 402fc9 0f 30 wrmsr Fixes: 3b23054cd3f5 ("KVM: selftests: Add x86-64 support for exception fixup") Signed-off-by: Vitaly Kuznetsov Link: https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html#Machine-Constraints [sean: use "& -1u", provide GCC blurb and link to documentation] Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220714011115.3135828-1-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 79dcf6be1b47..71e942ffac77 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -612,7 +612,7 @@ static inline uint8_t rdmsr_safe(uint32_t msr, uint64_t *val) static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) { - return kvm_asm_safe("wrmsr", "A"(val), "c"(msr)); + return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); } uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 43bb9e000ea4c62154c01844771fea25b8b83520 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 22:57:53 +0000 Subject: KVM: x86: Tweak name of MONITOR/MWAIT #UD quirk to make it #UD specific Add a "UD" clause to KVM_X86_QUIRK_MWAIT_NEVER_FAULTS to make it clear that the quirk only controls the #UD behavior of MONITOR/MWAIT. KVM doesn't currently enforce fault checks when MONITOR/MWAIT are supported, but that could change in the future. SVM also has a virtualization hole in that it checks all faults before intercepts, and so "never faults" is already a lie when running on SVM. Fixes: bfbcc81bb82c ("KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" behavior") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220711225753.1073989-4-seanjc@google.com --- Documentation/virt/kvm/api.rst | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/uapi/asm/kvm.h | 2 +- arch/x86/kvm/x86.c | 2 +- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index bafaeedd455c..cd9361f22530 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7523,7 +7523,7 @@ The valid bits in cap.args[0] are: incorrect hypercall instruction will generate a #UD within the guest. -KVM_X86_QUIRK_MWAIT_NEVER_FAULTS By default, KVM emulates MONITOR/MWAIT (if +KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if they are intercepted) as NOPs regardless of whether or not MONITOR/MWAIT are supported according to guest CPUID. 
When this quirk diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd6a26f7d46c..d4ece7bf2124 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2096,6 +2096,6 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ee3896416c68..a0c0ab0c898e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,7 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) -#define KVM_X86_QUIRK_MWAIT_NEVER_FAULTS (1 << 6) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67dcaa670874..db46c060acb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2046,7 +2046,7 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) && + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) return kvm_handle_invalid_op(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 6a4ebcdfa374..094c68d744c0 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) disabled_quirks = 0; if (testcase & MWAIT_QUIRK_DISABLED) - disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_FAULTS; + disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; if (testcase & MISC_ENABLES_QUIRK_DISABLED) disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); -- cgit v1.2.3 From 8fb2638a568d8ef635bbef4f61eb6587d2ebd8da Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Wed, 15 Jun 2022 19:31:13 +0000 Subject: KVM: selftests: enumerate GUEST_ASSERT arguments Enumerate GUEST_ASSERT arguments to avoid magic indices to ucall.args. Signed-off-by: Colton Lewis Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20220615193116.806312-2-coltonlewis@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/ucall_common.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 98562f685151..dbe872870b83 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -32,6 +32,14 @@ uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) #define GUEST_DONE() ucall(UCALL_DONE, 0) + +enum guest_assert_builtin_args { + GUEST_ERROR_STRING, + GUEST_FILE, + GUEST_LINE, + GUEST_ASSERT_BUILTIN_NARGS +}; + #define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) 
do { \ if (!(_condition)) \ ucall(UCALL_ABORT, 2 + _nargs, \ -- cgit v1.2.3 From fc573fa4f38aa0cecace00d00cc60109ae947834 Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Wed, 15 Jun 2022 19:31:14 +0000 Subject: KVM: selftests: Increase UCALL_MAX_ARGS to 7 Increase UCALL_MAX_ARGS to 7 to allow GUEST_ASSERT_4 to pass 3 builtin ucall arguments specified in guest_assert_builtin_args plus 4 user-specified arguments. Signed-off-by: Colton Lewis Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20220615193116.806312-3-coltonlewis@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/ucall_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index dbe872870b83..568c562f14cd 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -16,7 +16,7 @@ enum { UCALL_UNHANDLED, }; -#define UCALL_MAX_ARGS 6 +#define UCALL_MAX_ARGS 7 struct ucall { uint64_t cmd; -- cgit v1.2.3 From ddcb57afd5815191f02aec12f18b4d1bbad5fb9d Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Wed, 15 Jun 2022 19:31:15 +0000 Subject: KVM: selftests: Write REPORT_GUEST_ASSERT macros to pair with GUEST_ASSERT Write REPORT_GUEST_ASSERT macros to pair with GUEST_ASSERT to abstract and make consistent all guest assertion reporting. Every report includes an explanatory string, a filename, and a line number. Signed-off-by: Colton Lewis Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20220615193116.806312-4-coltonlewis@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/ucall_common.h | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 568c562f14cd..e8af3b4fef6d 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -6,6 +6,7 @@ */ #ifndef SELFTEST_KVM_UCALL_COMMON_H #define SELFTEST_KVM_UCALL_COMMON_H +#include "test_util.h" /* Common ucalls */ enum { @@ -64,4 +65,45 @@ enum guest_assert_builtin_args { #define GUEST_ASSERT_EQ(a, b) __GUEST_ASSERT((a) == (b), #a " == " #b, 2, a, b) +#define __REPORT_GUEST_ASSERT(_ucall, fmt, _args...) \ + TEST_FAIL("%s at %s:%ld\n" fmt, \ + (const char *)(_ucall).args[GUEST_ERROR_STRING], \ + (const char *)(_ucall).args[GUEST_FILE], \ + (_ucall).args[GUEST_LINE], \ + ##_args) + +#define GUEST_ASSERT_ARG(ucall, i) ((ucall).args[GUEST_ASSERT_BUILTIN_NARGS + i]) + +#define REPORT_GUEST_ASSERT(ucall) \ + __REPORT_GUEST_ASSERT((ucall), "") + +#define REPORT_GUEST_ASSERT_1(ucall, fmt) \ + __REPORT_GUEST_ASSERT((ucall), \ + fmt, \ + GUEST_ASSERT_ARG((ucall), 0)) + +#define REPORT_GUEST_ASSERT_2(ucall, fmt) \ + __REPORT_GUEST_ASSERT((ucall), \ + fmt, \ + GUEST_ASSERT_ARG((ucall), 0), \ + GUEST_ASSERT_ARG((ucall), 1)) + +#define REPORT_GUEST_ASSERT_3(ucall, fmt) \ + __REPORT_GUEST_ASSERT((ucall), \ + fmt, \ + GUEST_ASSERT_ARG((ucall), 0), \ + GUEST_ASSERT_ARG((ucall), 1), \ + GUEST_ASSERT_ARG((ucall), 2)) + +#define REPORT_GUEST_ASSERT_4(ucall, fmt) \ + __REPORT_GUEST_ASSERT((ucall), \ + fmt, \ + GUEST_ASSERT_ARG((ucall), 0), \ + GUEST_ASSERT_ARG((ucall), 1), \ + GUEST_ASSERT_ARG((ucall), 2), \ + GUEST_ASSERT_ARG((ucall), 3)) + +#define REPORT_GUEST_ASSERT_N(ucall, fmt, args...) 
\ + __REPORT_GUEST_ASSERT((ucall), fmt, ##args) + #endif /* SELFTEST_KVM_UCALL_COMMON_H */ -- cgit v1.2.3 From 594a1c271c159c9c5f0ff2d92ebfda469e94e48d Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Wed, 15 Jun 2022 19:31:16 +0000 Subject: KVM: selftests: Fix filename reporting in guest asserts Fix filename reporting in guest asserts by ensuring the GUEST_ASSERT macro records __FILE__ and substituting REPORT_GUEST_ASSERT for many repetitive calls to TEST_FAIL. Previously filename was reported by using __FILE__ directly in the selftest, wrongly assuming it would always be the same as where the assertion failed. Signed-off-by: Colton Lewis Reported-by: Ricardo Koller Fixes: 4e18bccc2e5544f0be28fc1c4e6be47a469d6c60 Link: https://lore.kernel.org/r/20220615193116.806312-5-coltonlewis@google.com [sean: convert more TEST_FAIL => REPORT_GUEST_ASSERT instances] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 11 +++++---- .../selftests/kvm/aarch64/debug-exceptions.c | 4 +--- tools/testing/selftests/kvm/aarch64/hypercalls.c | 7 +++--- tools/testing/selftests/kvm/aarch64/psci_test.c | 3 +-- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 4 +--- tools/testing/selftests/kvm/include/ucall_common.h | 11 ++++----- tools/testing/selftests/kvm/memslot_perf_test.c | 4 +--- tools/testing/selftests/kvm/s390x/tprot.c | 26 ++++++++++------------ .../testing/selftests/kvm/set_memory_region_test.c | 3 +-- tools/testing/selftests/kvm/steal_time.c | 3 +-- .../selftests/kvm/system_counter_offset_test.c | 3 +-- tools/testing/selftests/kvm/x86_64/amx_test.c | 3 +-- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 3 +-- .../selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 2 +- .../selftests/kvm/x86_64/emulator_error_test.c | 3 +-- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 3 +-- .../selftests/kvm/x86_64/fix_hypercall_test.c | 2 +- tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 3 +-- .../testing/selftests/kvm/x86_64/hyperv_features.c | 8 ++----- .../testing/selftests/kvm/x86_64/hyperv_svm_test.c | 3 +-- .../testing/selftests/kvm/x86_64/kvm_clock_test.c | 3 +-- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 4 +--- .../selftests/kvm/x86_64/monitor_mwait_test.c | 4 +--- .../testing/selftests/kvm/x86_64/set_boot_cpu_id.c | 4 +--- tools/testing/selftests/kvm/x86_64/state_test.c | 3 +-- .../selftests/kvm/x86_64/svm_int_ctl_test.c | 2 +- .../kvm/x86_64/svm_nested_soft_inject_test.c | 3 +-- .../testing/selftests/kvm/x86_64/svm_vmcall_test.c | 2 +- .../selftests/kvm/x86_64/triple_fault_event_test.c | 2 +- tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 4 +--- .../selftests/kvm/x86_64/userspace_io_test.c | 4 +--- .../selftests/kvm/x86_64/userspace_msr_exit_test.c | 5 ++--- .../selftests/kvm/x86_64/vmx_apic_access_test.c | 3 +-- .../kvm/x86_64/vmx_close_while_nested_test.c | 2 +- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 3 +-- .../kvm/x86_64/vmx_invalid_nested_guest_state.c | 2 +- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 2 +- .../kvm/x86_64/vmx_preemption_timer_test.c | 3 +-- .../selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 2 +- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 2 +- .../testing/selftests/kvm/x86_64/xen_vmcall_test.c | 2 +- 41 files changed, 68 insertions(+), 102 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index f68019be67c0..574eb73f0e90 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ 
b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -231,10 +231,13 @@ static void *test_vcpu_run(void *arg) break; case UCALL_ABORT: sync_global_from_guest(vm, *shared_data); - TEST_FAIL("%s at %s:%ld\n\tvalues: %lu, %lu; %lu, vcpu: %u; stage: %u; iter: %u", - (const char *)uc.args[0], __FILE__, uc.args[1], - uc.args[2], uc.args[3], uc.args[4], vcpu_idx, - shared_data->guest_stage, shared_data->nr_iter); + REPORT_GUEST_ASSERT_N(uc, "values: %lu, %lu; %lu, vcpu %u; stage; %u; iter: %u", + GUEST_ASSERT_ARG(uc, 0), + GUEST_ASSERT_ARG(uc, 1), + GUEST_ASSERT_ARG(uc, 2), + vcpu_idx, + shared_data->guest_stage, + shared_data->nr_iter); break; default: TEST_FAIL("Unexpected guest exit\n"); diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index b8072b40ccc8..2ee35cf9801e 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -283,9 +283,7 @@ int main(int argc, char *argv[]) stage, (ulong)uc.args[1]); break; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx", - (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); break; case UCALL_DONE: goto done; diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 5fce4969cbb9..a39da3fe4952 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -291,9 +291,10 @@ static void test_run(void) guest_done = true; break; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld\n\tvalues: 0x%lx, 0x%lx; 0x%lx, stage: %u", - (const char *)uc.args[0], __FILE__, uc.args[1], - uc.args[2], uc.args[3], uc.args[4], stage); + REPORT_GUEST_ASSERT_N(uc, "values: 0x%lx, 0x%lx; 0x%lx, stage: %u", + GUEST_ASSERT_ARG(uc, 0), + GUEST_ASSERT_ARG(uc, 1), + GUEST_ASSERT_ARG(uc, 2), stage); break; default: TEST_FAIL("Unexpected guest exit\n"); diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index b665b534cb78..f7621f6e938e 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -94,8 +94,7 @@ static void enter_guest(struct kvm_vcpu *vcpu) vcpu_run(vcpu); if (get_ucall(vcpu, &uc) == UCALL_ABORT) - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, - uc.args[1]); + REPORT_GUEST_ASSERT(uc); } static void assert_vcpu_reset(struct kvm_vcpu *vcpu) diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index 046ba4fde648..17417220a083 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -782,9 +782,7 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) run_guest_cmd(vcpu, gic_fd, &inject_args, &args); break; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx", - (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); break; case UCALL_DONE: goto done; diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index e8af3b4fef6d..ee79d180e07e 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -41,11 +41,12 @@ enum guest_assert_builtin_args { 
GUEST_ASSERT_BUILTIN_NARGS }; -#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) do { \ - if (!(_condition)) \ - ucall(UCALL_ABORT, 2 + _nargs, \ - "Failed guest assert: " \ - _condstr, __LINE__, _args); \ +#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) \ +do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, GUEST_ASSERT_BUILTIN_NARGS + _nargs, \ + "Failed guest assert: " _condstr, \ + __FILE__, __LINE__, ##_args); \ } while (0) #define GUEST_ASSERT(_condition) \ diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 5f98489e4f4d..44995446d942 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -162,9 +162,7 @@ static void *vcpu_worker(void *__data) goto done; break; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld, val = %lu", - (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2]); + REPORT_GUEST_ASSERT_1(uc, "val = %lu"); break; case UCALL_DONE: goto done; diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index 015a13056503..a9a0b76e5fa4 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -181,20 +181,18 @@ static void guest_code(void) GUEST_SYNC(perform_next_stage(&i, mapped_0)); } -#define HOST_SYNC_NO_TAP(vcpup, stage) \ -({ \ - struct kvm_vcpu *__vcpu = (vcpup); \ - struct ucall uc; \ - int __stage = (stage); \ - \ - vcpu_run(__vcpu); \ - get_ucall(__vcpu, &uc); \ - if (uc.cmd == UCALL_ABORT) { \ - TEST_FAIL("line %lu: %s, hints: %lu, %lu", uc.args[1], \ - (const char *)uc.args[0], uc.args[2], uc.args[3]); \ - } \ - ASSERT_EQ(uc.cmd, UCALL_SYNC); \ - ASSERT_EQ(uc.args[1], __stage); \ +#define HOST_SYNC_NO_TAP(vcpup, stage) \ +({ \ + struct kvm_vcpu *__vcpu = (vcpup); \ + struct ucall uc; \ + int __stage = (stage); \ + \ + vcpu_run(__vcpu); \ + get_ucall(__vcpu, &uc); \ + if (uc.cmd == UCALL_ABORT) \ + REPORT_GUEST_ASSERT_2(uc, "hints: %lu, %lu"); \ + ASSERT_EQ(uc.cmd, UCALL_SYNC); \ + ASSERT_EQ(uc.args[1], __stage); \ }) #define HOST_SYNC(vcpu, stage) \ diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 47b219dd60e4..0d55f508d595 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -88,8 +88,7 @@ static void *vcpu_worker(void *data) } if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) - TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2]); + REPORT_GUEST_ASSERT_1(uc, "val = %lu"); return NULL; } diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index d122f1e05cdd..9866a71463d7 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -234,8 +234,7 @@ static void run_vcpu(struct kvm_vcpu *vcpu) case UCALL_DONE: break; case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); default: TEST_ASSERT(false, "Unexpected exit: %s", exit_reason_str(vcpu->run->exit_reason)); diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index 862a8e93e070..1c274933912b 100644 --- a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ 
-83,8 +83,7 @@ static void handle_sync(struct ucall *uc, uint64_t start, uint64_t end) static void handle_abort(struct ucall *uc) { - TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], - __FILE__, uc->args[1]); + REPORT_GUEST_ASSERT(*uc); } static void enter_guest(struct kvm_vcpu *vcpu) diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index dab4ca16a2df..b71763b11b78 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -373,8 +373,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: switch (uc.args[1]) { diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 4aa784932597..3767a0cc694b 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -132,8 +132,7 @@ static void run_vcpu(struct kvm_vcpu *vcpu, int stage) case UCALL_DONE: return; case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); default: TEST_ASSERT(false, "Unexpected exit: %s", exit_reason_str(vcpu->run->exit_reason)); diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index a80940ac420f..56d8ab92eed4 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) vcpu_sregs_set(vcpu, &sregs); break; case UCALL_ABORT: - TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); + REPORT_GUEST_ASSERT(uc); break; case UCALL_DONE: goto done; diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index bfff2d271c48..3aa3d17f230f 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -92,8 +92,7 @@ static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu) static void do_guest_assert(struct ucall *uc) { - TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], __FILE__, - uc->args[1]); + REPORT_GUEST_ASSERT(*uc); } static void check_for_guest_assert(struct kvm_vcpu *vcpu) diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 8dda527cc080..aacad86d90e1 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -236,8 +236,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index f6f251ce59e1..b1905d280ef5 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -112,7 +112,7 @@ static void enter_guest(struct kvm_vcpu *vcpu) case UCALL_DONE: return; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char 
*)uc.args[0], __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); default: TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)", uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason)); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index f7a9e29ff0c7..d576bc8ce823 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -234,8 +234,7 @@ int main(void) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index c05acd78548f..2070ba0d6392 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -447,9 +447,7 @@ static void guest_test_msrs_access(void) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld, MSR = %lx, vector = %lx", - (const char *)uc.args[0], __FILE__, - uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "MSR = %lx, vector = %lx"); return; case UCALL_DONE: break; @@ -618,9 +616,7 @@ static void guest_test_hcalls_access(void) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld, arg1 = %lx, arg2 = %lx", - (const char *)uc.args[0], __FILE__, - uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "arg1 = %lx, arg2 = %lx"); return; case UCALL_DONE: break; diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index c5cd9835dbd6..b7dc243ab8d5 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -145,8 +145,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 138455575a11..813ce282cf56 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -71,8 +71,7 @@ static void handle_sync(struct ucall *uc, struct kvm_clock_data *start, static void handle_abort(struct ucall *uc) { - TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], - __FILE__, uc->args[1]); + REPORT_GUEST_ASSERT(*uc); } static void setup_clock(struct kvm_vm *vm, struct test_case *test_case) diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index feff85e43be3..ea452444f4af 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -137,9 +137,7 @@ static void enter_guest(struct kvm_vcpu *vcpu) pr_hcall(&uc); break; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld, vector = %lu", - (const char *)uc.args[0], __FILE__, - uc.args[1], uc.args[2]); + REPORT_GUEST_ASSERT_1(uc, "vector = %lu"); return; case UCALL_DONE: return; diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 094c68d744c0..2bf6851b4f42 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ 
b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -100,9 +100,7 @@ int main(int argc, char *argv[]) testcase = uc.args[1]; break; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld, testcase = %lx, vector = %ld", - (const char *)uc.args[0], __FILE__, - uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "testcase = %lx, vector = %ld"); goto done; case UCALL_DONE: goto done; diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c index 7ef713fdd0a5..b25d7556b638 100644 --- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -65,9 +65,7 @@ static void run_vcpu(struct kvm_vcpu *vcpu) stage); break; case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", - (const char *)uc.args[0], __FILE__, - uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); default: TEST_ASSERT(false, "Unexpected exit: %s", exit_reason_str(vcpu->run->exit_reason)); diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index e2f1f35e51ff..2b0de1598ab8 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -190,8 +190,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c index 9c68a47b69e1..d978d1697f5a 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); break; /* NOT REACHED */ case UCALL_DONE: diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 1c3f457aa3aa..07253e22defd 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -181,8 +181,7 @@ static void run_test(bool is_nmi) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld, vals = 0x%lx 0x%lx 0x%lx", (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2], uc.args[3], uc.args[4]); + REPORT_GUEST_ASSERT_3(uc, "vals = 0x%lx 0x%lx 0x%lx"); break; /* NOT REACHED */ case UCALL_DONE: diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c index e6d7191866a5..d53b1f7abb56 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -58,7 +58,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 5a202ecb8ea0..d1274c097b36 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -82,7 +82,7 @@ 
int main(void) case UCALL_DONE: break; case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); default: TEST_FAIL("Unexpected ucall: %lu", uc.cmd); } diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c index 3165d3f7e065..22d366c697f7 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -79,9 +79,7 @@ static void run_vcpu(struct kvm_vcpu *vcpu, int stage) case UCALL_DONE: return; case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%ld\n" \ - "\tvalues: %#lx, %#lx", (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); default: TEST_ASSERT(false, "Unexpected exit: %s", exit_reason_str(vcpu->run->exit_reason)); diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c index 7538d57a41d5..7316521428f8 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c @@ -98,9 +98,7 @@ int main(int argc, char *argv[]) case UCALL_DONE: break; case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld : argN+1 = 0x%lx, argN+2 = 0x%lx", - (const char *)uc.args[0], __FILE__, uc.args[1], - uc.args[2], uc.args[3]); + REPORT_GUEST_ASSERT_2(uc, "argN+1 = 0x%lx, argN+2 = 0x%lx"); default: TEST_FAIL("Unknown ucall %lu", uc.cmd); } diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c index f84dc37426f5..a4f06370a245 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -400,8 +400,7 @@ static void check_for_guest_assert(struct kvm_vcpu *vcpu) if (vcpu->run->exit_reason == KVM_EXIT_IO && get_ucall(vcpu, &uc) == UCALL_ABORT) { - TEST_FAIL("%s at %s:%ld", - (const char *)uc.args[0], __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); } } @@ -610,7 +609,7 @@ static int handle_ucall(struct kvm_vcpu *vcpu) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("Guest assertion not met"); + REPORT_GUEST_ASSERT(uc); break; case UCALL_SYNC: vm_ioctl(vcpu->vm, KVM_X86_SET_MSR_FILTER, &no_filter_deny); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index ccb05ef7234e..d3582cea1258 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -114,8 +114,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: apic_access_addr = uc.args[1]; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index 40c77bb706a1..e69e8963ed08 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -74,7 +74,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ default: TEST_FAIL("Unknown ucall %lu", uc.cmd); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c 
b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 215ffa0589d4..f378960299c0 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -123,8 +123,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: /* diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c index 683f4f0a1616..8c854738f2cc 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) case UCALL_DONE: break; case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); default: TEST_FAIL("Unexpected ucall: %lu", uc.cmd); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index ff4644038c55..6bfef77b87b7 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -194,7 +194,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *) uc.args[0]); + REPORT_GUEST_ASSERT(uc); case UCALL_SYNC: switch (uc.args[0]) { case USLEEP: diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index 99e57b0cc2c9..0a8e989d4200 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -189,8 +189,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index e32bfb102699..2e75eef926ca 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -147,7 +147,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: report(uc.args[1]); diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index a4a78637c35a..8a5cb800f50e 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -542,7 +542,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: { struct kvm_xen_vcpu_attr rst; diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index 8b76cade9bcd..88914d48c65e 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu, 
&uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; -- cgit v1.2.3 From 4c16fa3ee945183a2f4718a45035e1835c19205b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:26 +0000 Subject: KVM: selftests: Set KVM's supported CPUID as vCPU's CPUID during recreate On x86-64, set KVM's supported CPUID as the vCPU's CPUID when recreating a VM+vCPU to deduplicate code for state save/restore tests, and to provide symmetry of sorts with respect to vm_create_with_one_vcpu(). The extra KVM_SET_CPUID2 call is wasteful for Hyper-V, but ultimately is nothing more than an expensive nop, and overriding the vCPU's CPUID with the Hyper-V CPUID information is the only known scenario where a state save/restore test wouldn't need/want the default CPUID. Opportunistically use __weak for the default vm_compute_max_gfn(), it's provided by tools' compiler.h. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-2-seanjc@google.com --- tools/testing/selftests/kvm/include/kvm_util_base.h | 9 +++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 10 ++++++++-- tools/testing/selftests/kvm/lib/x86_64/processor.c | 9 +++++++++ tools/testing/selftests/kvm/x86_64/amx_test.c | 1 - tools/testing/selftests/kvm/x86_64/smm_test.c | 1 - tools/testing/selftests/kvm/x86_64/state_test.c | 1 - tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c | 2 -- 7 files changed, 26 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index b78e3c7a2566..19f7b33a1e58 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -739,6 +739,15 @@ static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, return vm_arch_vcpu_add(vm, vcpu_id, guest_code); } +/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. 
*/ +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); + +static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) +{ + return vm_arch_vcpu_recreate(vm, vcpu_id); +} + void virt_arch_pgd_alloc(struct kvm_vm *vm); static inline void virt_pgd_alloc(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 768f3bce0161..49c189878af3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -388,11 +388,17 @@ void kvm_vm_restart(struct kvm_vm *vmp) } } +__weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) +{ + return __vm_vcpu_add(vm, vcpu_id); +} + struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) { kvm_vm_restart(vm); - return __vm_vcpu_add(vm, 0); + return vm_vcpu_recreate(vm, 0); } /* @@ -1812,7 +1818,7 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); } -unsigned long __attribute__((weak)) vm_compute_max_gfn(struct kvm_vm *vm) +unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm) { return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 1a32b1c75e9a..909ac12e5ec5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -663,6 +663,15 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, return vcpu; } +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); + + vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + + return vcpu; +} + /* * Allocate an instance of struct kvm_cpuid2 * diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index b71763b11b78..4c59923f454f 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -424,7 +424,6 @@ int main(int argc, char *argv[]) /* Restore state in a new VM. */ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 921cbf117329..d29aefb4d7d3 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -205,7 +205,6 @@ int main(int argc, char *argv[]) kvm_vm_release(vm); vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 2b0de1598ab8..3f1a13f28115 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -213,7 +213,6 @@ int main(int argc, char *argv[]) /* Restore state in a new VM. 
*/ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index 0a8e989d4200..8504c6ce0cad 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -236,8 +236,6 @@ int main(int argc, char *argv[]) /* Restore state in a new VM. */ vcpu = vm_recreate_with_one_vcpu(vm); - - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); vcpu_load_state(vcpu, state); run = vcpu->run; kvm_x86_state_cleanup(state); -- cgit v1.2.3 From 683edfd42bc222daf17388afb0c752e38712fa05 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:27 +0000 Subject: KVM: selftests: Use CPUID_* instead of X86_FEATURE_* for one-off usage Rename X86_FEATURE_* macros to CPUID_* in various tests to free up the X86_FEATURE_* names for KVM-Unit-Tests style CPUID automagic where the function, leaf, register, and bit for the feature are embedded in its macro value. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-3-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 4 ++++ tools/testing/selftests/kvm/x86_64/amx_test.c | 9 +++------ tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 7 ++----- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 4 ++-- tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c | 3 +-- 5 files changed, 12 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 71e942ffac77..776817a454bf 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -50,6 +50,7 @@ #define CPUID_SMX (1ul << 6) #define CPUID_PCID (1ul << 17) #define CPUID_XSAVE (1ul << 26) +#define CPUID_OSXSAVE (1ul << 27) /* CPUID.7.EBX */ #define CPUID_FSGSBASE (1ul << 0) @@ -64,6 +65,9 @@ /* CPUID.0x8000_0001.EDX */ #define CPUID_GBPAGES (1ul << 26) +/* CPUID.0x8000_000A.EDX */ +#define CPUID_NRIPS BIT(3) + /* Page table bitfield declarations */ #define PTE_PRESENT_MASK BIT_ULL(0) #define PTE_WRITABLE_MASK BIT_ULL(1) diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 4c59923f454f..8024562016bc 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -25,9 +25,6 @@ # error This test is 64-bit only #endif -#define X86_FEATURE_XSAVE (1 << 26) -#define X86_FEATURE_OSXSAVE (1 << 27) - #define NUM_TILES 8 #define TILE_SIZE 1024 #define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE) @@ -128,9 +125,9 @@ static inline void check_cpuid_xsave(void) eax = 1; ecx = 0; cpuid(&eax, &ebx, &ecx, &edx); - if (!(ecx & X86_FEATURE_XSAVE)) + if (!(ecx & CPUID_XSAVE)) GUEST_ASSERT(!"cpuid: no CPU xsave support!"); - if (!(ecx & X86_FEATURE_OSXSAVE)) + if (!(ecx & CPUID_OSXSAVE)) GUEST_ASSERT(!"cpuid: no OS xsave support!"); } @@ -333,7 +330,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); entry = kvm_get_supported_cpuid_entry(1); - TEST_REQUIRE(entry->ecx & X86_FEATURE_XSAVE); + TEST_REQUIRE(entry->ecx & CPUID_XSAVE); 
TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xd); diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 56d8ab92eed4..f4d3a042ec1c 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -19,9 +19,6 @@ #include "kvm_util.h" #include "processor.h" -#define X86_FEATURE_XSAVE (1<<26) -#define X86_FEATURE_OSXSAVE (1<<27) - static inline bool cr4_cpuid_is_sync(void) { int func, subfunc; @@ -36,7 +33,7 @@ static inline bool cr4_cpuid_is_sync(void) cr4 = get_cr4(); - return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE)); + return (!!(ecx & CPUID_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE)); } static void guest_code(void) @@ -70,7 +67,7 @@ int main(int argc, char *argv[]) struct ucall uc; entry = kvm_get_supported_cpuid_entry(1); - TEST_REQUIRE(entry->ecx & X86_FEATURE_XSAVE); + TEST_REQUIRE(entry->ecx & CPUID_XSAVE); /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 2bf6851b4f42..c804f6f04134 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -8,7 +8,7 @@ #include "kvm_util.h" #include "processor.h" -#define X86_FEATURE_MWAIT (1u << 3) +#define CPUID_MWAIT (1u << 3) enum monitor_mwait_testcases { MWAIT_QUIRK_DISABLED = BIT(0), @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) cpuid = kvm_get_supported_cpuid(); entry = kvm_get_supported_cpuid_index(1, 0); - entry->ecx &= ~X86_FEATURE_MWAIT; + entry->ecx &= ~CPUID_MWAIT; set_cpuid(cpuid, entry); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 07253e22defd..bf7eda7722fe 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -19,7 +19,6 @@ #include "test_util.h" #define INT_NR 0x20 -#define X86_FEATURE_NRIPS BIT(3) static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); @@ -203,7 +202,7 @@ int main(int argc, char *argv[]) nested_svm_check_supported(); cpuid = kvm_get_supported_cpuid_entry(0x8000000a); - TEST_ASSERT(cpuid->edx & X86_FEATURE_NRIPS, + TEST_ASSERT(cpuid->edx & CPUID_NRIPS, "KVM with nSVM is supposed to unconditionally advertise nRIP Save\n"); atomic_init(&nmi_stage, 0); -- cgit v1.2.3 From 61d76b8a6943cd12b89029f9b04a75ce97f01d96 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:28 +0000 Subject: KVM: selftests: Add framework to query KVM CPUID bits Add X86_FEATURE_* magic in the style of KVM-Unit-Tests' implementation, where the CPUID function, index, output register, and output bit position are embedded in the macro value. Add kvm_cpu_has() to query KVM's supported CPUID and use it in set_sregs_test, which is the most prolific user of manual feature querying. Opportunistically rename calc_cr4_feature_bits() to calc_supported_cr4_feature_bits() to better capture how the CR4 bits are chosen. 
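As a usage sketch (hypothetical snippet, assuming only the definitions this patch adds), a feature check becomes self-describing; e.g. X86_FEATURE_PKU packs {function = 0x7, index = 0, reg = ECX, bit = 3}, so gating a test on PKU support reduces to:

    /* Sketch only: skip the test unless KVM advertises CPUID.07H.0:ECX[3]. */
    TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PKU));

kvm_cpu_has() simply wraps kvm_cpuid_has(kvm_get_supported_cpuid(), feature), i.e. a linear walk over KVM's supported CPUID entries to find the matching function/index pair and test the requested bit.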
Link: https://lore.kernel.org/all/20210422005626.564163-1-ricarkol@google.com Suggested-by: Paolo Bonzini Suggested-by: Jim Mattson Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-4-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 106 ++++++++++++++++++--- tools/testing/selftests/kvm/lib/x86_64/processor.c | 22 +++++ .../testing/selftests/kvm/x86_64/set_sregs_test.c | 28 +++--- 3 files changed, 128 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 776817a454bf..f5b935f112f5 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -45,23 +45,96 @@ #define X86_CR4_SMAP (1ul << 21) #define X86_CR4_PKE (1ul << 22) +/* Note, these are ordered alphabetically to match kvm_cpuid_entry2. Eww. */ +enum cpuid_output_regs { + KVM_CPUID_EAX, + KVM_CPUID_EBX, + KVM_CPUID_ECX, + KVM_CPUID_EDX +}; + +/* + * Pack the information into a 64-bit value so that each X86_FEATURE_XXX can be + * passed by value with no overhead. + */ +struct kvm_x86_cpu_feature { + u32 function; + u16 index; + u8 reg; + u8 bit; +}; +#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \ +({ \ + struct kvm_x86_cpu_feature feature = { \ + .function = fn, \ + .index = idx, \ + .reg = KVM_CPUID_##gpr, \ + .bit = __bit, \ + }; \ + \ + feature; \ +}) + +/* + * Basic Leafs, a.k.a. Intel defined + */ +#define X86_FEATURE_MWAIT KVM_X86_CPU_FEATURE(0x1, 0, ECX, 3) +#define X86_FEATURE_VMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 5) +#define X86_FEATURE_SMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 6) +#define X86_FEATURE_PCID KVM_X86_CPU_FEATURE(0x1, 0, ECX, 17) +#define X86_FEATURE_MOVBE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 22) +#define X86_FEATURE_TSC_DEADLINE_TIMER KVM_X86_CPU_FEATURE(0x1, 0, ECX, 24) +#define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26) +#define X86_FEATURE_OSXSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 27) +#define X86_FEATURE_RDRAND KVM_X86_CPU_FEATURE(0x1, 0, ECX, 30) +#define X86_FEATURE_MCE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 7) +#define X86_FEATURE_APIC KVM_X86_CPU_FEATURE(0x1, 0, EDX, 9) +#define X86_FEATURE_CLFLUSH KVM_X86_CPU_FEATURE(0x1, 0, EDX, 19) +#define X86_FEATURE_XMM KVM_X86_CPU_FEATURE(0x1, 0, EDX, 25) +#define X86_FEATURE_XMM2 KVM_X86_CPU_FEATURE(0x1, 0, EDX, 26) +#define X86_FEATURE_FSGSBASE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 0) +#define X86_FEATURE_TSC_ADJUST KVM_X86_CPU_FEATURE(0x7, 0, EBX, 1) +#define X86_FEATURE_HLE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 4) +#define X86_FEATURE_SMEP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 7) +#define X86_FEATURE_INVPCID KVM_X86_CPU_FEATURE(0x7, 0, EBX, 10) +#define X86_FEATURE_RTM KVM_X86_CPU_FEATURE(0x7, 0, EBX, 11) +#define X86_FEATURE_SMAP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 20) +#define X86_FEATURE_PCOMMIT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 22) +#define X86_FEATURE_CLFLUSHOPT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 23) +#define X86_FEATURE_CLWB KVM_X86_CPU_FEATURE(0x7, 0, EBX, 24) +#define X86_FEATURE_UMIP KVM_X86_CPU_FEATURE(0x7, 0, ECX, 2) +#define X86_FEATURE_PKU KVM_X86_CPU_FEATURE(0x7, 0, ECX, 3) +#define X86_FEATURE_LA57 KVM_X86_CPU_FEATURE(0x7, 0, ECX, 16) +#define X86_FEATURE_RDPID KVM_X86_CPU_FEATURE(0x7, 0, ECX, 22) +#define X86_FEATURE_SHSTK KVM_X86_CPU_FEATURE(0x7, 0, ECX, 7) +#define X86_FEATURE_IBT KVM_X86_CPU_FEATURE(0x7, 0, EDX, 20) +#define X86_FEATURE_SPEC_CTRL KVM_X86_CPU_FEATURE(0x7, 0, EDX, 26) +#define 
X86_FEATURE_ARCH_CAPABILITIES KVM_X86_CPU_FEATURE(0x7, 0, EDX, 29) +#define X86_FEATURE_PKS KVM_X86_CPU_FEATURE(0x7, 0, ECX, 31) + +/* + * Extended Leafs, a.k.a. AMD defined + */ +#define X86_FEATURE_SVM KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 2) +#define X86_FEATURE_NX KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 20) +#define X86_FEATURE_GBPAGES KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 26) +#define X86_FEATURE_RDTSCP KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 27) +#define X86_FEATURE_LM KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 29) +#define X86_FEATURE_RDPRU KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 4) +#define X86_FEATURE_AMD_IBPB KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 12) +#define X86_FEATURE_NPT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 0) +#define X86_FEATURE_LBRV KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 1) +#define X86_FEATURE_NRIPS KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 3) +#define X86_FEATURE_TSCRATEMSR KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4) +#define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) +#define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) +#define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) + /* CPUID.1.ECX */ #define CPUID_VMX (1ul << 5) -#define CPUID_SMX (1ul << 6) -#define CPUID_PCID (1ul << 17) #define CPUID_XSAVE (1ul << 26) #define CPUID_OSXSAVE (1ul << 27) -/* CPUID.7.EBX */ -#define CPUID_FSGSBASE (1ul << 0) -#define CPUID_SMEP (1ul << 7) -#define CPUID_SMAP (1ul << 20) - -/* CPUID.7.ECX */ -#define CPUID_UMIP (1ul << 2) -#define CPUID_PKU (1ul << 3) -#define CPUID_LA57 (1ul << 16) - /* CPUID.0x8000_0001.EDX */ #define CPUID_GBPAGES (1ul << 26) @@ -490,6 +563,15 @@ static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) } struct kvm_cpuid2 *kvm_get_supported_cpuid(void); + +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature); + +static inline bool kvm_cpu_has(struct kvm_x86_cpu_feature feature) +{ + return kvm_cpuid_has(kvm_get_supported_cpuid(), feature); +} + struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vcpu *vcpu); static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu, diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 909ac12e5ec5..10038496fff6 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -734,6 +734,28 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) return cpuid; } +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature) +{ + const struct kvm_cpuid_entry2 *entry; + int i; + + for (i = 0; i < cpuid->nent; i++) { + entry = &cpuid->entries[i]; + + /* + * The output registers in kvm_cpuid_entry2 are in alphabetical + * order, but kvm_x86_cpu_feature matches that mess, so yay + * pointer shenanigans! 
+ */ + if (entry->function == feature.function && + entry->index == feature.index) + return (&entry->eax)[feature.reg] & BIT(feature.bit); + } + + return false; +} + uint64_t kvm_get_feature_msr(uint64_t msr_index) { struct { diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index dd344439ad33..2bb08bf2125d 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -43,36 +43,32 @@ static void test_cr4_feature_bit(struct kvm_vcpu *vcpu, struct kvm_sregs *orig, TEST_ASSERT(!memcmp(&sregs, orig, sizeof(sregs)), "KVM modified sregs"); } -static uint64_t calc_cr4_feature_bits(struct kvm_vm *vm) +static uint64_t calc_supported_cr4_feature_bits(void) { - struct kvm_cpuid_entry2 *cpuid_1, *cpuid_7; uint64_t cr4; - cpuid_1 = kvm_get_supported_cpuid_entry(1); - cpuid_7 = kvm_get_supported_cpuid_entry(7); - cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT; - if (cpuid_7->ecx & CPUID_UMIP) + if (kvm_cpu_has(X86_FEATURE_UMIP)) cr4 |= X86_CR4_UMIP; - if (cpuid_7->ecx & CPUID_LA57) + if (kvm_cpu_has(X86_FEATURE_LA57)) cr4 |= X86_CR4_LA57; - if (cpuid_1->ecx & CPUID_VMX) + if (kvm_cpu_has(X86_FEATURE_VMX)) cr4 |= X86_CR4_VMXE; - if (cpuid_1->ecx & CPUID_SMX) + if (kvm_cpu_has(X86_FEATURE_SMX)) cr4 |= X86_CR4_SMXE; - if (cpuid_7->ebx & CPUID_FSGSBASE) + if (kvm_cpu_has(X86_FEATURE_FSGSBASE)) cr4 |= X86_CR4_FSGSBASE; - if (cpuid_1->ecx & CPUID_PCID) + if (kvm_cpu_has(X86_FEATURE_PCID)) cr4 |= X86_CR4_PCIDE; - if (cpuid_1->ecx & CPUID_XSAVE) + if (kvm_cpu_has(X86_FEATURE_XSAVE)) cr4 |= X86_CR4_OSXSAVE; - if (cpuid_7->ebx & CPUID_SMEP) + if (kvm_cpu_has(X86_FEATURE_SMEP)) cr4 |= X86_CR4_SMEP; - if (cpuid_7->ebx & CPUID_SMAP) + if (kvm_cpu_has(X86_FEATURE_SMAP)) cr4 |= X86_CR4_SMAP; - if (cpuid_7->ecx & CPUID_PKU) + if (kvm_cpu_has(X86_FEATURE_PKU)) cr4 |= X86_CR4_PKE; return cr4; @@ -99,7 +95,7 @@ int main(int argc, char *argv[]) vcpu_sregs_get(vcpu, &sregs); - sregs.cr4 |= calc_cr4_feature_bits(vm); + sregs.cr4 |= calc_supported_cr4_feature_bits(); cr4 = sregs.cr4; rc = _vcpu_sregs_set(vcpu, &sregs); -- cgit v1.2.3 From c5c5b827f129734602b796eceb4342b17858dc01 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:29 +0000 Subject: KVM: selftests: Use kvm_cpu_has() in the SEV migration test Use kvm_cpu_has() in the SEV migration test instead of open coding equivalent functionality using kvm_get_supported_cpuid_entry(). No functional change intended. 
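The SEV and SEV-ES queries resolve through kvm_cpuid_has()'s register indexing from the previous patch, which merits a concrete illustration. A stand-alone sketch (hypothetical demo_* names, not from any patch in this series) of why indexing off &entry->eax works:

    #include <assert.h>

    /* Mirrors kvm_cpuid_entry2's consecutive u32 output registers. */
    struct demo_cpuid_entry {
            unsigned int eax, ebx, ecx, edx;
    };

    static unsigned int demo_cpuid_reg(const struct demo_cpuid_entry *e, int reg)
    {
            /* reg follows the KVM_CPUID_EAX..KVM_CPUID_EDX enum order, 0..3. */
            return (&e->eax)[reg];
    }

    int main(void)
    {
            struct demo_cpuid_entry e = { .eax = 1, .ebx = 2, .ecx = 3, .edx = 4 };

            assert(demo_cpuid_reg(&e, 2) == 3);     /* 2 == KVM_CPUID_ECX */
            return 0;
    }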
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-5-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 ++ tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 13 ++----------- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index f5b935f112f5..57fefd9e8ae9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -129,6 +129,8 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) #define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) #define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) +#define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) +#define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) /* CPUID.1.ECX */ #define CPUID_VMX (1ul << 5) diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 46018b247a04..c7ef97561038 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -393,23 +393,14 @@ static void test_sev_move_copy(void) kvm_vm_free(sev_vm); } -#define X86_FEATURE_SEV (1 << 1) -#define X86_FEATURE_SEV_ES (1 << 3) - int main(int argc, char *argv[]) { - struct kvm_cpuid_entry2 *cpuid; - TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)); - cpuid = kvm_get_supported_cpuid_entry(0x80000000); - TEST_REQUIRE(cpuid->eax >= 0x8000001f); - - cpuid = kvm_get_supported_cpuid_entry(0x8000001f); - TEST_REQUIRE(cpuid->eax & X86_FEATURE_SEV); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); - have_sev_es = !!(cpuid->eax & X86_FEATURE_SEV_ES); + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); if (kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { test_sev_migrate_from(/* es= */ false); -- cgit v1.2.3 From f21940a3bb5c6a6cd075f589d9405efc5718690c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:30 +0000 Subject: KVM: selftests: Use kvm_cpu_has() for nested SVM checks Use kvm_cpu_has() to check for nested SVM support, and drop the helpers now that their functionality is trivial to implement. No functional change intended. 
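For reference, the dropped helpers collapse to a one-liner; every conversion in the diff below follows this sketch:

    /* What nested_svm_supported()/nested_svm_check_supported() reduce to: */
    TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));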
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-6-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/svm_util.h | 2 -- tools/testing/selftests/kvm/lib/x86_64/svm.c | 13 ------------- tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c | 2 +- tools/testing/selftests/kvm/x86_64/smm_test.c | 2 +- tools/testing/selftests/kvm/x86_64/state_test.c | 2 +- tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c | 2 +- .../selftests/kvm/x86_64/svm_nested_soft_inject_test.c | 2 +- tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c | 2 +- 8 files changed, 6 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index 136ba6a5d027..f48806d26989 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -51,8 +51,6 @@ struct svm_test_data { struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); -bool nested_svm_supported(void); -void nested_svm_check_supported(void); static inline bool cpu_has_svm(void) { diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 37e9c0a923e0..6d445886e16c 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -164,19 +164,6 @@ void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) : "r15", "memory"); } -bool nested_svm_supported(void) -{ - struct kvm_cpuid_entry2 *entry = - kvm_get_supported_cpuid_entry(0x80000001); - - return entry->ecx & CPUID_SVM; -} - -void nested_svm_check_supported(void) -{ - TEST_REQUIRE(nested_svm_supported()); -} - /* * Open SEV_DEV_PATH if available, otherwise exit the entire program. 
* diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index b7dc243ab8d5..a380ad7bb9b3 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) struct ucall uc; int stage; - TEST_REQUIRE(nested_svm_supported()); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index d29aefb4d7d3..fde3fe925686 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -154,7 +154,7 @@ int main(int argc, char *argv[]) vcpu_set_msr(vcpu, MSR_IA32_SMBASE, SMRAM_GPA); if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { - if (nested_svm_supported()) + if (kvm_cpu_has(X86_FEATURE_SVM)) vcpu_alloc_svm(vm, &nested_gva); else if (nested_vmx_supported()) vcpu_alloc_vmx(vm, &nested_gva); diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 3f1a13f28115..ae01d32624ec 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) vcpu_regs_get(vcpu, ®s1); if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { - if (nested_svm_supported()) + if (kvm_cpu_has(X86_FEATURE_SVM)) vcpu_alloc_svm(vm, &nested_gva); else if (nested_vmx_supported()) vcpu_alloc_vmx(vm, &nested_gva); diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c index d978d1697f5a..4a07ba227b99 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) struct kvm_vm *vm; struct ucall uc; - nested_svm_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index bf7eda7722fe..2cc09ab41570 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -199,7 +199,7 @@ int main(int argc, char *argv[]) /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); - nested_svm_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); cpuid = kvm_get_supported_cpuid_entry(0x8000000a); TEST_ASSERT(cpuid->edx & CPUID_NRIPS, diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c index d53b1f7abb56..c3ac45df7483 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -39,7 +39,7 @@ int main(int argc, char *argv[]) vm_vaddr_t svm_gva; struct kvm_vm *vm; - nested_svm_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); -- cgit v1.2.3 From 1ecbb337fa107c6a08800d34ba4f3296ad01103e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:31 +0000 Subject: KVM: selftests: Use kvm_cpu_has() for nested VMX checks Use kvm_cpu_has() to check for nested VMX support, and drop the helpers now that their functionality is trivial 
to implement. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-7-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/vmx.h | 2 -- tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c | 2 +- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 12 ------------ tools/testing/selftests/kvm/x86_64/evmcs_test.c | 2 +- tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 4 ++-- tools/testing/selftests/kvm/x86_64/smm_test.c | 2 +- tools/testing/selftests/kvm/x86_64/state_test.c | 2 +- tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c | 2 +- tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c | 2 +- .../selftests/kvm/x86_64/vmx_close_while_nested_test.c | 2 +- tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c | 2 +- .../selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c | 2 +- .../selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c | 2 +- .../testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c | 2 +- .../testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c | 2 +- tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 2 +- 16 files changed, 15 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index cc3604f8f1d3..99fa1410964c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -607,8 +607,6 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx); void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); bool load_vmcs(struct vmx_pages *vmx); -bool nested_vmx_supported(void); -void nested_vmx_check_supported(void); bool ept_1g_pages_supported(void); void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c index bfe85c8c2f6e..0f344a7c89c4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c @@ -84,7 +84,7 @@ void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vm_vaddr_t vmx_gva; int vcpu_id; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vmx = vcpu_alloc_vmx(vm, &vmx_gva); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 381432741df4..80a568c439b8 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -382,18 +382,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -bool nested_vmx_supported(void) -{ - struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); - - return entry->ecx & CPUID_VMX; -} - -void nested_vmx_check_supported(void) -{ - TEST_REQUIRE(nested_vmx_supported()); -} - static void nested_create_pte(struct kvm_vm *vm, struct eptPageTableEntry *pte, uint64_t nested_paddr, diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index aacad86d90e1..99bc202243d2 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -208,7 +208,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - 
TEST_REQUIRE(nested_vmx_supported()); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index cbd4a7d36189..c406b95cba9b 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -148,7 +148,7 @@ int main(int argc, char *argv[]) test_hv_cpuid(hv_cpuid_entries, false); free(hv_cpuid_entries); - if (!nested_vmx_supported() || + if (!kvm_cpu_has(X86_FEATURE_VMX) || !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { print_skip("Enlightened VMCS is unsupported"); goto do_sys; @@ -168,7 +168,7 @@ do_sys: test_hv_cpuid_e2big(vm, NULL); hv_cpuid_entries = kvm_get_supported_hv_cpuid(); - test_hv_cpuid(hv_cpuid_entries, nested_vmx_supported()); + test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX)); out: kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index fde3fe925686..17bb2397ea38 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -156,7 +156,7 @@ int main(int argc, char *argv[]) if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { if (kvm_cpu_has(X86_FEATURE_SVM)) vcpu_alloc_svm(vm, &nested_gva); - else if (nested_vmx_supported()) + else if (kvm_cpu_has(X86_FEATURE_VMX)) vcpu_alloc_vmx(vm, &nested_gva); } diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index ae01d32624ec..8ab81a3b561f 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -172,7 +172,7 @@ int main(int argc, char *argv[]) if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { if (kvm_cpu_has(X86_FEATURE_SVM)) vcpu_alloc_svm(vm, &nested_gva); - else if (nested_vmx_supported()) + else if (kvm_cpu_has(X86_FEATURE_VMX)) vcpu_alloc_vmx(vm, &nested_gva); } diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index d1274c097b36..70b44f0b52fe 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -46,7 +46,7 @@ int main(void) vm_vaddr_t vmx_pages_gva; struct ucall uc; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index d3582cea1258..5abecf06329e 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -80,7 +80,7 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index e69e8963ed08..d79651b02740 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -51,7 +51,7 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; - 
nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index f378960299c0..2d8c23d639f7 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) struct ucall uc; bool done = false; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c index 8c854738f2cc..6bfb4bb471ca 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c @@ -58,7 +58,7 @@ int main(int argc, char *argv[]) struct kvm_run *run; struct ucall uc; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index 6bfef77b87b7..465a9434d61c 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -150,7 +150,7 @@ int main(int argc, char *argv[]) uint64_t l1_tsc_freq = 0; uint64_t l2_tsc_freq = 0; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); stable_tsc_check_supported(); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index 8504c6ce0cad..0efdc05969a5 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -167,7 +167,7 @@ int main(int argc, char *argv[]) * AMD currently does not implement any VMX features, so for now we * just early out. */ - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index b564b86dfc1d..66cb2d0054e6 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -273,7 +273,7 @@ int main(int argc, char *argv[]) * AMD currently does not implement set_nested_state, so for now we * just early out. 
*/ - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); vm = vm_create_with_one_vcpu(&vcpu, NULL); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index 2e75eef926ca..5943187e8594 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) vm_vaddr_t vmx_pages_gva; struct kvm_vcpu *vcpu; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); -- cgit v1.2.3 From ea129d22541ecd2bf4c8769d33f48e3105eca5f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:32 +0000 Subject: KVM: selftests: Use kvm_cpu_has() to query PDCM in PMU selftest Use kvm_cpu_has() in the PMU test to query PDCM support instead of open coding equivalent functionality using kvm_get_supported_cpuid_index(). No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-8-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 57fefd9e8ae9..ad10c8787892 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -81,6 +81,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_MWAIT KVM_X86_CPU_FEATURE(0x1, 0, ECX, 3) #define X86_FEATURE_VMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 5) #define X86_FEATURE_SMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 6) +#define X86_FEATURE_PDCM KVM_X86_CPU_FEATURE(0x1, 0, ECX, 15) #define X86_FEATURE_PCID KVM_X86_CPU_FEATURE(0x1, 0, ECX, 17) #define X86_FEATURE_MOVBE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 22) #define X86_FEATURE_TSC_DEADLINE_TIMER KVM_X86_CPU_FEATURE(0x1, 0, ECX, 24) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index eb592fae44ef..667d48e8c1e0 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -17,7 +17,6 @@ #include "kvm_util.h" #include "vmx.h" -#define X86_FEATURE_PDCM (1<<15) #define PMU_CAP_FW_WRITES (1ULL << 13) #define PMU_CAP_LBR_FMT 0x3f @@ -55,7 +54,6 @@ static void guest_code(void) int main(int argc, char *argv[]) { struct kvm_cpuid2 *cpuid; - struct kvm_cpuid_entry2 *entry_1_0; struct kvm_cpuid_entry2 *entry_a_0; struct kvm_vm *vm; struct kvm_vcpu *vcpu; @@ -70,11 +68,10 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); cpuid = kvm_get_supported_cpuid(); - TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xa); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM)); - entry_1_0 = kvm_get_supported_cpuid_index(1, 0); + TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xa); entry_a_0 = kvm_get_supported_cpuid_index(0xa, 0); - TEST_REQUIRE(entry_1_0->ecx & X86_FEATURE_PDCM); eax.full = entry_a_0->eax; __TEST_REQUIRE(eax.split.version_id, "PMU is not supported by the vCPU"); -- cgit v1.2.3 From 50445ea2337a5f8df2242f795f45b6cf8c959c77 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:33 +0000 Subject: KVM: selftests: Drop redundant 
vcpu_set_cpuid() from PMU selftest Drop a redundant vcpu_set_cpuid() from the PMU test. The vCPU's CPUID is set to KVM's supported CPUID by vm_create_with_one_vcpu(), which was also true back when the helper was named vm_create_default(). Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-9-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index 667d48e8c1e0..dc3869d5aff0 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -53,7 +53,6 @@ static void guest_code(void) int main(int argc, char *argv[]) { - struct kvm_cpuid2 *cpuid; struct kvm_cpuid_entry2 *entry_a_0; struct kvm_vm *vm; struct kvm_vcpu *vcpu; @@ -66,7 +65,6 @@ int main(int argc, char *argv[]) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); - cpuid = kvm_get_supported_cpuid(); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM)); @@ -77,7 +75,6 @@ int main(int argc, char *argv[]) __TEST_REQUIRE(eax.split.version_id, "PMU is not supported by the vCPU"); /* testcase 1, set capabilities when we have PDCM bit */ - vcpu_set_cpuid(vcpu, cpuid); vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES); /* check capabilities can be retrieved with KVM_GET_MSR */ -- cgit v1.2.3 From fdd1e2788c41e9b420a3744e022b5ff85b7e5fc0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:34 +0000 Subject: KVM: selftests: Use kvm_cpu_has() for XSAVES in XSS MSR test Use kvm_cpu_has() in the XSS MSR test instead of open coding equivalent functionality using kvm_get_supported_cpuid_index(). No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-10-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 8 +------- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index ad10c8787892..227473ab8ec6 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -112,6 +112,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_SPEC_CTRL KVM_X86_CPU_FEATURE(0x7, 0, EDX, 26) #define X86_FEATURE_ARCH_CAPABILITIES KVM_X86_CPU_FEATURE(0x7, 0, EDX, 29) #define X86_FEATURE_PKS KVM_X86_CPU_FEATURE(0x7, 0, ECX, 31) +#define X86_FEATURE_XSAVES KVM_X86_CPU_FEATURE(0xD, 1, EAX, 3) /* * Extended Leafs, a.k.a. 
AMD defined diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 4e2e08059b95..e0ddf47362e7 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -14,11 +14,8 @@ #define MSR_BITS 64 -#define X86_FEATURE_XSAVES (1<<3) - int main(int argc, char *argv[]) { - struct kvm_cpuid_entry2 *entry; bool xss_in_msr_list; struct kvm_vm *vm; struct kvm_vcpu *vcpu; @@ -28,10 +25,7 @@ int main(int argc, char *argv[]) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, NULL); - TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xd); - - entry = kvm_get_supported_cpuid_index(0xd, 1); - TEST_REQUIRE(entry->eax & X86_FEATURE_XSAVES); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVES)); xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS); TEST_ASSERT(xss_val == 0, -- cgit v1.2.3 From 2697646bd343e3e04004f49a3673975729ef9f01 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:35 +0000 Subject: KVM: selftests: Check for _both_ XTILE data and cfg in AMX test Check for _both_ XTILE data and cfg support in the AMX test instead of checking for _either_ feature. Practically speaking, no sane CPU or vCPU will support one but not the other, but the effective "or" behavior is subtle and technically incorrect. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-11-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/amx_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 8024562016bc..6fc4e652b38f 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -335,7 +335,8 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xd); entry = kvm_get_supported_cpuid_index(0xd, 0); - TEST_REQUIRE(entry->eax & XFEATURE_MASK_XTILE); + TEST_REQUIRE(entry->eax & XFEATURE_MASK_XTILECFG); + TEST_REQUIRE(entry->eax & XFEATURE_MASK_XTILEDATA); /* Get xsave/restore max size */ xsave_restore_size = entry->ecx; -- cgit v1.2.3 From 8fea056eeb0c4840593bf2ed1e4cab5030d0b5b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:36 +0000 Subject: KVM: selftests: Use kvm_cpu_has() in AMX test Use kvm_cpu_has() in the AMX test instead of open coding equivalent functionality using kvm_get_supported_cpuid_entry() and kvm_get_supported_cpuid_index(). No functional change intended. 
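The preceding patch's "or" subtlety is worth a concrete illustration (sketch with illustrative values; per the defines this patch adds, XTILECFG is CPUID.0xD.0:EAX[17] and XTILEDATA is EAX[18]):

    unsigned int eax = 1u << 17;        /* hypothetical CPU: XTILECFG without XTILEDATA */

    /* Old check: true because *one* bit of the two-bit mask is set. */
    int old_ok = !!(eax & ((1u << 17) | (1u << 18)));              /* == 1, wrongly passes */

    /* New checks: each single-bit test must pass on its own. */
    int new_ok = !!(eax & (1u << 17)) && !!(eax & (1u << 18));     /* == 0, correctly skips */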
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-12-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 3 +++ tools/testing/selftests/kvm/x86_64/amx_test.c | 17 ++++++----------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 227473ab8ec6..3b819bf673ac 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -109,9 +109,12 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_RDPID KVM_X86_CPU_FEATURE(0x7, 0, ECX, 22) #define X86_FEATURE_SHSTK KVM_X86_CPU_FEATURE(0x7, 0, ECX, 7) #define X86_FEATURE_IBT KVM_X86_CPU_FEATURE(0x7, 0, EDX, 20) +#define X86_FEATURE_AMX_TILE KVM_X86_CPU_FEATURE(0x7, 0, EDX, 24) #define X86_FEATURE_SPEC_CTRL KVM_X86_CPU_FEATURE(0x7, 0, EDX, 26) #define X86_FEATURE_ARCH_CAPABILITIES KVM_X86_CPU_FEATURE(0x7, 0, EDX, 29) #define X86_FEATURE_PKS KVM_X86_CPU_FEATURE(0x7, 0, ECX, 31) +#define X86_FEATURE_XTILECFG KVM_X86_CPU_FEATURE(0xD, 0, EAX, 17) +#define X86_FEATURE_XTILEDATA KVM_X86_CPU_FEATURE(0xD, 0, EAX, 18) #define X86_FEATURE_XSAVES KVM_X86_CPU_FEATURE(0xD, 1, EAX, 3) /* diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 6fc4e652b38f..c8f98331a807 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -312,13 +312,12 @@ void guest_nm_handler(struct ex_regs *regs) int main(int argc, char *argv[]) { - struct kvm_cpuid_entry2 *entry; struct kvm_regs regs1, regs2; struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_run *run; struct kvm_x86_state *state; - int xsave_restore_size = 0; + int xsave_restore_size; vm_vaddr_t amx_cfg, tiledata, xsavedata; struct ucall uc; u32 amx_offset; @@ -329,17 +328,13 @@ int main(int argc, char *argv[]) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); - entry = kvm_get_supported_cpuid_entry(1); - TEST_REQUIRE(entry->ecx & CPUID_XSAVE); - - TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xd); - - entry = kvm_get_supported_cpuid_index(0xd, 0); - TEST_REQUIRE(entry->eax & XFEATURE_MASK_XTILECFG); - TEST_REQUIRE(entry->eax & XFEATURE_MASK_XTILEDATA); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_AMX_TILE)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA)); /* Get xsave/restore max size */ - xsave_restore_size = entry->ecx; + xsave_restore_size = kvm_get_supported_cpuid_index(0xd, 0)->ecx; run = vcpu->run; vcpu_regs_get(vcpu, ®s1); -- cgit v1.2.3 From 045520e4755bbdf2f2983309c8eab6176a97a13d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:37 +0000 Subject: KVM: selftests: Use kvm_cpu_has() for XSAVE in cr4_cpuid_sync_test Use kvm_cpu_has() in the CR4/CPUID sync test instead of open coding equivalent functionality using kvm_get_supported_cpuid_entry(). No functional change intended. 
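The conversion is the same mechanical pattern as the rest of the series; schematically:

    /* Before: open-coded leaf lookup plus bit test. */
    entry = kvm_get_supported_cpuid_entry(1);
    TEST_REQUIRE(entry->ecx & CPUID_XSAVE);

    /* After: a single self-describing query. */
    TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE));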
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-13-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index f4d3a042ec1c..611febdc2128 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -63,11 +63,9 @@ int main(int argc, char *argv[]) struct kvm_run *run; struct kvm_vm *vm; struct kvm_sregs sregs; - struct kvm_cpuid_entry2 *entry; struct ucall uc; - entry = kvm_get_supported_cpuid_entry(1); - TEST_REQUIRE(entry->ecx & CPUID_XSAVE); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); -- cgit v1.2.3 From b046f4ee9cb60da285e1d45a1fe8dc6bb5fc446e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:38 +0000 Subject: KVM: selftests: Remove the obsolete/dead MMU role test Remove the MMU role test, which was made obsolete by KVM commit feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN"). The ongoing costs of keeping the test updated far outweigh any benefits, e.g. the test _might_ be useful as an example or for documentation purposes, but otherwise the test is dead weight. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-14-seanjc@google.com --- tools/testing/selftests/kvm/.gitignore | 1 - tools/testing/selftests/kvm/Makefile | 1 - .../selftests/kvm/include/x86_64/processor.h | 3 - tools/testing/selftests/kvm/x86_64/mmu_role_test.c | 137 --------------------- 4 files changed, 142 deletions(-) delete mode 100644 tools/testing/selftests/kvm/x86_64/mmu_role_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index f44ebf401310..91429330faea 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -27,7 +27,6 @@ /x86_64/hyperv_svm_test /x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test -/x86_64/mmu_role_test /x86_64/monitor_mwait_test /x86_64/nx_huge_pages_test /x86_64/platform_info_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4d6753aadfa0..6b22fb1ce2b9 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -87,7 +87,6 @@ TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test -TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 3b819bf673ac..14342e68e0b5 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -142,9 +142,6 @@ struct kvm_x86_cpu_feature { #define CPUID_XSAVE (1ul << 26) #define CPUID_OSXSAVE (1ul << 27) -/* CPUID.0x8000_0001.EDX */ -#define CPUID_GBPAGES (1ul << 26) - /* CPUID.0x8000_000A.EDX */ #define CPUID_NRIPS BIT(3) diff --git a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c 
b/tools/testing/selftests/kvm/x86_64/mmu_role_test.c deleted file mode 100644 index 383fff2c9587..000000000000 --- a/tools/testing/selftests/kvm/x86_64/mmu_role_test.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "kvm_util.h" -#include "processor.h" - -#define MMIO_GPA 0x100000000ull - -static void guest_code(void) -{ - (void)READ_ONCE(*((uint64_t *)MMIO_GPA)); - (void)READ_ONCE(*((uint64_t *)MMIO_GPA)); - - GUEST_ASSERT(0); -} - -static void guest_pf_handler(struct ex_regs *regs) -{ - /* PFEC == RSVD | PRESENT (read, kernel). */ - GUEST_ASSERT(regs->error_code == 0x9); - GUEST_DONE(); -} - -static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val) -{ - u32 good_cpuid_val = *cpuid_reg; - struct kvm_vcpu *vcpu; - struct kvm_run *run; - struct kvm_vm *vm; - uint64_t cmd; - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - run = vcpu->run; - - /* Map 1gb page without a backing memlot. */ - __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, PG_LEVEL_1G); - - vcpu_run(vcpu); - - /* Guest access to the 1gb page should trigger MMIO. */ - TEST_ASSERT(run->exit_reason == KVM_EXIT_MMIO, - "Unexpected exit reason: %u (%s), expected MMIO exit (1gb page w/o memslot)\n", - run->exit_reason, exit_reason_str(run->exit_reason)); - - TEST_ASSERT(run->mmio.len == 8, "Unexpected exit mmio size = %u", run->mmio.len); - - TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA, - "Unexpected exit mmio address = 0x%llx", run->mmio.phys_addr); - - /* - * Effect the CPUID change for the guest and re-enter the guest. Its - * access should now #PF due to the PAGE_SIZE bit being reserved or - * the resulting GPA being invalid. Note, kvm_get_supported_cpuid() - * returns the struct that contains the entry being modified. Eww. - */ - *cpuid_reg = evil_cpuid_val; - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); - - /* - * Add a dummy memslot to coerce KVM into bumping the MMIO generation. - * KVM does not "officially" support mucking with CPUID after KVM_RUN, - * and will incorrectly reuse MMIO SPTEs. Don't delete the memslot! - * KVM x86 zaps all shadow pages on memslot deletion. - */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - MMIO_GPA << 1, 10, 1, 0); - - /* Set up a #PF handler to eat the RSVD #PF and signal all done! */ - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vm, PF_VECTOR, guest_pf_handler); - - vcpu_run(vcpu); - - cmd = get_ucall(vcpu, NULL); - TEST_ASSERT(cmd == UCALL_DONE, - "Unexpected guest exit, exit_reason=%s, ucall.cmd = %lu\n", - exit_reason_str(run->exit_reason), cmd); - - /* - * Restore the happy CPUID value for the next test. Yes, changes are - * indeed persistent across VM destruction. - */ - *cpuid_reg = good_cpuid_val; - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - struct kvm_cpuid_entry2 *entry; - int opt; - - /* - * All tests are opt-in because TDP doesn't play nice with reserved #PF - * in the GVA->GPA translation. The hardware page walker doesn't let - * software change GBPAGES or MAXPHYADDR, and KVM doesn't manually walk - * the GVA on fault for performance reasons. 
- */ - bool do_gbpages = false; - bool do_maxphyaddr = false; - - setbuf(stdout, NULL); - - while ((opt = getopt(argc, argv, "gm")) != -1) { - switch (opt) { - case 'g': - do_gbpages = true; - break; - case 'm': - do_maxphyaddr = true; - break; - case 'h': - default: - printf("usage: %s [-g (GBPAGES)] [-m (MAXPHYADDR)]\n", argv[0]); - break; - } - } - - __TEST_REQUIRE(do_gbpages || do_maxphyaddr, "No sub-tests selected"); - - entry = kvm_get_supported_cpuid_entry(0x80000001); - TEST_REQUIRE(entry->edx & CPUID_GBPAGES); - - if (do_gbpages) { - pr_info("Test MMIO after toggling CPUID.GBPAGES\n\n"); - mmu_role_test(&entry->edx, entry->edx & ~CPUID_GBPAGES); - } - - if (do_maxphyaddr) { - pr_info("Test MMIO after changing CPUID.MAXPHYADDR\n\n"); - entry = kvm_get_supported_cpuid_entry(0x80000008); - mmu_role_test(&entry->eax, (entry->eax & ~0xff) | 0x20); - } - - return 0; -} -- cgit v1.2.3 From 601c067f381532105d01db9257b69eb9cfa8945e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:39 +0000 Subject: KVM: selftests: Use kvm_cpu_has() for KVM's PV steal time Use kvm_cpu_has() in the steal-time test instead of open coding equivalent functionality using kvm_get_supported_cpuid_entry(). Opportunistically define all of KVM's paravirt CPUID-based features. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-15-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 22 ++++++++++++++++++++++ tools/testing/selftests/kvm/steal_time.c | 4 +--- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 14342e68e0b5..15bccdd6d78c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -137,6 +137,28 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) #define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) +/* + * KVM defined paravirt features. + */ +#define X86_FEATURE_KVM_CLOCKSOURCE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 0) +#define X86_FEATURE_KVM_NOP_IO_DELAY KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 1) +#define X86_FEATURE_KVM_MMU_OP KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 2) +#define X86_FEATURE_KVM_CLOCKSOURCE2 KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 3) +#define X86_FEATURE_KVM_ASYNC_PF KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 4) +#define X86_FEATURE_KVM_STEAL_TIME KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 5) +#define X86_FEATURE_KVM_PV_EOI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 6) +#define X86_FEATURE_KVM_PV_UNHALT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 7) +/* Bit 8 apparently isn't used?!?! 
*/ +#define X86_FEATURE_KVM_PV_TLB_FLUSH KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 9) +#define X86_FEATURE_KVM_ASYNC_PF_VMEXIT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 10) +#define X86_FEATURE_KVM_PV_SEND_IPI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 11) +#define X86_FEATURE_KVM_POLL_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 12) +#define X86_FEATURE_KVM_PV_SCHED_YIELD KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 13) +#define X86_FEATURE_KVM_ASYNC_PF_INT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 14) +#define X86_FEATURE_KVM_MSI_EXT_DEST_ID KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 15) +#define X86_FEATURE_KVM_HC_MAP_GPA_RANGE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 16) +#define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) + /* CPUID.1.ECX */ #define CPUID_VMX (1ul << 5) #define CPUID_XSAVE (1ul << 26) diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 9866a71463d7..db8967f1a17b 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -60,9 +60,7 @@ static void guest_code(int cpu) static bool is_steal_time_supported(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *cpuid = kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES); - - return cpuid && (cpuid->eax & KVM_FEATURE_STEAL_TIME); + return kvm_cpu_has(X86_FEATURE_KVM_STEAL_TIME); } static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) -- cgit v1.2.3 From 3c67f8208451a864478b9bf049782159fc925e11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:40 +0000 Subject: KVM: selftests: Use kvm_cpu_has() for nSVM soft INT injection test Use kvm_cpu_has() to query for NRIPS support instead of open coding equivalent functionality using kvm_get_supported_cpuid_entry(). 
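For reference, kvm_cpu_has() forwards to kvm_cpuid_has(), which resolves a feature's (function, index, register, bit) tuple against the supported-CPUID array and tests a single bit. A simplified, illustrative sketch of that final step (not part of the patch; the real helper also locates the matching entry first):

	static bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
				    struct kvm_x86_cpu_feature feature)
	{
		/* EAX..EDX are laid out contiguously in kvm_cpuid_entry2. */
		u32 *reg = (&entry->eax) + feature.reg;

		return *reg & BIT(feature.bit);
	}
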
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-16-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 3 --- tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c | 7 ++----- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 15bccdd6d78c..a66d625da3d4 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -164,9 +164,6 @@ struct kvm_x86_cpu_feature { #define CPUID_XSAVE (1ul << 26) #define CPUID_OSXSAVE (1ul << 27) -/* CPUID.0x8000_000A.EDX */ -#define CPUID_NRIPS BIT(3) - /* Page table bitfield declarations */ #define PTE_PRESENT_MASK BIT_ULL(0) #define PTE_WRITABLE_MASK BIT_ULL(1) diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c index 2cc09ab41570..e637d7736012 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -194,16 +194,13 @@ done: int main(int argc, char *argv[]) { - struct kvm_cpuid_entry2 *cpuid; - /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - cpuid = kvm_get_supported_cpuid_entry(0x8000000a); - TEST_ASSERT(cpuid->edx & CPUID_NRIPS, - "KVM with nSVM is supposed to unconditionally advertise nRIP Save\n"); + TEST_ASSERT(kvm_cpu_has(X86_FEATURE_NRIPS), + "KVM with nSVM is supposed to unconditionally advertise nRIP Save"); atomic_init(&nmi_stage, 0); -- cgit v1.2.3 From 71bcb951c68b422ca65f55dc7ae7be01735ae48e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:41 +0000 Subject: KVM: selftests: Verify that kvm_cpuid2.entries layout is unchanged by KVM In the CPUID test, verify that KVM doesn't modify the kvm_cpuid2.entries layout, i.e. that the order of entries and their flags is identical between what the test provides via KVM_SET_CPUID2 and what KVM returns via KVM_GET_CPUID2. Asserting that the layouts match simplifies the test as there's no need to iterate over both arrays. 
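Put differently, the guarantee being locked in is that a KVM_SET_CPUID2/KVM_GET_CPUID2 round-trip preserves entry order, so one index can walk both arrays. A deliberately simplified sketch (it ignores the entries KVM legitimately mangles, which is_cpuid_mangled() filters out in the real test):

	for (i = 0; i < cpuid1->nent; i++)
		TEST_ASSERT(!memcmp(&cpuid1->entries[i], &cpuid2->entries[i],
				    sizeof(cpuid1->entries[i])),
			    "CPUID entries[%d] mismatch", i);
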
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-17-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 49 +++++++++++-------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 3767a0cc694b..59dcdf5a1220 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -79,41 +79,34 @@ static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie) return false; } -static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie) +static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2) { + struct kvm_cpuid_entry2 *e1, *e2; int i; - for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == entrie->function && - cpuid->entries[i].index == entrie->index) { - if (is_cpuid_mangled(entrie)) - return; - - TEST_ASSERT(cpuid->entries[i].eax == entrie->eax && - cpuid->entries[i].ebx == entrie->ebx && - cpuid->entries[i].ecx == entrie->ecx && - cpuid->entries[i].edx == entrie->edx, - "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", - entrie->function, entrie->index, - cpuid->entries[i].eax, cpuid->entries[i].ebx, - cpuid->entries[i].ecx, cpuid->entries[i].edx, - entrie->eax, entrie->ebx, entrie->ecx, entrie->edx); - return; - } - } + TEST_ASSERT(cpuid1->nent == cpuid2->nent, + "CPUID nent mismatch: %d vs. %d", cpuid1->nent, cpuid2->nent); - TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index); -} + for (i = 0; i < cpuid1->nent; i++) { + e1 = &cpuid1->entries[i]; + e2 = &cpuid2->entries[i]; -static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2) -{ - int i; + TEST_ASSERT(e1->function == e2->function && + e1->index == e2->index && e1->flags == e2->flags, + "CPUID entries[%d] mismatch: 0x%x.%d.%x vs. 0x%x.%d.%x\n", + i, e1->function, e1->index, e1->flags, + e2->function, e2->index, e2->flags); - for (i = 0; i < cpuid1->nent; i++) - check_cpuid(cpuid2, &cpuid1->entries[i]); + if (is_cpuid_mangled(e1)) + continue; - for (i = 0; i < cpuid2->nent; i++) - check_cpuid(cpuid1, &cpuid2->entries[i]); + TEST_ASSERT(e1->eax == e2->eax && e1->ebx == e2->ebx && + e1->ecx == e2->ecx && e1->edx == e2->edx, + "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", + e1->function, e1->index, + e1->eax, e1->ebx, e1->ecx, e1->edx, + e2->eax, e2->ebx, e2->ecx, e2->edx); + } } static void run_vcpu(struct kvm_vcpu *vcpu, int stage) -- cgit v1.2.3 From fc66963d7b01e00ef0482bd7adbc8918343f81d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:42 +0000 Subject: KVM: selftests: Split out kvm_cpuid2_size() from allocate_kvm_cpuid2() Split out the computation of the effective size of a kvm_cpuid2 struct from allocate_kvm_cpuid2(), and modify both to take an arbitrary number of entries. Future commits will add caching of a vCPU's CPUID model, and will (a) be able to precisely size the entries array, and (b) will need to know the effective size of the struct in order to copy to/from the cache. Expose the helpers so that the Hyper-V Features test can use them in the (somewhat distant) future. The Hyper-V test very, very subtly relies on propagating CPUID info across vCPU instances, and will need to make a copy of the previous vCPU's CPUID information when it switches to using the per-vCPU cache. 
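For illustration, the copy the Hyper-V test will eventually need is exactly such a pair of one-liners ('src' below is a placeholder for the previous vCPU's CPUID):

	struct kvm_cpuid2 *copy = allocate_kvm_cpuid2(src->nent);

	memcpy(copy, src, kvm_cpuid2_size(src->nent));
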
Alternatively, KVM could provide helpers to duplicate and/or copy a kvm_cpuid2 instance, but each is literally a single line of code if the helpers are exposed, and it's not like the size of kvm_cpuid2 is secret knowledge. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-18-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 23 +++++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 48 ++++------------------ 2 files changed, 30 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a66d625da3d4..1326377f72b7 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -595,6 +595,29 @@ static inline bool kvm_cpu_has(struct kvm_x86_cpu_feature feature) return kvm_cpuid_has(kvm_get_supported_cpuid(), feature); } +static inline size_t kvm_cpuid2_size(int nr_entries) +{ + return sizeof(struct kvm_cpuid2) + + sizeof(struct kvm_cpuid_entry2) * nr_entries; +} + +/* + * Allocate a "struct kvm_cpuid2" instance, with the 0-length array of + * entries sized to hold @nr_entries. The caller is responsible for freeing + * the struct. + */ +static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) +{ + struct kvm_cpuid2 *cpuid; + + cpuid = malloc(kvm_cpuid2_size(nr_entries)); + TEST_ASSERT(cpuid, "-ENOMEM when allocating kvm_cpuid2"); + + cpuid->nent = nr_entries; + + return cpuid; +} + struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vcpu *vcpu); static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu, diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 10038496fff6..4d383dee30b7 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -16,6 +16,8 @@ #define DEFAULT_CODE_SELECTOR 0x8 #define DEFAULT_DATA_SELECTOR 0x10 +#define MAX_NR_CPUID_ENTRIES 100 + vm_vaddr_t exception_handlers; static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) @@ -672,40 +674,6 @@ struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id) return vcpu; } -/* - * Allocate an instance of struct kvm_cpuid2 - * - * Input Args: None - * - * Output Args: None - * - * Return: A pointer to the allocated struct. The caller is responsible - * for freeing this struct. - * - * Since kvm_cpuid2 uses a 0-length array to allow a the size of the - * array to be decided at allocation time, allocation is slightly - * complicated. This function uses a reasonable default length for - * the array and performs the appropriate allocation. 
- */ -static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) -{ - struct kvm_cpuid2 *cpuid; - int nent = 100; - size_t size; - - size = sizeof(*cpuid); - size += nent * sizeof(struct kvm_cpuid_entry2); - cpuid = malloc(size); - if (!cpuid) { - perror("malloc"); - abort(); - } - - cpuid->nent = nent; - - return cpuid; -} - /* * KVM Supported CPUID Get * @@ -725,7 +693,7 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) if (cpuid) return cpuid; - cpuid = allocate_kvm_cpuid2(); + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); kvm_fd = open_kvm_dev_path_or_exit(); kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); @@ -781,7 +749,7 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vcpu *vcpu) int max_ent; int rc = -1; - cpuid = allocate_kvm_cpuid2(); + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); max_ent = cpuid->nent; for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) { @@ -1295,7 +1263,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) if (cpuid) return cpuid; - cpuid = allocate_kvm_cpuid2(); + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); kvm_fd = open_kvm_dev_path_or_exit(); kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); @@ -1314,9 +1282,7 @@ void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) cpuid_sys = kvm_get_supported_cpuid(); cpuid_hv = kvm_get_supported_hv_cpuid(); - cpuid_full = malloc(sizeof(*cpuid_full) + - (cpuid_sys->nent + cpuid_hv->nent) * - sizeof(struct kvm_cpuid_entry2)); + cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); if (!cpuid_full) { perror("malloc"); abort(); @@ -1343,7 +1309,7 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 *cpuid; - cpuid = allocate_kvm_cpuid2(); + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); -- cgit v1.2.3 From 7fbc6038acbaa4c0c0b374aac635038881143e84 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:43 +0000 Subject: KVM: selftests: Cache CPUID in struct kvm_vcpu Cache a vCPU's CPUID information in "struct kvm_vcpu" to allow fixing the mess where tests, often unknowingly, modify the global/static "cpuid" allocated by kvm_get_supported_cpuid(). Add vcpu_init_cpuid() to handle stuffing an entirely different CPUID model, e.g. during vCPU creation or when switching to the Hyper-V enabled CPUID model. Automatically refresh the cache on vcpu_set_cpuid() so that any adjustments made by KVM are always reflected in the cache. Drop vcpu_get_cpuid() entirely to force tests to use the cache, and to allow adding e.g. vcpu_get_cpuid_entry() in the future without creating a conflicting set of APIs where vcpu_get_cpuid() does KVM_GET_CPUID2, but vcpu_get_cpuid_entry() does not. Opportunistically convert the VMX nested state test and KVM PV test to manipulating the vCPU's CPUID (because it's easy), but use vcpu_init_cpuid() for the Hyper-V features test and "emulator error" test to effectively retain their current behavior as they're less trivial to convert. 
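The resulting usage pattern, sketched for illustration with the APIs this patch introduces (the x2APIC tweak below is lifted from the xAPIC state test conversion in this very patch):

	/* Stuff a complete CPUID model, e.g. at vCPU creation: */
	vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());

	/*
	 * Tweak the cached copy in place, then push it to KVM.  On success
	 * the cache is refreshed via KVM_GET_CPUID2, so vcpu->cpuid always
	 * reflects any adjustments made by KVM.
	 */
	vcpu->cpuid->entries[i].ecx &= ~BIT(21);
	vcpu_set_cpuid(vcpu);
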
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-19-seanjc@google.com --- .../testing/selftests/kvm/include/kvm_util_base.h | 5 +++ .../selftests/kvm/include/x86_64/processor.h | 25 +++++++++---- tools/testing/selftests/kvm/lib/kvm_util.c | 7 ++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 42 +++++++++++----------- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 18 +++++----- .../selftests/kvm/x86_64/emulator_error_test.c | 2 +- .../testing/selftests/kvm/x86_64/hyperv_features.c | 2 +- tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 6 ++-- .../selftests/kvm/x86_64/monitor_mwait_test.c | 2 +- .../kvm/x86_64/vmx_set_nested_state_test.c | 6 ++-- .../selftests/kvm/x86_64/xapic_state_test.c | 4 +-- 11 files changed, 69 insertions(+), 50 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 19f7b33a1e58..24fde97f6121 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -50,6 +50,9 @@ struct kvm_vcpu { int fd; struct kvm_vm *vm; struct kvm_run *run; +#ifdef __x86_64__ + struct kvm_cpuid2 *cpuid; +#endif struct kvm_dirty_gfn *dirty_gfns; uint32_t fetch_index; uint32_t dirty_gfns_count; @@ -748,6 +751,8 @@ static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, return vm_arch_vcpu_recreate(vm, vcpu_id); } +void vcpu_arch_free(struct kvm_vcpu *vcpu); + void virt_arch_pgd_alloc(struct kvm_vm *vm); static inline void virt_pgd_alloc(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 1326377f72b7..c61b3aa4950f 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -618,18 +618,29 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) return cpuid; } -struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vcpu *vcpu); +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); -static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid) +static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu) { - return __vcpu_ioctl(vcpu, KVM_SET_CPUID2, cpuid); + int r; + + TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); + r = __vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); + if (r) + return r; + + /* On success, refresh the cache to pick up adjustments made by KVM. */ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); + return 0; } -static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid) +static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) { - vcpu_ioctl(vcpu, KVM_SET_CPUID2, cpuid); + TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); + vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); + + /* Refresh the cache to pick up adjustments made by KVM. 
*/ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); } struct kvm_cpuid_entry2 * diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 49c189878af3..9889fe0d8919 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -472,6 +472,11 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, return ®ion->region; } +__weak void vcpu_arch_free(struct kvm_vcpu *vcpu) +{ + +} + /* * VM VCPU Remove * @@ -501,6 +506,8 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); list_del(&vcpu->list); + + vcpu_arch_free(vcpu); free(vcpu); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 4d383dee30b7..4cbcc333bba0 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -648,7 +648,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, DEFAULT_GUEST_STACK_VADDR_MIN); vcpu = __vm_vcpu_add(vm, vcpu_id); - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); vcpu_setup(vm, vcpu); /* Setup guest general purpose registers */ @@ -669,11 +669,17 @@ struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id) { struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); return vcpu; } +void vcpu_arch_free(struct kvm_vcpu *vcpu) +{ + if (vcpu->cpuid) + free(vcpu->cpuid); +} + /* * KVM Supported CPUID Get * @@ -743,30 +749,22 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) return buffer.entry.data; } -struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vcpu *vcpu) +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) { - struct kvm_cpuid2 *cpuid; - int max_ent; - int rc = -1; - - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - max_ent = cpuid->nent; + TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID"); - for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) { - rc = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); - if (!rc) - break; - - TEST_ASSERT(rc == -1 && errno == E2BIG, - "KVM_GET_CPUID2 should either succeed or give E2BIG: %d %d", - rc, errno); + /* Allow overriding the default CPUID. */ + if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) { + free(vcpu->cpuid); + vcpu->cpuid = NULL; } - TEST_ASSERT(!rc, KVM_IOCTL_ERROR(KVM_GET_CPUID2, rc)); - return cpuid; -} - + if (!vcpu->cpuid) + vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent); + memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent)); + vcpu_set_cpuid(vcpu); +} /* * Locate a cpuid entry. 
@@ -1302,7 +1300,7 @@ void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) cpuid_full->nent = nent + cpuid_hv->nent; } - vcpu_set_cpuid(vcpu, cpuid_full); + vcpu_init_cpuid(vcpu, cpuid_full); } struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 59dcdf5a1220..96f141a758ca 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -144,21 +144,22 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct return guest_cpuids; } -static void set_cpuid_after_run(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +static void set_cpuid_after_run(struct kvm_vcpu *vcpu) { + struct kvm_cpuid2 *cpuid = vcpu->cpuid; struct kvm_cpuid_entry2 *ent; int rc; u32 eax, ebx, x; /* Setting unmodified CPUID is allowed */ - rc = __vcpu_set_cpuid(vcpu, cpuid); + rc = __vcpu_set_cpuid(vcpu); TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); /* Changing CPU features is forbidden */ ent = get_cpuid(cpuid, 0x7, 0); ebx = ent->ebx; ent->ebx--; - rc = __vcpu_set_cpuid(vcpu, cpuid); + rc = __vcpu_set_cpuid(vcpu); TEST_ASSERT(rc, "Changing CPU features should fail"); ent->ebx = ebx; @@ -167,14 +168,14 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) eax = ent->eax; x = eax & 0xff; ent->eax = (eax & ~0xffu) | (x - 1); - rc = __vcpu_set_cpuid(vcpu, cpuid); + rc = __vcpu_set_cpuid(vcpu); TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); ent->eax = eax; } int main(void) { - struct kvm_cpuid2 *supp_cpuid, *cpuid2; + struct kvm_cpuid2 *supp_cpuid; struct kvm_vcpu *vcpu; vm_vaddr_t cpuid_gva; struct kvm_vm *vm; @@ -183,18 +184,17 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, guest_main); supp_cpuid = kvm_get_supported_cpuid(); - cpuid2 = vcpu_get_cpuid(vcpu); - compare_cpuids(supp_cpuid, cpuid2); + compare_cpuids(supp_cpuid, vcpu->cpuid); - vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2); + vcpu_alloc_cpuid(vm, &cpuid_gva, vcpu->cpuid); vcpu_args_set(vcpu, 1, cpuid_gva); for (stage = 0; stage < 3; stage++) run_vcpu(vcpu, stage); - set_cpuid_after_run(vcpu, cpuid2); + set_cpuid_after_run(vcpu); kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index 3aa3d17f230f..d8dbed419638 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -171,7 +171,7 @@ int main(int argc, char *argv[]) entry->eax = (entry->eax & 0xffffff00) | MAXPHYADDR; set_cpuid(cpuid, entry); - vcpu_set_cpuid(vcpu, cpuid); + vcpu_init_cpuid(vcpu, cpuid); rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 2070ba0d6392..86ce1f1e98ee 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -102,7 +102,7 @@ static void hv_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, "failed to set HYPERV_CPUID_ENLIGHTMENT_INFO leaf"); TEST_ASSERT(set_cpuid(cpuid, dbg), "failed to set HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES leaf"); - vcpu_set_cpuid(vcpu, cpuid); + vcpu_init_cpuid(vcpu, cpuid); } static void guest_test_msrs_access(void) diff --git 
a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index ea452444f4af..986ca7282a09 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -147,7 +147,6 @@ static void enter_guest(struct kvm_vcpu *vcpu) int main(void) { - struct kvm_cpuid2 *best; struct kvm_vcpu *vcpu; struct kvm_vm *vm; @@ -157,9 +156,8 @@ int main(void) vcpu_enable_cap(vcpu, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); - best = kvm_get_supported_cpuid(); - clear_kvm_cpuid_features(best); - vcpu_set_cpuid(vcpu, best); + clear_kvm_cpuid_features(vcpu->cpuid); + vcpu_set_cpuid(vcpu); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index c804f6f04134..0ac5bec1e3b7 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -80,7 +80,7 @@ int main(int argc, char *argv[]) set_cpuid(cpuid, entry); vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_set_cpuid(vcpu, cpuid); + vcpu_set_cpuid(vcpu); run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 66cb2d0054e6..1cf78ec007f2 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -121,7 +121,7 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) test_nested_state(vcpu, state); /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_set_cpuid(vcpu); /* * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without @@ -245,7 +245,7 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) void disable_vmx(struct kvm_vcpu *vcpu) { - struct kvm_cpuid2 *cpuid = kvm_get_supported_cpuid(); + struct kvm_cpuid2 *cpuid = vcpu->cpuid; int i; for (i = 0; i < cpuid->nent; ++i) @@ -255,7 +255,7 @@ void disable_vmx(struct kvm_vcpu *vcpu) TEST_ASSERT(i != cpuid->nent, "CPUID function 1 not found"); cpuid->entries[i].ecx &= ~CPUID_VMX; - vcpu_set_cpuid(vcpu, cpuid); + vcpu_set_cpuid(vcpu); cpuid->entries[i].ecx |= CPUID_VMX; } diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 87531623064f..52133bb4d85a 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -152,13 +152,13 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); x.is_x2apic = false; - cpuid = vcpu_get_cpuid(x.vcpu); + cpuid = x.vcpu->cpuid; for (i = 0; i < cpuid->nent; i++) { if (cpuid->entries[i].function == 1) break; } cpuid->entries[i].ecx &= ~BIT(21); - vcpu_set_cpuid(x.vcpu, cpuid); + vcpu_set_cpuid(x.vcpu); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); test_icr(&x); -- cgit v1.2.3 From d838b313aadcf89d9b352ced1cd849d313971864 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:44 +0000 Subject: KVM: selftests: Don't use a static local in vcpu_get_supported_hv_cpuid() Don't use a static variable for the Hyper-V supported CPUID array, the helper unconditionally reallocates the array on every invocation (and all callers free the array immediately after use). 
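The caller-side pattern this implies, sketched for illustration:

	struct kvm_cpuid2 *hv_cpuid = vcpu_get_supported_hv_cpuid(vcpu);

	/* ... inspect hv_cpuid->entries ... */
	free(hv_cpuid);		/* every invocation allocates; the caller frees */
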
The array is intentionally recreated and refilled because the set of supported CPUID features is dependent on vCPU state, e.g. whether or not eVMCS has been enabled. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-20-seanjc@google.com --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 4cbcc333bba0..09e51514e1b9 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1305,9 +1305,7 @@ void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) { - static struct kvm_cpuid2 *cpuid; - - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); -- cgit v1.2.3 From 8b02674103e6f09f9f9397bfb7bfd60a9323c9c2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:45 +0000 Subject: KVM: selftests: Rename and tweak get_cpuid() to get_cpuid_entry() Rename get_cpuid() to get_cpuid_entry() to better reflect its behavior. Leave set_cpuid() as is to avoid unnecessary churn; that helper will soon be removed entirely. Opportunistically tweak the implementation to avoid using a temporary variable in anticipation of tagging the input @cpuid with "const". No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-21-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 4 ++-- tools/testing/selftests/kvm/lib/x86_64/processor.c | 11 +++++------ tools/testing/selftests/kvm/x86_64/cpuid_test.c | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index c61b3aa4950f..1703e07f8b83 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -766,8 +766,8 @@ void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, /* * get_cpuid() - find matching CPUID entry and return pointer to it. */ -struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, - uint32_t index); +struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index); /* * set_cpuid() - overwrites a matching cpuid entry with the provided value. * matches based on ent->function && ent->index. 
returns true diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 09e51514e1b9..fd8577a12678 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1207,16 +1207,15 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) } } -struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, - uint32_t index) +struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index) { int i; for (i = 0; i < cpuid->nent; i++) { - struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; - - if (cur->function == function && cur->index == index) - return cur; + if (cpuid->entries[i].function == function && + cpuid->entries[i].index == index) + return &cpuid->entries[i]; } TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 96f141a758ca..a9345cee9bbc 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -156,7 +156,7 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); /* Changing CPU features is forbidden */ - ent = get_cpuid(cpuid, 0x7, 0); + ent = get_cpuid_entry(cpuid, 0x7, 0); ebx = ent->ebx; ent->ebx--; rc = __vcpu_set_cpuid(vcpu); @@ -164,7 +164,7 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) ent->ebx = ebx; /* Changing MAXPHYADDR is forbidden */ - ent = get_cpuid(cpuid, 0x80000008, 0); + ent = get_cpuid_entry(cpuid, 0x80000008, 0); eax = ent->eax; x = eax & 0xff; ent->eax = (eax & ~0xffu) | (x - 1); -- cgit v1.2.3 From 662162fed26137651cc69971555c3f5a984345d7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:46 +0000 Subject: KVM: selftests: Use get_cpuid_entry() in kvm_get_supported_cpuid_index() Use get_cpuid_entry() in kvm_get_supported_cpuid_index() to replace functionally identical code. 
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-22-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 14 +++++----- tools/testing/selftests/kvm/lib/x86_64/processor.c | 32 ---------------------- 2 files changed, 7 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 1703e07f8b83..db8d8a84fbc3 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -618,6 +618,8 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) return cpuid; } +struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index); void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu) @@ -643,8 +645,11 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); } -struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_index(uint32_t function, uint32_t index); +static inline struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_index(uint32_t function, + uint32_t index) +{ + return get_cpuid_entry(kvm_get_supported_cpuid(), function, index); +} static inline struct kvm_cpuid_entry2 * kvm_get_supported_cpuid_entry(uint32_t function) @@ -763,11 +768,6 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr, uint64_t pte); -/* - * get_cpuid() - find matching CPUID entry and return pointer to it. - */ -struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, - uint32_t function, uint32_t index); /* * set_cpuid() - overwrites a matching cpuid entry with the provided value. * matches based on ent->function && ent->index. returns true diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index fd8577a12678..5f359314a6ec 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -766,38 +766,6 @@ void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) vcpu_set_cpuid(vcpu); } -/* - * Locate a cpuid entry. - * - * Input Args: - * function: The function of the cpuid entry to find. - * index: The index of the cpuid entry. - * - * Output Args: None - * - * Return: A pointer to the cpuid entry. Never returns NULL. - */ -struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) -{ - struct kvm_cpuid2 *cpuid; - struct kvm_cpuid_entry2 *entry = NULL; - int i; - - cpuid = kvm_get_supported_cpuid(); - for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == function && - cpuid->entries[i].index == index) { - entry = &cpuid->entries[i]; - break; - } - } - - TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).", - function, index); - return entry; -} - uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index) { struct { -- cgit v1.2.3 From c41880b5f040120fc27eb2305a0ab3f179c89f9a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:47 +0000 Subject: KVM: selftests: Add helpers to get and modify a vCPU's CPUID entries Add helpers to get a specific CPUID entry for a given vCPU, and to toggle a specific CPUID-based feature for a vCPU. 
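For example (an illustrative sketch of the new API surface; both calls appear in the hunks below):

	/* Read an entry from the vCPU's cached CPUID model... */
	entry = vcpu_get_cpuid_entry(vcpu, 0x80000008);

	/* ...or flip a single feature bit and push the result to KVM. */
	vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT);
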
The helpers will reduce the amount of boilerplate code needed to tweak a vCPU's CPUID model, improve code clarity, and most importantly move tests away from modifying the static "cpuid" returned by kvm_get_supported_cpuid(). Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-23-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 30 ++++++++++++++++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 18 +++++++++++++ 2 files changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index db8d8a84fbc3..25b99eab7e71 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -622,6 +622,19 @@ struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, uint32_t function, uint32_t index); void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); +static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function, + uint32_t index) +{ + return get_cpuid_entry(vcpu->cpuid, function, index); +} + +static inline struct kvm_cpuid_entry2 *vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function) +{ + return __vcpu_get_cpuid_entry(vcpu, function, 0); +} + static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu) { int r; @@ -645,6 +658,23 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); } +void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature, + bool set); + +static inline void vcpu_set_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + vcpu_set_or_clear_cpuid_feature(vcpu, feature, true); + +} + +static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + vcpu_set_or_clear_cpuid_feature(vcpu, feature, false); +} + static inline struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) { diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 5f359314a6ec..72aac618e0e4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -766,6 +766,24 @@ void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) vcpu_set_cpuid(vcpu); } +void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature, + bool set) +{ + struct kvm_cpuid_entry2 *entry; + u32 *reg; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + reg = (&entry->eax) + feature.reg; + + if (set) + *reg |= BIT(feature.bit); + else + *reg &= ~BIT(feature.bit); + + vcpu_set_cpuid(vcpu); +} + uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index) { struct { -- cgit v1.2.3 From 7af7161d87383f84caec1e8aaf6366c31ce845ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:48 +0000 Subject: KVM: selftests: Use vm->pa_bits to generate reserved PA bits Use vm->pa_bits to generate the mask of physical address bits that are reserved in page table entries. vm->pa_bits is set when the VM is created, i.e. it's guaranteed to be valid when populating page tables. 
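For example, a VM created with vm->pa_bits == 40 yields a reserved mask covering PTE bits 51:40:

	rsvd_mask = GENMASK_ULL(51, 40);	/* == 0x000fff0000000000 */
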
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-24-seanjc@google.com --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 72aac618e0e4..080480c397a4 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -221,16 +221,12 @@ static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, uint16_t index[4]; uint64_t *pml4e, *pdpe, *pde; uint64_t *pte; - struct kvm_cpuid_entry2 *entry; struct kvm_sregs sregs; - int max_phy_addr; uint64_t rsvd_mask = 0; - entry = kvm_get_supported_cpuid_index(0x80000008, 0); - max_phy_addr = entry->eax & 0x000000ff; /* Set the high bits in the reserved mask. */ - if (max_phy_addr < 52) - rsvd_mask = GENMASK_ULL(51, max_phy_addr); + if (vm->pa_bits < 52) + rsvd_mask = GENMASK_ULL(51, vm->pa_bits); /* * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries -- cgit v1.2.3 From 1940af0b8179ae2c2bd287fbd3edaab59df5fb55 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:49 +0000 Subject: KVM: selftests: Add and use helper to set vCPU's CPUID maxphyaddr Add a helper to set a vCPU's guest.MAXPHYADDR, and use it in the test that verifies the emulator returns an error on an unknown instruction when KVM emulates in response to an EPT violation with a GPA that is legal in hardware but illegal with respect to the guest's MAXPHYADDR. Add a helper even though there's only a single user at this time. Before its removal, mmu_role_test also stuffed guest.MAXPHYADDR, and the helper provides a small amount of clarity. More importantly, this eliminates a set_cpuid() user and an instance of modifying kvm_get_supported_cpuid()'s static "cpuid". 
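An illustrative (hypothetical) sanity check of the helper's effect; guest.MAXPHYADDR lives in bits 7:0 of CPUID.0x80000008:EAX, which is why the helper rewrites only the low byte:

	vcpu_set_cpuid_maxphyaddr(vcpu, 36);
	entry = vcpu_get_cpuid_entry(vcpu, 0x80000008);
	TEST_ASSERT((entry->eax & 0xff) == 36, "guest.MAXPHYADDR not applied");
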
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-25-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 ++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 8 ++++++++ tools/testing/selftests/kvm/x86_64/emulator_error_test.c | 10 +--------- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 25b99eab7e71..a3d61807d223 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -658,6 +658,8 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); } +void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); + void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, struct kvm_x86_cpu_feature feature, bool set); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 080480c397a4..27bf5dad6ec5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -762,6 +762,14 @@ void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) vcpu_set_cpuid(vcpu); } +void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr) +{ + struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, 0x80000008); + + entry->eax = (entry->eax & ~0xff) | maxphyaddr; + vcpu_set_cpuid(vcpu); +} + void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, struct kvm_x86_cpu_feature feature, bool set) diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c index d8dbed419638..236e11755ba6 100644 --- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c +++ b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c @@ -150,8 +150,6 @@ static uint64_t process_ucall(struct kvm_vcpu *vcpu) int main(int argc, char *argv[]) { - struct kvm_cpuid_entry2 *entry; - struct kvm_cpuid2 *cpuid; struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t gpa, pte; @@ -165,13 +163,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, guest_code); - cpuid = kvm_get_supported_cpuid(); - - entry = kvm_get_supported_cpuid_index(0x80000008, 0); - entry->eax = (entry->eax & 0xffffff00) | MAXPHYADDR; - set_cpuid(cpuid, entry); - - vcpu_init_cpuid(vcpu, cpuid); + vcpu_set_cpuid_maxphyaddr(vcpu, MAXPHYADDR); rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); -- cgit v1.2.3 From b78843be77968b1e5a071c7ed7fd8f3094e8f0a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Jul 2022 14:42:49 -0700 Subject: KVM: selftests: Use vcpu_clear_cpuid_feature() in monitor_mwait_test Use vcpu_clear_cpuid_feature() to clear the MONITOR/MWAIT CPUID feature bit in the MONITOR/MWAIT quirk test. 
Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 0ac5bec1e3b7..016070cad36e 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -63,8 +63,6 @@ static void guest_code(void) int main(int argc, char *argv[]) { uint64_t disabled_quirks; - struct kvm_cpuid2 *cpuid; - struct kvm_cpuid_entry2 *entry; struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; @@ -73,14 +71,8 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); - cpuid = kvm_get_supported_cpuid(); - - entry = kvm_get_supported_cpuid_index(1, 0); - entry->ecx &= ~CPUID_MWAIT; - set_cpuid(cpuid, entry); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_set_cpuid(vcpu); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); run = vcpu->run; -- cgit v1.2.3 From 3a5d36b32bd26d88d8dfe8f1eff702e138ea18cd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:50 +0000 Subject: KVM: selftests: Use vcpu_get_cpuid_entry() in PV features test (sort of) Add a new helper, vcpu_clear_cpuid_entry(), to do a RMW operation on the vCPU's CPUID model to clear a given CPUID entry, and use it to clear KVM's paravirt feature instead of operating on kvm_get_supported_cpuid()'s static "cpuid" variable. This also eliminates a user of the soon-to-be-defunct set_cpuid() helper. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-26-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/x86_64/processor.c | 11 +++++++++++ tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 12 +----------- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a3d61807d223..c0d40474ef2d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -660,6 +660,7 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); +void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, struct kvm_x86_cpu_feature feature, bool set); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 27bf5dad6ec5..a41384863336 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -770,6 +770,17 @@ void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr) vcpu_set_cpuid(vcpu); } +void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function) +{ + struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function); + + entry->eax = 0; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + vcpu_set_cpuid(vcpu); +} + void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, struct kvm_x86_cpu_feature feature, bool set) diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index ea452444f4af..619655c1a1f3 100644 --- 
a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -95,15 +95,6 @@ static void guest_main(void) GUEST_DONE(); } -static void clear_kvm_cpuid_features(struct kvm_cpuid2 *cpuid) -{ - struct kvm_cpuid_entry2 ent = {0}; - - ent.function = KVM_CPUID_FEATURES; - TEST_ASSERT(set_cpuid(cpuid, &ent), - "failed to clear KVM_CPUID_FEATURES leaf"); -} - static void pr_msr(struct ucall *uc) { struct msr_data *msr = (struct msr_data *)uc->args[0]; @@ -156,8 +147,7 @@ int main(void) vcpu_enable_cap(vcpu, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); - clear_kvm_cpuid_features(vcpu->cpuid); - vcpu_set_cpuid(vcpu); + vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); -- cgit v1.2.3 From 4dcd130c9b3d64464b5c44aba071203095f77e63 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:51 +0000 Subject: KVM: selftests: Use vCPU's CPUID directly in Hyper-V test Use the vCPU's persistent CPUID array directly when manipulating the set of exposed Hyper-V CPUID features. Drop set_cpuid() to route all future modification through the vCPU helpers; the Hyper-V features test was the last user. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-27-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 9 -- tools/testing/selftests/kvm/lib/x86_64/processor.c | 18 --- .../testing/selftests/kvm/x86_64/hyperv_features.c | 126 +++++++++++---------- 3 files changed, 64 insertions(+), 89 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index c0d40474ef2d..50d68114ab86 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -801,15 +801,6 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr, uint64_t pte); -/* - * set_cpuid() - overwrites a matching cpuid entry with the provided value. - * matches based on ent->function && ent->index. returns true - * if a match was found and successfully overwritten. - * @cpuid: the kvm cpuid list to modify. 
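The per-stage flow this enables, sketched (it mirrors the hunks below):

	feat = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
	feat->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;	/* edit the cached model */
	vcpu_set_cpuid(vcpu);				/* push the change to KVM */
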
- * @ent: cpuid entry to insert - */ -bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent); - uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index a41384863336..7cc0814f0b67 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1224,24 +1224,6 @@ struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, return NULL; } -bool set_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 *ent) -{ - int i; - - for (i = 0; i < cpuid->nent; i++) { - struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; - - if (cur->function != ent->function || cur->index != ent->index) - continue; - - memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2)); - return true; - } - - return false; -} - uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3) { diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 86ce1f1e98ee..79ab0152d281 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -91,37 +91,28 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) GUEST_DONE(); } -static void hv_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 *feat, - struct kvm_cpuid_entry2 *recomm, - struct kvm_cpuid_entry2 *dbg) +static void vcpu_reset_hv_cpuid(struct kvm_vcpu *vcpu) { - TEST_ASSERT(set_cpuid(cpuid, feat), - "failed to set KVM_CPUID_FEATURES leaf"); - TEST_ASSERT(set_cpuid(cpuid, recomm), - "failed to set HYPERV_CPUID_ENLIGHTMENT_INFO leaf"); - TEST_ASSERT(set_cpuid(cpuid, dbg), - "failed to set HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES leaf"); - vcpu_init_cpuid(vcpu, cpuid); + /* + * Enable all supported Hyper-V features, then clear the leafs holding + * the features that will be tested one by one. 
+ */ + vcpu_set_hv_cpuid(vcpu); + + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); } static void guest_test_msrs_access(void) { + struct kvm_cpuid2 *prev_cpuid = NULL; + struct kvm_cpuid_entry2 *feat, *dbg; struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; int stage = 0; - struct kvm_cpuid_entry2 feat = { - .function = HYPERV_CPUID_FEATURES - }; - struct kvm_cpuid_entry2 recomm = { - .function = HYPERV_CPUID_ENLIGHTMENT_INFO - }; - struct kvm_cpuid_entry2 dbg = { - .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES - }; - struct kvm_cpuid2 *best; vm_vaddr_t msr_gva; struct msr_data *msr; @@ -135,9 +126,16 @@ static void guest_test_msrs_access(void) vcpu_args_set(vcpu, 1, msr_gva); vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - vcpu_set_hv_cpuid(vcpu); + if (!prev_cpuid) { + vcpu_reset_hv_cpuid(vcpu); - best = kvm_get_supported_hv_cpuid(); + prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); + } else { + vcpu_init_cpuid(vcpu, prev_cpuid); + } + + feat = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + dbg = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); @@ -163,7 +161,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 2: - feat.eax |= HV_MSR_HYPERCALL_AVAILABLE; + feat->eax |= HV_MSR_HYPERCALL_AVAILABLE; /* * HV_X64_MSR_GUEST_OS_ID has to be written first to make * HV_X64_MSR_HYPERCALL available. @@ -190,7 +188,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 6: - feat.eax |= HV_MSR_VP_RUNTIME_AVAILABLE; + feat->eax |= HV_MSR_VP_RUNTIME_AVAILABLE; msr->idx = HV_X64_MSR_VP_RUNTIME; msr->write = 0; msr->available = 1; @@ -209,7 +207,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 9: - feat.eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; + feat->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; msr->idx = HV_X64_MSR_TIME_REF_COUNT; msr->write = 0; msr->available = 1; @@ -228,7 +226,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 12: - feat.eax |= HV_MSR_VP_INDEX_AVAILABLE; + feat->eax |= HV_MSR_VP_INDEX_AVAILABLE; msr->idx = HV_X64_MSR_VP_INDEX; msr->write = 0; msr->available = 1; @@ -247,7 +245,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 15: - feat.eax |= HV_MSR_RESET_AVAILABLE; + feat->eax |= HV_MSR_RESET_AVAILABLE; msr->idx = HV_X64_MSR_RESET; msr->write = 0; msr->available = 1; @@ -265,7 +263,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 18: - feat.eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; + feat->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; msr->idx = HV_X64_MSR_REFERENCE_TSC; msr->write = 0; msr->available = 1; @@ -292,7 +290,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 22: - feat.eax |= HV_MSR_SYNIC_AVAILABLE; + feat->eax |= HV_MSR_SYNIC_AVAILABLE; msr->idx = HV_X64_MSR_EOM; msr->write = 0; msr->available = 1; @@ -310,7 +308,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 25: - feat.eax |= HV_MSR_SYNTIMER_AVAILABLE; + feat->eax |= HV_MSR_SYNTIMER_AVAILABLE; msr->idx = HV_X64_MSR_STIMER0_CONFIG; msr->write = 0; msr->available = 1; @@ -329,7 +327,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 28: - feat.edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + feat->edx |= 
HV_STIMER_DIRECT_MODE_AVAILABLE; msr->idx = HV_X64_MSR_STIMER0_CONFIG; msr->write = 1; msr->write_val = 1 << 12; @@ -342,7 +340,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 30: - feat.eax |= HV_MSR_APIC_ACCESS_AVAILABLE; + feat->eax |= HV_MSR_APIC_ACCESS_AVAILABLE; msr->idx = HV_X64_MSR_EOI; msr->write = 1; msr->write_val = 1; @@ -355,7 +353,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 32: - feat.eax |= HV_ACCESS_FREQUENCY_MSRS; + feat->eax |= HV_ACCESS_FREQUENCY_MSRS; msr->idx = HV_X64_MSR_TSC_FREQUENCY; msr->write = 0; msr->available = 1; @@ -374,7 +372,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 35: - feat.eax |= HV_ACCESS_REENLIGHTENMENT; + feat->eax |= HV_ACCESS_REENLIGHTENMENT; msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; msr->write = 0; msr->available = 1; @@ -399,7 +397,7 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 39: - feat.edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + feat->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; msr->idx = HV_X64_MSR_CRASH_P0; msr->write = 0; msr->available = 1; @@ -417,8 +415,8 @@ static void guest_test_msrs_access(void) msr->available = 0; break; case 42: - feat.edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; - dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + dbg->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; msr->idx = HV_X64_MSR_SYNDBG_STATUS; msr->write = 0; msr->available = 1; @@ -435,7 +433,9 @@ static void guest_test_msrs_access(void) return; } - hv_set_cpuid(vcpu, best, &feat, &recomm, &dbg); + vcpu_set_cpuid(vcpu); + + memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent)); pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, msr->idx, msr->write ? 
"write" : "read"); @@ -463,24 +463,15 @@ static void guest_test_msrs_access(void) static void guest_test_hcalls_access(void) { + struct kvm_cpuid_entry2 *feat, *recomm, *dbg; + struct kvm_cpuid2 *prev_cpuid = NULL; struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; int stage = 0; - struct kvm_cpuid_entry2 feat = { - .function = HYPERV_CPUID_FEATURES, - .eax = HV_MSR_HYPERCALL_AVAILABLE - }; - struct kvm_cpuid_entry2 recomm = { - .function = HYPERV_CPUID_ENLIGHTMENT_INFO - }; - struct kvm_cpuid_entry2 dbg = { - .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES - }; vm_vaddr_t hcall_page, hcall_params; struct hcall_data *hcall; - struct kvm_cpuid2 *best; while (true) { vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); @@ -499,14 +490,23 @@ static void guest_test_hcalls_access(void) vcpu_args_set(vcpu, 2, addr_gva2gpa(vm, hcall_page), hcall_params); vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); - vcpu_set_hv_cpuid(vcpu); + if (!prev_cpuid) { + vcpu_reset_hv_cpuid(vcpu); - best = kvm_get_supported_hv_cpuid(); + prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); + } else { + vcpu_init_cpuid(vcpu, prev_cpuid); + } + + feat = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + recomm = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); + dbg = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); run = vcpu->run; switch (stage) { case 0: + feat->eax |= HV_MSR_HYPERCALL_AVAILABLE; hcall->control = 0xdeadbeef; hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -516,7 +516,7 @@ static void guest_test_hcalls_access(void) hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 2: - feat.ebx |= HV_POST_MESSAGES; + feat->ebx |= HV_POST_MESSAGES; hcall->control = HVCALL_POST_MESSAGE; hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; break; @@ -526,7 +526,7 @@ static void guest_test_hcalls_access(void) hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 4: - feat.ebx |= HV_SIGNAL_EVENTS; + feat->ebx |= HV_SIGNAL_EVENTS; hcall->control = HVCALL_SIGNAL_EVENT; hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; break; @@ -536,12 +536,12 @@ static void guest_test_hcalls_access(void) hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; break; case 6: - dbg.eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + dbg->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; hcall->control = HVCALL_RESET_DEBUG_SESSION; hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 7: - feat.ebx |= HV_DEBUGGING; + feat->ebx |= HV_DEBUGGING; hcall->control = HVCALL_RESET_DEBUG_SESSION; hcall->expect = HV_STATUS_OPERATION_DENIED; break; @@ -551,7 +551,7 @@ static void guest_test_hcalls_access(void) hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 9: - recomm.eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + recomm->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; hcall->expect = HV_STATUS_SUCCESS; break; @@ -560,7 +560,7 @@ static void guest_test_hcalls_access(void) hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 11: - recomm.eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; + recomm->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; hcall->expect = HV_STATUS_SUCCESS; break; @@ -570,7 +570,7 @@ static void guest_test_hcalls_access(void) hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 13: - recomm.eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + recomm->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; hcall->control = HVCALL_SEND_IPI; hcall->expect = 
HV_STATUS_INVALID_HYPERCALL_INPUT; break; @@ -585,7 +585,7 @@ static void guest_test_hcalls_access(void) hcall->expect = HV_STATUS_ACCESS_DENIED; break; case 16: - recomm.ebx = 0xfff; + recomm->ebx = 0xfff; hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; hcall->expect = HV_STATUS_SUCCESS; break; @@ -595,7 +595,7 @@ static void guest_test_hcalls_access(void) hcall->ud_expected = true; break; case 18: - feat.edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; + feat->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; hcall->ud_expected = false; hcall->expect = HV_STATUS_SUCCESS; @@ -605,7 +605,9 @@ static void guest_test_hcalls_access(void) return; } - hv_set_cpuid(vcpu, best, &feat, &recomm, &dbg); + vcpu_set_cpuid(vcpu); + + memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent)); pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, hcall->control); -- cgit v1.2.3 From 49f6876a2e1e7e8a4dc3f2b27f720502d62e4804 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:52 +0000 Subject: KVM: selftests: Use vcpu_get_cpuid_entry() in CPUID test Use vcpu_get_cpuid_entry() instead of an open coded equivalent in the CPUID test. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-28-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index a9345cee9bbc..7bbc1837b389 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -146,7 +146,6 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct static void set_cpuid_after_run(struct kvm_vcpu *vcpu) { - struct kvm_cpuid2 *cpuid = vcpu->cpuid; struct kvm_cpuid_entry2 *ent; int rc; u32 eax, ebx, x; @@ -156,7 +155,7 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); /* Changing CPU features is forbidden */ - ent = get_cpuid_entry(cpuid, 0x7, 0); + ent = vcpu_get_cpuid_entry(vcpu, 0x7); ebx = ent->ebx; ent->ebx--; rc = __vcpu_set_cpuid(vcpu); @@ -164,7 +163,7 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) ent->ebx = ebx; /* Changing MAXPHYADDR is forbidden */ - ent = get_cpuid_entry(cpuid, 0x80000008, 0); + ent = vcpu_get_cpuid_entry(vcpu, 0x80000008); eax = ent->eax; x = eax & 0xff; ent->eax = (eax & ~0xffu) | (x - 1); -- cgit v1.2.3 From 4ee315231e3d7f85ab672fe231421614cad8f68d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:53 +0000 Subject: KVM: selftests: Use vcpu_{set,clear}_cpuid_feature() in nVMX state test Use vcpu_{set,clear}_cpuid_feature() to toggle nested VMX support in the vCPU CPUID module in the nVMX state test. Drop CPUID_VMX as there are no longer any users. 
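As an illustrative sketch (both helper calls are taken verbatim from the diff below), the open-coded disable_vmx() walk over vcpu->cpuid collapses to:

	/* Disable VMX to exercise the error-handling paths... */
	vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX);
	/* ...then flip CPUID.1.ECX[5] back on for the real tests. */
	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX);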
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-29-seanjc@google.com --- .../testing/selftests/kvm/include/x86_64/processor.h | 1 - .../selftests/kvm/x86_64/vmx_set_nested_state_test.c | 20 ++------------------ 2 files changed, 2 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 50d68114ab86..7c0396788f76 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -160,7 +160,6 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) /* CPUID.1.ECX */ -#define CPUID_VMX (1ul << 5) #define CPUID_XSAVE (1ul << 26) #define CPUID_OSXSAVE (1ul << 27) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 1cf78ec007f2..41ea7028a1f8 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -121,7 +121,7 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) test_nested_state(vcpu, state); /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid(vcpu); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); /* * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without @@ -243,22 +243,6 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) free(state); } -void disable_vmx(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid2 *cpuid = vcpu->cpuid; - int i; - - for (i = 0; i < cpuid->nent; ++i) - if (cpuid->entries[i].function == 1 && - cpuid->entries[i].index == 0) - break; - TEST_ASSERT(i != cpuid->nent, "CPUID function 1 not found"); - - cpuid->entries[i].ecx &= ~CPUID_VMX; - vcpu_set_cpuid(vcpu); - cpuid->entries[i].ecx |= CPUID_VMX; -} - int main(int argc, char *argv[]) { struct kvm_vm *vm; @@ -280,7 +264,7 @@ int main(int argc, char *argv[]) /* * First run tests with VMX disabled to check error handling. */ - disable_vmx(vcpu); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); /* Passing a NULL kvm_nested_state causes a EFAULT. */ test_nested_state_expect_efault(vcpu, NULL); -- cgit v1.2.3 From 7ed5a54e8282edfb464ccce4d3cf3a6f1d79b14a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:54 +0000 Subject: KVM: selftests: Use vcpu_clear_cpuid_feature() to clear x2APIC Add X86_FEATURE_X2APIC and use vcpu_clear_cpuid_feature() to clear x2APIC support in the xAPIC state test. 
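For illustration, the manual walk over vcpu->cpuid->entries reduces to a single call against the new CPUID.1.ECX[21] definition (both lines appear in the hunks below):

	#define X86_FEATURE_X2APIC	KVM_X86_CPU_FEATURE(0x1, 0, ECX, 21)

	vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC);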
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-30-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/x86_64/xapic_state_test.c | 10 +--------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 7c0396788f76..db90a80dd09a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -83,6 +83,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_SMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 6) #define X86_FEATURE_PDCM KVM_X86_CPU_FEATURE(0x1, 0, ECX, 15) #define X86_FEATURE_PCID KVM_X86_CPU_FEATURE(0x1, 0, ECX, 17) +#define X86_FEATURE_X2APIC KVM_X86_CPU_FEATURE(0x1, 0, ECX, 21) #define X86_FEATURE_MOVBE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 22) #define X86_FEATURE_TSC_DEADLINE_TIMER KVM_X86_CPU_FEATURE(0x1, 0, ECX, 24) #define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26) diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 52133bb4d85a..6f7a5ef66718 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -136,9 +136,7 @@ int main(int argc, char *argv[]) { .vcpu = NULL, .is_x2apic = true, }; - struct kvm_cpuid2 *cpuid; struct kvm_vm *vm; - int i; vm = vm_create_with_one_vcpu(&x.vcpu, x2apic_guest_code); test_icr(&x); @@ -152,13 +150,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); x.is_x2apic = false; - cpuid = x.vcpu->cpuid; - for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == 1) - break; - } - cpuid->entries[i].ecx &= ~BIT(21); - vcpu_set_cpuid(x.vcpu); + vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); test_icr(&x); -- cgit v1.2.3 From 813e38cd6d7b4247314427a901015867e0534356 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:55 +0000 Subject: KVM: selftests: Make get_supported_cpuid() return "const" Tag the returned CPUID pointers from kvm_get_supported_cpuid(), kvm_get_supported_hv_cpuid(), and vcpu_get_supported_hv_cpuid() "const" to prevent reintroducing the broken pattern of modifying the static "cpuid" variable used by kvm_get_supported_cpuid() to cache the results of KVM_GET_SUPPORTED_CPUID. Update downstream consumers as needed. 
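On the consumer side, the "const" tag means writable access has to come from a caller-owned copy; a minimal sketch based on the hyperv_cpuid.c hunks below:

	const struct kvm_cpuid2 *hv = vcpu_get_supported_hv_cpuid(vcpu);

	test_hv_cpuid(hv, false);
	free((void *)hv);	/* the per-vCPU copy is caller-owned; cast away const only to free it */

The pointer cached by kvm_get_supported_cpuid(), by contrast, must never be modified or freed.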
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-31-seanjc@google.com --- .../selftests/kvm/include/x86_64/processor.h | 24 +++++++++---------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 27 +++++++--------------- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 12 ++++------ tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 10 ++++---- .../selftests/kvm/x86_64/pmu_event_filter_test.c | 10 ++++---- .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 2 +- 6 files changed, 36 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index db90a80dd09a..d1c0cff24779 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -585,7 +585,9 @@ static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) vcpu_ioctl(vcpu, KVM_SET_XCRS, xcrs); } -struct kvm_cpuid2 *kvm_get_supported_cpuid(void); +const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, struct kvm_x86_cpu_feature feature); @@ -618,15 +620,17 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) return cpuid; } -struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, - uint32_t function, uint32_t index); -void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid); +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index); +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index) { - return get_cpuid_entry(vcpu->cpuid, function, index); + return (struct kvm_cpuid_entry2 *)get_cpuid_entry(vcpu->cpuid, + function, index); } static inline struct kvm_cpuid_entry2 *vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, @@ -678,14 +682,13 @@ static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu, vcpu_set_or_clear_cpuid_feature(vcpu, feature, false); } -static inline struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_index(uint32_t function, - uint32_t index) +static inline const struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_index(uint32_t function, + uint32_t index) { return get_cpuid_entry(kvm_get_supported_cpuid(), function, index); } -static inline struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_entry(uint32_t function) +static inline const struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_entry(uint32_t function) { return kvm_get_supported_cpuid_index(function, 0); } @@ -804,9 +807,6 @@ void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); -struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); -void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); -struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); void vm_xsave_req_perm(int bit); enum pg_level { diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 7cc0814f0b67..8677c6063388 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ 
b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -676,18 +676,7 @@ void vcpu_arch_free(struct kvm_vcpu *vcpu) free(vcpu->cpuid); } -/* - * KVM Supported CPUID Get - * - * Input Args: None - * - * Output Args: - * - * Return: The supported KVM CPUID - * - * Get the guest CPUID supported by KVM. - */ -struct kvm_cpuid2 *kvm_get_supported_cpuid(void) +const struct kvm_cpuid2 *kvm_get_supported_cpuid(void) { static struct kvm_cpuid2 *cpuid; int kvm_fd; @@ -745,7 +734,7 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) return buffer.entry.data; } -void vcpu_init_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid) +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid) { TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID"); @@ -1079,7 +1068,7 @@ uint32_t kvm_get_cpuid_max_extended(void) void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) { - struct kvm_cpuid_entry2 *entry; + const struct kvm_cpuid_entry2 *entry; bool pae; /* SDM 4.1.4 */ @@ -1208,8 +1197,8 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) } } -struct kvm_cpuid_entry2 *get_cpuid_entry(struct kvm_cpuid2 *cpuid, - uint32_t function, uint32_t index) +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index) { int i; @@ -1235,7 +1224,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, return r; } -struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) { static struct kvm_cpuid2 *cpuid; int kvm_fd; @@ -1255,7 +1244,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 *cpuid_full; - struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; int i, nent = 0; if (!cpuid_full) { @@ -1285,7 +1274,7 @@ void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) vcpu_init_cpuid(vcpu, cpuid_full); } -struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 7bbc1837b389..d8ae4a0e00a4 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -66,7 +66,7 @@ static void guest_main(struct kvm_cpuid2 *guest_cpuid) GUEST_DONE(); } -static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie) +static bool is_cpuid_mangled(const struct kvm_cpuid_entry2 *entrie) { int i; @@ -79,9 +79,10 @@ static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie) return false; } -static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2) +static void compare_cpuids(const struct kvm_cpuid2 *cpuid1, + const struct kvm_cpuid2 *cpuid2) { - struct kvm_cpuid_entry2 *e1, *e2; + const struct kvm_cpuid_entry2 *e1, *e2; int i; TEST_ASSERT(cpuid1->nent == cpuid2->nent, @@ -174,7 +175,6 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) int main(void) { - struct kvm_cpuid2 *supp_cpuid; struct kvm_vcpu *vcpu; vm_vaddr_t cpuid_gva; struct kvm_vm *vm; @@ -182,9 +182,7 @@ int main(void) vm = vm_create_with_one_vcpu(&vcpu, guest_main); - supp_cpuid = kvm_get_supported_cpuid(); - - compare_cpuids(supp_cpuid, vcpu->cpuid); + compare_cpuids(kvm_get_supported_cpuid(), vcpu->cpuid); vcpu_alloc_cpuid(vm, 
&cpuid_gva, vcpu->cpuid); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index c406b95cba9b..e804eb08dff9 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -43,7 +43,7 @@ static bool smt_possible(void) return res; } -static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, +static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, bool evmcs_expected) { int i; @@ -56,7 +56,7 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, nent_expected, hv_cpuid_entries->nent); for (i = 0; i < hv_cpuid_entries->nent; i++) { - struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; + const struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; TEST_ASSERT((entry->function >= 0x40000000) && (entry->function <= 0x40000082), @@ -131,7 +131,7 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) int main(int argc, char *argv[]) { struct kvm_vm *vm; - struct kvm_cpuid2 *hv_cpuid_entries; + const struct kvm_cpuid2 *hv_cpuid_entries; struct kvm_vcpu *vcpu; /* Tell stdout not to buffer its content */ @@ -146,7 +146,7 @@ int main(int argc, char *argv[]) hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); test_hv_cpuid(hv_cpuid_entries, false); - free(hv_cpuid_entries); + free((void *)hv_cpuid_entries); if (!kvm_cpu_has(X86_FEATURE_VMX) || !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { @@ -156,7 +156,7 @@ int main(int argc, char *argv[]) vcpu_enable_evmcs(vcpu); hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); test_hv_cpuid(hv_cpuid_entries, true); - free(hv_cpuid_entries); + free((void *)hv_cpuid_entries); do_sys: /* Test system ioctl version */ diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 530a75fee92c..b4c4631891d5 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -384,7 +384,7 @@ static void test_pmu_config_disable(void (*guest_code)(void)) * counter per logical processor, an EBX bit vector of length greater * than 5, and EBX[5] clear. 
*/ -static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry) +static bool check_intel_pmu_leaf(const struct kvm_cpuid_entry2 *entry) { union cpuid10_eax eax = { .full = entry->eax }; union cpuid10_ebx ebx = { .full = entry->ebx }; @@ -400,10 +400,10 @@ static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry) */ static bool use_intel_pmu(void) { - struct kvm_cpuid_entry2 *entry; + const struct kvm_cpuid_entry2 *entry; entry = kvm_get_supported_cpuid_index(0xa, 0); - return is_intel_cpu() && entry && check_intel_pmu_leaf(entry); + return is_intel_cpu() && check_intel_pmu_leaf(entry); } static bool is_zen1(uint32_t eax) @@ -432,10 +432,10 @@ static bool is_zen3(uint32_t eax) */ static bool use_amd_pmu(void) { - struct kvm_cpuid_entry2 *entry; + const struct kvm_cpuid_entry2 *entry; entry = kvm_get_supported_cpuid_index(1, 0); - return is_amd_cpu() && entry && + return is_amd_cpu() && (is_zen1(entry->eax) || is_zen2(entry->eax) || is_zen3(entry->eax)); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index dc3869d5aff0..689517f2aae6 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -53,7 +53,7 @@ static void guest_code(void) int main(int argc, char *argv[]) { - struct kvm_cpuid_entry2 *entry_a_0; + const struct kvm_cpuid_entry2 *entry_a_0; struct kvm_vm *vm; struct kvm_vcpu *vcpu; int ret; -- cgit v1.2.3 From 8fe09d6a91be94be6910d843bb21d60b1fc99cd2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:56 +0000 Subject: KVM: selftests: Set input function/index in raw CPUID helper(s) Set the function/index for CPUID in the helper instead of relying on the caller to do so. In addition to reducing the risk of consuming an uninitialized ECX, having the function/index embedded in the call makes it easier to understand what is being checked. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-32-seanjc@google.com --- .../testing/selftests/kvm/include/x86_64/processor.h | 16 +++++++++++++--- tools/testing/selftests/kvm/lib/x86_64/processor.c | 13 ++++--------- tools/testing/selftests/kvm/x86_64/amx_test.c | 19 ++++--------------- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 11 +++++------ 4 files changed, 26 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index d1c0cff24779..47e74beda155 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -404,10 +404,13 @@ static inline void outl(uint16_t port, uint32_t value) __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); } -static inline void cpuid(uint32_t *eax, uint32_t *ebx, - uint32_t *ecx, uint32_t *edx) +static inline void __cpuid(uint32_t function, uint32_t index, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) { - /* ecx is often an input as well as an output. 
*/ + *eax = function; + *ecx = index; + asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), @@ -417,6 +420,13 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, : "memory"); } +static inline void cpuid(uint32_t function, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + return __cpuid(function, 0, eax, ebx, ecx, edx); +} + #define SET_XMM(__var, __xmm) \ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 8677c6063388..5f62b58433d3 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1301,9 +1301,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) /* Before family 17h, the HyperTransport area is just below 1T. */ ht_gfn = (1 << 28) - num_ht_pages; - eax = 1; - ecx = 0; - cpuid(&eax, &ebx, &ecx, &edx); + cpuid(1, &eax, &ebx, &ecx, &edx); if (x86_family(eax) < 0x17) goto done; @@ -1312,18 +1310,15 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use * the old conservative value if MAXPHYADDR is not enumerated. */ - eax = 0x80000000; - cpuid(&eax, &ebx, &ecx, &edx); + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); max_ext_leaf = eax; if (max_ext_leaf < 0x80000008) goto done; - eax = 0x80000008; - cpuid(&eax, &ebx, &ecx, &edx); + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1; if (max_ext_leaf >= 0x8000001f) { - eax = 0x8000001f; - cpuid(&eax, &ebx, &ecx, &edx); + cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); max_pfn >>= (ebx >> 6) & 0x3f; } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index c8f98331a807..cb5e20936cc1 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -122,9 +122,7 @@ static inline void check_cpuid_xsave(void) { uint32_t eax, ebx, ecx, edx; - eax = 1; - ecx = 0; - cpuid(&eax, &ebx, &ecx, &edx); + cpuid(1, &eax, &ebx, &ecx, &edx); if (!(ecx & CPUID_XSAVE)) GUEST_ASSERT(!"cpuid: no CPU xsave support!"); if (!(ecx & CPUID_OSXSAVE)) @@ -140,10 +138,7 @@ static bool enum_xtile_config(void) { u32 eax, ebx, ecx, edx; - eax = TILE_CPUID; - ecx = TILE_PALETTE_CPUID_SUBLEAVE; - - cpuid(&eax, &ebx, &ecx, &edx); + __cpuid(TILE_CPUID, TILE_PALETTE_CPUID_SUBLEAVE, &eax, &ebx, &ecx, &edx); if (!eax || !ebx || !ecx) return false; @@ -165,10 +160,7 @@ static bool enum_xsave_tile(void) { u32 eax, ebx, ecx, edx; - eax = XSTATE_CPUID; - ecx = XFEATURE_XTILEDATA; - - cpuid(&eax, &ebx, &ecx, &edx); + __cpuid(XSTATE_CPUID, XFEATURE_XTILEDATA, &eax, &ebx, &ecx, &edx); if (!eax || !ebx) return false; @@ -183,10 +175,7 @@ static bool check_xsave_size(void) u32 eax, ebx, ecx, edx; bool valid = false; - eax = XSTATE_CPUID; - ecx = XSTATE_USER_STATE_SUBLEAVE; - - cpuid(&eax, &ebx, &ecx, &edx); + __cpuid(XSTATE_CPUID, XSTATE_USER_STATE_SUBLEAVE, &eax, &ebx, &ecx, &edx); if (ebx && ebx <= XSAVE_SIZE) valid = true; diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index d8ae4a0e00a4..a6aeee2e62e4 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -31,10 +31,9 @@ static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) u32 eax, ebx, ecx, edx; for (i = 0; i < guest_cpuid->nent; i++) { - eax = 
guest_cpuid->entries[i].function; - ecx = guest_cpuid->entries[i].index; - - cpuid(&eax, &ebx, &ecx, &edx); + __cpuid(guest_cpuid->entries[i].function, + guest_cpuid->entries[i].index, + &eax, &ebx, &ecx, &edx); GUEST_ASSERT(eax == guest_cpuid->entries[i].eax && ebx == guest_cpuid->entries[i].ebx && @@ -46,9 +45,9 @@ static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid) { - u32 eax = 0x40000000, ebx, ecx = 0, edx; + u32 eax, ebx, ecx, edx; - cpuid(&eax, &ebx, &ecx, &edx); + cpuid(0x40000000, &eax, &ebx, &ecx, &edx); GUEST_ASSERT(eax == 0x40000001); } -- cgit v1.2.3 From 48ce3ed052e8336a17c64ca1787648d9e17c54d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:57 +0000 Subject: KVM: selftests: Add this_cpu_has() to query X86_FEATURE_* via cpuid() Add this_cpu_has() to query an X86_FEATURE_* via cpuid(), i.e. to query a feature from L1 (or L2) guest code. Arbitrarily select the AMX test to be the first user. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-33-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 12 +++++++++++- tools/testing/selftests/kvm/x86_64/amx_test.c | 9 ++------- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 47e74beda155..b065d6cadba1 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -161,7 +161,6 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) /* CPUID.1.ECX */ -#define CPUID_XSAVE (1ul << 26) #define CPUID_OSXSAVE (1ul << 27) /* Page table bitfield declarations */ @@ -427,6 +426,17 @@ static inline void cpuid(uint32_t function, return __cpuid(function, 0, eax, ebx, ecx, edx); } +static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature) +{ + uint32_t gprs[4]; + + __cpuid(feature.function, feature.index, + &gprs[KVM_CPUID_EAX], &gprs[KVM_CPUID_EBX], + &gprs[KVM_CPUID_ECX], &gprs[KVM_CPUID_EDX]); + + return gprs[feature.reg] & BIT(feature.bit); +} + #define SET_XMM(__var, __xmm) \ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index cb5e20936cc1..c867f10532af 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -120,13 +120,8 @@ static inline void __xsavec(struct xsave_data *data, uint64_t rfbm) static inline void check_cpuid_xsave(void) { - uint32_t eax, ebx, ecx, edx; - - cpuid(1, &eax, &ebx, &ecx, &edx); - if (!(ecx & CPUID_XSAVE)) - GUEST_ASSERT(!"cpuid: no CPU xsave support!"); - if (!(ecx & CPUID_OSXSAVE)) - GUEST_ASSERT(!"cpuid: no OS xsave support!"); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE)); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); } static bool check_xsave_supports_xtile(void) -- cgit v1.2.3 From 2b424a76d02cf6f8d94eeb253b1c0e4327b1718a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:58 +0000 Subject: KVM: selftests: Use this_cpu_has() in CR4/CPUID sync test Use this_cpu_has() to query OSXSAVE from the L1 guest in the CR4=>CPUID sync test. 
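The sync check then becomes a single comparison; a sketch mirroring the cr4_cpuid_sync_test.c hunk below:

	/* CPUID.1:ECX[27] (OSXSAVE) must track CR4.OSXSAVE. */
	return this_cpu_has(X86_FEATURE_OSXSAVE) == !!(get_cr4() & X86_CR4_OSXSAVE);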
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-34-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 3 --- tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 14 ++------------ 2 files changed, 2 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index b065d6cadba1..8ce421471c4c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -160,9 +160,6 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_KVM_HC_MAP_GPA_RANGE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 16) #define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) -/* CPUID.1.ECX */ -#define CPUID_OSXSAVE (1ul << 27) - /* Page table bitfield declarations */ #define PTE_PRESENT_MASK BIT_ULL(0) #define PTE_WRITABLE_MASK BIT_ULL(1) diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 611febdc2128..4208487652f8 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -21,19 +21,9 @@ static inline bool cr4_cpuid_is_sync(void) { - int func, subfunc; - uint32_t eax, ebx, ecx, edx; - uint64_t cr4; - - func = 0x1; - subfunc = 0x0; - __asm__ __volatile__("cpuid" - : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) - : "a"(func), "c"(subfunc)); - - cr4 = get_cr4(); + uint64_t cr4 = get_cr4(); - return (!!(ecx & CPUID_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE)); + return (this_cpu_has(X86_FEATURE_OSXSAVE) == !!(cr4 & X86_CR4_OSXSAVE)); } static void guest_code(void) -- cgit v1.2.3 From 05c2b6e5facc4f04e4df0cfedbcc8943318ed983 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:06:59 +0000 Subject: KVM: selftests: Use this_cpu_has() to detect SVM support in L1 Replace an evil open coded instance of querying CPUID from L1 with this_cpu_has(X86_FEATURE_SVM). No functional change intended. 
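Illustrative usage, matching the state_test.c hunk below:

	if (this_cpu_has(X86_FEATURE_SVM))	/* CPUID.0x80000001:ECX[2] */
		svm_l1_guest_code(arg);
	else
		vmx_l1_guest_code(arg);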
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-35-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/svm_util.h | 13 ------------- tools/testing/selftests/kvm/x86_64/smm_test.c | 4 ++-- tools/testing/selftests/kvm/x86_64/state_test.c | 2 +- 3 files changed, 3 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index f48806d26989..a339b537a575 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -13,9 +13,6 @@ #include "svm.h" #include "processor.h" -#define CPUID_SVM_BIT 2 -#define CPUID_SVM BIT_ULL(CPUID_SVM_BIT) - #define SVM_EXIT_EXCP_BASE 0x040 #define SVM_EXIT_HLT 0x078 #define SVM_EXIT_MSR 0x07c @@ -52,16 +49,6 @@ struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); -static inline bool cpu_has_svm(void) -{ - u32 eax = 0x80000001, ecx; - - asm("cpuid" : - "=a" (eax), "=c" (ecx) : "0" (eax) : "ebx", "edx"); - - return ecx & CPUID_SVM; -} - int open_sev_dev_path_or_exit(void); #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 17bb2397ea38..1f136a81858e 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -83,7 +83,7 @@ static void guest_code(void *arg) sync_with_host(4); if (arg) { - if (cpu_has_svm()) { + if (this_cpu_has(X86_FEATURE_SVM)) { generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); } else { @@ -99,7 +99,7 @@ static void guest_code(void *arg) sync_with_host(7); - if (cpu_has_svm()) { + if (this_cpu_has(X86_FEATURE_SVM)) { run_guest(svm->vmcb, svm->vmcb_gpa); run_guest(svm->vmcb, svm->vmcb_gpa); } else { diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 8ab81a3b561f..ea578971fb9f 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -142,7 +142,7 @@ static void __attribute__((__flatten__)) guest_code(void *arg) GUEST_SYNC(2); if (arg) { - if (cpu_has_svm()) + if (this_cpu_has(X86_FEATURE_SVM)) svm_l1_guest_code(arg); else vmx_l1_guest_code(arg); -- cgit v1.2.3 From 446ab76a0f7addb196c154636eb04c478ccecdcf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:00 +0000 Subject: KVM: selftests: Drop unnecessary use of kvm_get_supported_cpuid_index() Use kvm_get_supported_cpuid_entry() instead of kvm_get_supported_cpuid_index() when passing in '0' for the index, which just so happens to be the case in all remaining users of kvm_get_supported_cpuid_index() except kvm_get_supported_cpuid_entry(). Keep the helper as there may be users in the future, and it's not doing any harm. 
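For example (taken from the pmu_event_filter_test.c hunk below; index 0 is implied by the _entry() wrapper):

	entry = kvm_get_supported_cpuid_entry(0xa);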
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-36-seanjc@google.com --- tools/testing/selftests/kvm/x86_64/amx_test.c | 2 +- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 4 ++-- tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index c867f10532af..08d5801cf858 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -318,7 +318,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA)); /* Get xsave/restore max size */ - xsave_restore_size = kvm_get_supported_cpuid_index(0xd, 0)->ecx; + xsave_restore_size = kvm_get_supported_cpuid_entry(0xd)->ecx; run = vcpu->run; vcpu_regs_get(vcpu, ®s1); diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index b4c4631891d5..ea4e259a1e2e 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -402,7 +402,7 @@ static bool use_intel_pmu(void) { const struct kvm_cpuid_entry2 *entry; - entry = kvm_get_supported_cpuid_index(0xa, 0); + entry = kvm_get_supported_cpuid_entry(0xa); return is_intel_cpu() && check_intel_pmu_leaf(entry); } @@ -434,7 +434,7 @@ static bool use_amd_pmu(void) { const struct kvm_cpuid_entry2 *entry; - entry = kvm_get_supported_cpuid_index(1, 0); + entry = kvm_get_supported_cpuid_entry(1); return is_amd_cpu() && (is_zen1(entry->eax) || is_zen2(entry->eax) || diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c index 689517f2aae6..6ec901dab61e 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -69,7 +69,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM)); TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xa); - entry_a_0 = kvm_get_supported_cpuid_index(0xa, 0); + entry_a_0 = kvm_get_supported_cpuid_entry(0xa); eax.full = entry_a_0->eax; __TEST_REQUIRE(eax.split.version_id, "PMU is not supported by the vCPU"); -- cgit v1.2.3 From 28e09d321035149b881ceb3253d0a6729b91d506 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:01 +0000 Subject: KVM: selftests: Rename kvm_get_supported_cpuid_index() to __..._entry() Rename kvm_get_supported_cpuid_index() to __kvm_get_supported_cpuid_entry() to better show its relationship to kvm_get_supported_cpuid_entry(), and because the helper returns a CPUID entry, not the index of an entry. No functional change intended. 
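After the rename the pair reads naturally; a sketch with hypothetical leaf/index values:

	e = __kvm_get_supported_cpuid_entry(0xd, 1);	/* explicit index */
	e = kvm_get_supported_cpuid_entry(0xd);		/* index 0 implied */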
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-37-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 8ce421471c4c..ba5cd86a8565 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -699,15 +699,15 @@ static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu, vcpu_set_or_clear_cpuid_feature(vcpu, feature, false); } -static inline const struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_index(uint32_t function, - uint32_t index) +static inline const struct kvm_cpuid_entry2 *__kvm_get_supported_cpuid_entry(uint32_t function, + uint32_t index) { return get_cpuid_entry(kvm_get_supported_cpuid(), function, index); } static inline const struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_entry(uint32_t function) { - return kvm_get_supported_cpuid_index(function, 0); + return __kvm_get_supported_cpuid_entry(function, 0); } uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index); -- cgit v1.2.3 From d04019274d13301b413c28b85f1b1775a9d63062 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:02 +0000 Subject: KVM: selftests: Inline "get max CPUID leaf" helpers Make the "get max CPUID leaf" helpers static inline, there's no reason to bury the one liners in processor.c. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-38-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 11 +++++++++-- tools/testing/selftests/kvm/lib/x86_64/processor.c | 10 ---------- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index ba5cd86a8565..ecf2ace952ab 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -721,9 +721,16 @@ static inline void vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); } +static inline uint32_t kvm_get_cpuid_max_basic(void) +{ + return kvm_get_supported_cpuid_entry(0)->eax; +} + +static inline uint32_t kvm_get_cpuid_max_extended(void) +{ + return kvm_get_supported_cpuid_entry(0x80000000)->eax; +} -uint32_t kvm_get_cpuid_max_basic(void); -uint32_t kvm_get_cpuid_max_extended(void); void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); bool vm_is_unrestricted_guest(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 5f62b58433d3..e2894b5bcfd1 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1056,16 +1056,6 @@ bool is_amd_cpu(void) return cpu_vendor_string_is("AuthenticAMD"); } -uint32_t kvm_get_cpuid_max_basic(void) -{ - return kvm_get_supported_cpuid_entry(0)->eax; -} - -uint32_t kvm_get_cpuid_max_extended(void) -{ - return kvm_get_supported_cpuid_entry(0x80000000)->eax; -} - void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) { const struct kvm_cpuid_entry2 *entry; -- cgit v1.2.3 From 7fbb653e01fdd45b377674a9e77da473d49afd38 Mon Sep 17 00:00:00 2001 
From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:03 +0000 Subject: KVM: selftests: Check KVM's supported CPUID, not host CPUID, for XFD Use kvm_cpu_has() to check for XFD support in vm_xsave_req_perm(); simply checking host CPUID doesn't guarantee KVM supports AMX/XFD. Opportunistically hoist the check above the bit check; if XFD isn't supported, it's far better to get a "not supported at all" message, as opposed to a "feature X isn't supported" message. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-39-seanjc@google.com --- .../testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/x86_64/processor.c | 19 ++----------------- 2 files changed, 3 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index ecf2ace952ab..559840e47f01 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -117,6 +117,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_XTILECFG KVM_X86_CPU_FEATURE(0xD, 0, EAX, 17) #define X86_FEATURE_XTILEDATA KVM_X86_CPU_FEATURE(0xD, 0, EAX, 18) #define X86_FEATURE_XSAVES KVM_X86_CPU_FEATURE(0xD, 1, EAX, 3) +#define X86_FEATURE_XFD KVM_X86_CPU_FEATURE(0xD, 1, EAX, 4) /* * Extended Leafs, a.k.a. AMD defined diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index e2894b5bcfd1..6cb93bbd6130 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -578,21 +578,6 @@ static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) vcpu_sregs_set(vcpu, &sregs); } -#define CPUID_XFD_BIT (1 << 4) -static bool is_xfd_supported(void) -{ - int eax, ebx, ecx, edx; - const int leaf = 0xd, subleaf = 0x1; - - __asm__ __volatile__( - "cpuid" - : /* output */ "=a"(eax), "=b"(ebx), - "=c"(ecx), "=d"(edx) - : /* input */ "0"(leaf), "2"(subleaf)); - - return !!(eax & CPUID_XFD_BIT); -} - void vm_xsave_req_perm(int bit) { int kvm_fd; @@ -604,6 +589,8 @@ void vm_xsave_req_perm(int bit) .addr = (unsigned long) &bitmask }; + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); + kvm_fd = open_kvm_dev_path_or_exit(); rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); close(kvm_fd); @@ -614,8 +601,6 @@ void vm_xsave_req_perm(int bit) TEST_REQUIRE(bitmask & (1ULL << bit)); - TEST_REQUIRE(is_xfd_supported()); - rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); /* -- cgit v1.2.3 From d4c94ee8121cce64f2882f3bdcc346dc3d01bcdd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:04 +0000 Subject: KVM: selftests: Skip AMX test if ARCH_REQ_XCOMP_GUEST_PERM isn't supported Skip the AMX test instead of silently returning if the host kernel doesn't support ARCH_REQ_XCOMP_GUEST_PERM. KVM didn't support XFD until v5.17, so it's extremely unlikely allowing the test to run on a pre-v5.15 kernel is the right thing to do. 
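A sketch of the tightened check (the TEST_REQUIRE() line is taken from the hunk below):

	/* Skip, rather than silently pass, when the prctl is unavailable. */
	TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit));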
Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-40-seanjc@google.com --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 6cb93bbd6130..912da2100f90 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -601,14 +601,7 @@ void vm_xsave_req_perm(int bit) TEST_REQUIRE(bitmask & (1ULL << bit)); - rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); - - /* - * The older kernel version(<5.15) can't support - * ARCH_REQ_XCOMP_GUEST_PERM and directly return. - */ - if (rc) - return; + TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit)); rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); -- cgit v1.2.3 From 090cd45b21cd0d26297315d4bd9b83d9dcad10e1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:05 +0000 Subject: KVM: selftests: Clean up requirements for XFD-aware XSAVE features Provide informative error messages for the various checks related to requesting access to XSAVE features that are buried behind XSAVE Feature Disabling (XFD). Opportunistically rename the helper to have "require" in the name so that it's somewhat obvious that the helper may skip the test. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-41-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/processor.h | 5 ++++- tools/testing/selftests/kvm/lib/x86_64/processor.c | 8 +++++--- tools/testing/selftests/kvm/x86_64/amx_test.c | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 559840e47f01..4060fe954d53 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -832,7 +832,10 @@ void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); -void vm_xsave_req_perm(int bit); +void __vm_xsave_require_permission(int bit, const char *name); + +#define vm_xsave_require_permission(perm) \ + __vm_xsave_require_permission(perm, #perm) enum pg_level { PG_LEVEL_NONE, diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 912da2100f90..92a047cd1cd5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -578,7 +578,7 @@ static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) vcpu_sregs_set(vcpu, &sregs); } -void vm_xsave_req_perm(int bit) +void __vm_xsave_require_permission(int bit, const char *name) { int kvm_fd; u64 bitmask; @@ -596,10 +596,12 @@ void vm_xsave_req_perm(int bit) close(kvm_fd); if (rc == -1 && (errno == ENXIO || errno == EINVAL)) - exit(KSFT_SKIP); + __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported"); + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); - TEST_REQUIRE(bitmask & (1ULL << bit)); + __TEST_REQUIRE(bitmask & (1ULL << bit), + "Required XSAVE feature '%s' not supported", name); 
TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit)); diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 08d5801cf858..dadcbad10a1d 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -307,7 +307,7 @@ int main(int argc, char *argv[]) u32 amx_offset; int stage, ret; - vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT); + vm_xsave_require_permission(XSTATE_XTILE_DATA_BIT); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); -- cgit v1.2.3 From 12a985aeb40691a27befb0ae99707d0322b4bd8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:06 +0000 Subject: KVM: selftests: Use the common cpuid() helper in cpu_vendor_string_is() Use cpuid() to get CPUID.0x0 in cpu_vendor_string_is(), thus eliminating the last open coded usage of CPUID (ignoring debug_regs.c, which emits CPUID from the guest to trigger a VM-Exit and doesn't actually care about the results of CPUID). Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-42-seanjc@google.com --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 92a047cd1cd5..f35626df1dea 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1011,15 +1011,9 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state) static bool cpu_vendor_string_is(const char *vendor) { const uint32_t *chunk = (const uint32_t *)vendor; - int eax, ebx, ecx, edx; - const int leaf = 0; - - __asm__ __volatile__( - "cpuid" - : /* output */ "=a"(eax), "=b"(ebx), - "=c"(ecx), "=d"(edx) - : /* input */ "0"(leaf), "2"(0)); + uint32_t eax, ebx, ecx, edx; + cpuid(0, &eax, &ebx, &ecx, &edx); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); } -- cgit v1.2.3 From 3d5f8d03786fee6aa7a4c59446c5356775aeb4d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 14 Jun 2022 20:07:07 +0000 Subject: KVM: selftests: Drop unused SVM_CPUID_FUNC macro Drop SVM_CPUID_FUNC to reduce the probability of tests open coding CPUID checks instead of using kvm_cpu_has() or this_cpu_has(). Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220614200707.3315957-43-seanjc@google.com --- tools/testing/selftests/kvm/include/x86_64/svm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index 2225e5077350..c8343ff84f7f 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -218,8 +218,6 @@ struct __attribute__ ((__packed__)) vmcb { struct vmcb_save_area save; }; -#define SVM_CPUID_FUNC 0x8000000a - #define SVM_VM_CR_SVM_DISABLE 4 #define SVM_SELECTOR_S_SHIFT 4 -- cgit v1.2.3 From bf3f00378524adae16628cbadbd11ba7211863bb Mon Sep 17 00:00:00 2001 From: Anquan Wu Date: Tue, 12 Jul 2022 11:15:40 +0800 Subject: libbpf: Fix the name of a reused map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF map name is limited to BPF_OBJ_NAME_LEN. 
If a map name is longer than BPF_OBJ_NAME_LEN, it will be truncated to BPF_OBJ_NAME_LEN when a userspace program calls libbpf to create the map. A pinned map also generates a path under /sys. If a later program wants to reuse the map, it cannot look the map up by name, because the map's name only partially matches the name derived from the pinned path. The syscall trace below shows that the map name "process_pinned_map" is truncated to "process_pinned_". bpf(BPF_OBJ_GET, {pathname="/sys/fs/bpf/process_pinned_map", bpf_fd=0, file_flags=0}, 144) = -1 ENOENT (No such file or directory) bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=4, value_size=4,max_entries=1024, map_flags=0, inner_map_fd=0, map_name="process_pinned_",map_ifindex=0, btf_fd=3, btf_key_type_id=6, btf_value_type_id=10,btf_vmlinux_value_type_id=0}, 72) = 4 This patch checks whether the name of the pinned map matches the actual map name in its first (BPF_OBJ_NAME_LEN - 1) characters; if so, the BPF map keeps using the name from the BPF object. Fixes: 26736eb9a483 ("tools: libbpf: allow map reuse") Signed-off-by: Anquan Wu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/OSZP286MB1725CEA1C95C5CB8E7CCC53FB8869@OSZP286MB1725.JPNP286.PROD.OUTLOOK.COM --- tools/lib/bpf/libbpf.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 72548798126b..68da1aca406c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4232,7 +4232,7 @@ int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) int bpf_map__reuse_fd(struct bpf_map *map, int fd) { struct bpf_map_info info = {}; - __u32 len = sizeof(info); + __u32 len = sizeof(info), name_len; int new_fd, err; char *new_name; @@ -4242,7 +4242,12 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) if (err) return libbpf_err(err); - new_name = strdup(info.name); + name_len = strlen(info.name); + if (name_len == BPF_OBJ_NAME_LEN - 1 && strncmp(map->name, info.name, name_len) == 0) + new_name = strdup(map->name); + else + new_name = strdup(info.name); + if (!new_name) return libbpf_err(-errno); -- cgit v1.2.3 From 94bf6aad5dbed1c93618035ec31b37927538c276 Mon Sep 17 00:00:00 2001 From: Linkui Xiao Date: Thu, 14 Jul 2022 09:56:47 +0800 Subject: selftests/bpf: Return true/false (not 1/0) from bool functions Return boolean values ("true" or "false") instead of 1 or 0 from bool functions. 
This fixes the following warnings from coccicheck: tools/testing/selftests/bpf/progs/test_xdp_noinline.c:407:9-10: WARNING: return of 0/1 in function 'decap_v4' with return type bool tools/testing/selftests/bpf/progs/test_xdp_noinline.c:389:9-10: WARNING: return of 0/1 in function 'decap_v6' with return type bool tools/testing/selftests/bpf/progs/test_xdp_noinline.c:290:9-10: WARNING: return of 0/1 in function 'encap_v6' with return type bool tools/testing/selftests/bpf/progs/test_xdp_noinline.c:264:9-10: WARNING: return of 0/1 in function 'parse_tcp' with return type bool tools/testing/selftests/bpf/progs/test_xdp_noinline.c:242:9-10: WARNING: return of 0/1 in function 'parse_udp' with return type bool Generated by: scripts/coccinelle/misc/boolreturn.cocci Suggested-by: Stanislav Fomichev Signed-off-by: Linkui Xiao Signed-off-by: Andrii Nakryiko Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20220714015647.25074-1-xiaolinkui@kylinos.cn --- .../selftests/bpf/progs/test_xdp_noinline.c | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 125d872d7981..ba48fcb98ab2 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -239,7 +239,7 @@ bool parse_udp(void *data, void *data_end, udp = data + off; if (udp + 1 > data_end) - return 0; + return false; if (!is_icmp) { pckt->flow.port16[0] = udp->source; pckt->flow.port16[1] = udp->dest; @@ -247,7 +247,7 @@ bool parse_udp(void *data, void *data_end, pckt->flow.port16[0] = udp->dest; pckt->flow.port16[1] = udp->source; } - return 1; + return true; } static __attribute__ ((noinline)) @@ -261,7 +261,7 @@ bool parse_tcp(void *data, void *data_end, tcp = data + off; if (tcp + 1 > data_end) - return 0; + return false; if (tcp->syn) pckt->flags |= (1 << 1); if (!is_icmp) { @@ -271,7 +271,7 @@ bool parse_tcp(void *data, void *data_end, pckt->flow.port16[0] = tcp->dest; pckt->flow.port16[1] = tcp->source; } - return 1; + return true; } static __attribute__ ((noinline)) @@ -287,7 +287,7 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, void *data; if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr))) - return 0; + return false; data = (void *)(long)xdp->data; data_end = (void *)(long)xdp->data_end; new_eth = data; @@ -295,7 +295,7 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, old_eth = data + sizeof(struct ipv6hdr); if (new_eth + 1 > data_end || old_eth + 1 > data_end || ip6h + 1 > data_end) - return 0; + return false; memcpy(new_eth->eth_dest, cval->mac, 6); memcpy(new_eth->eth_source, old_eth->eth_dest, 6); new_eth->eth_proto = 56710; @@ -314,7 +314,7 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, ip6h->saddr.in6_u.u6_addr32[2] = 3; ip6h->saddr.in6_u.u6_addr32[3] = ip_suffix; memcpy(ip6h->daddr.in6_u.u6_addr32, dst->dstv6, 16); - return 1; + return true; } static __attribute__ ((noinline)) @@ -335,7 +335,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, ip_suffix <<= 15; ip_suffix ^= pckt->flow.src; if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr))) - return 0; + return false; data = (void *)(long)xdp->data; data_end = (void *)(long)xdp->data_end; new_eth = data; @@ -343,7 +343,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, old_eth = data + sizeof(struct iphdr); if (new_eth + 1 > data_end || old_eth + 1 > 
data_end || iph + 1 > data_end) - return 0; + return false; memcpy(new_eth->eth_dest, cval->mac, 6); memcpy(new_eth->eth_source, old_eth->eth_dest, 6); new_eth->eth_proto = 8; @@ -367,8 +367,8 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, csum += *next_iph_u16++; iph->check = ~((csum & 0xffff) + (csum >> 16)); if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr))) - return 0; - return 1; + return false; + return true; } static __attribute__ ((noinline)) @@ -386,10 +386,10 @@ bool decap_v6(struct xdp_md *xdp, void **data, void **data_end, bool inner_v4) else new_eth->eth_proto = 56710; if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct ipv6hdr))) - return 0; + return false; *data = (void *)(long)xdp->data; *data_end = (void *)(long)xdp->data_end; - return 1; + return true; } static __attribute__ ((noinline)) @@ -404,10 +404,10 @@ bool decap_v4(struct xdp_md *xdp, void **data, void **data_end) memcpy(new_eth->eth_dest, old_eth->eth_dest, 6); new_eth->eth_proto = 8; if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr))) - return 0; + return false; *data = (void *)(long)xdp->data; *data_end = (void *)(long)xdp->data_end; - return 1; + return true; } static __attribute__ ((noinline)) -- cgit v1.2.3 From cd72e61bad145a0968df85193dcf1261cb66c4c6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 Jul 2022 13:48:53 +0200 Subject: selftests/net: test nexthop without gw This test implement the scenario described in the commit "ip: fix dflt addr selection for connected nexthop". The test configures a nexthop object with an output device only (no gateway address) and a route that uses this nexthop. The goal is to check if the kernel selects a valid source address. Link: https://lore.kernel.org/netdev/20220712095545.10947-1-nicolas.dichtel@6wind.com/ Signed-off-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20220713114853.29406-2-nicolas.dichtel@6wind.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/fib_nexthop_nongw.sh | 119 +++++++++++++++++++++++ 2 files changed, 120 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/fib_nexthop_nongw.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ddad703ace34..db05b3764b77 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -11,7 +11,7 @@ TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh -TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh +TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh fib_nexthop_nongw.sh TEST_PROGS += altnames.sh icmp.sh icmp_redirect.sh ip6_gre_headroom.sh TEST_PROGS += route_localnet.sh TEST_PROGS += reuseaddr_ports_exhausted.sh diff --git a/tools/testing/selftests/net/fib_nexthop_nongw.sh b/tools/testing/selftests/net/fib_nexthop_nongw.sh new file mode 100755 index 000000000000..b7b928b38ce4 --- /dev/null +++ b/tools/testing/selftests/net/fib_nexthop_nongw.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# ns: h1 | ns: h2 +# 192.168.0.1/24 | +# eth0 | +# | 192.168.1.1/32 +# veth0 <---|---> veth1 +# Validate source address selection for route without gateway + +PAUSE_ON_FAIL=no +VERBOSE=0 
+ret=0 + +################################################################################ +# helpers + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + [ "$VERBOSE" = "1" ] && echo +} + +run_cmd() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + out=$(eval $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" -a -n "$out" ]; then + echo "$out" + fi + + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +################################################################################ +# config +setup() +{ + ip netns add h1 + ip -n h1 link set lo up + ip netns add h2 + ip -n h2 link set lo up + + # Add a fake eth0 to support an ip address + ip -n h1 link add name eth0 type dummy + ip -n h1 link set eth0 up + ip -n h1 address add 192.168.0.1/24 dev eth0 + + # Configure veths (same @mac, arp off) + ip -n h1 link add name veth0 type veth peer name veth1 netns h2 + ip -n h1 link set veth0 up + + ip -n h2 link set veth1 up + + # Configure @IP in the peer netns + ip -n h2 address add 192.168.1.1/32 dev veth1 + ip -n h2 route add default dev veth1 + + # Add a nexthop without @gw and use it in a route + ip -n h1 nexthop add id 1 dev veth0 + ip -n h1 route add 192.168.1.1 nhid 1 +} + +cleanup() +{ + ip netns del h1 2>/dev/null + ip netns del h2 2>/dev/null +} + +trap cleanup EXIT + +################################################################################ +# main + +while getopts :pv o +do + case $o in + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=1;; + esac +done + +cleanup +setup + +run_cmd ip -netns h1 route get 192.168.1.1 +log_test $? 0 "nexthop: get route with nexthop without gw" +run_cmd ip netns exec h1 ping -c1 192.168.1.1 +log_test $? 
0 "nexthop: ping through nexthop without gw" + +exit $ret -- cgit v1.2.3 From 9a162977d20436be5678a8e21a8e58eb4616d86a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:13 +0200 Subject: selftests: timers: valid-adjtimex: build fix for newer toolchains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toolchains with an include file 'sys/timex.h' based on 3.18 will have a 'clock_adjtime' definition added, so it can't be static in the code: valid-adjtimex.c:43:12: error: static declaration of ‘clock_adjtime’ follows non-static declaration Fixes: e03a58c320e1 ("kselftests: timers: Add adjtimex SETOFFSET validity tests") Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/valid-adjtimex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 5397de708d3c..48b9a803235a 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -40,7 +40,7 @@ #define ADJ_SETOFFSET 0x0100 #include -static int clock_adjtime(clockid_t id, struct timex *tx) +int clock_adjtime(clockid_t id, struct timex *tx) { return syscall(__NR_clock_adjtime, id, tx); } -- cgit v1.2.3 From a8d74fe7fed55fe53066dca66708273f3dd9bb2f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:14 +0200 Subject: selftests: timers: fix declarations of main() Mixing up argc/argv went unnoticed because they were not used. Still, this is worth fixing. Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/adjtick.c | 2 +- tools/testing/selftests/timers/change_skew.c | 2 +- tools/testing/selftests/timers/raw_skew.c | 2 +- tools/testing/selftests/timers/skew_consistency.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c index 54d8d87f36b3..47e05fdc32c5 100644 --- a/tools/testing/selftests/timers/adjtick.c +++ b/tools/testing/selftests/timers/adjtick.c @@ -165,7 +165,7 @@ int check_tick_adj(long tickval) return 0; } -int main(int argv, char **argc) +int main(int argc, char **argv) { struct timespec raw; long tick, max, interval, err; diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c index c4eab7124990..992a77f2a74c 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -55,7 +55,7 @@ int change_skew_test(int ppm) } -int main(int argv, char **argc) +int main(int argc, char **argv) { struct timex tx; int i, ret; diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index b41d8dd0c40c..5beceeed0d11 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -89,7 +89,7 @@ void get_monotonic_and_raw(struct timespec *mon, struct timespec *raw) } } -int main(int argv, char **argc) +int main(int argc, char **argv) { struct timespec mon, raw, start, end; long long delta1, delta2, interval, eppm, ppm; diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index 8066be9aff11..63913f75b384 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ 
b/tools/testing/selftests/timers/skew_consistency.c @@ -38,7 +38,7 @@ #define NSEC_PER_SEC 1000000000LL -int main(int argv, char **argc) +int main(int argc, char **argv) { struct timex tx; int ret, ppm; -- cgit v1.2.3 From 2d87048acb53e876a634a3f5cb15718b22a109ad Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:15 +0200 Subject: selftests: timers: nanosleep: adapt to kselftest framework So we have proper counters at the end of a test, e.g.: # Totals: pass:4 fail:0 xfail:0 xpass:0 skip:8 error:0 Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/nanosleep.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c index 71b5441c2fd9..df1d03516e7b 100644 --- a/tools/testing/selftests/timers/nanosleep.c +++ b/tools/testing/selftests/timers/nanosleep.c @@ -133,33 +133,37 @@ int main(int argc, char **argv) long long length; int clockid, ret; + ksft_print_header(); + ksft_set_plan(NR_CLOCKIDS); + for (clockid = CLOCK_REALTIME; clockid < NR_CLOCKIDS; clockid++) { /* Skip cputime clockids since nanosleep won't increment cputime */ if (clockid == CLOCK_PROCESS_CPUTIME_ID || clockid == CLOCK_THREAD_CPUTIME_ID || - clockid == CLOCK_HWSPECIFIC) + clockid == CLOCK_HWSPECIFIC) { + ksft_test_result_skip("%-31s\n", clockstring(clockid)); continue; + } - printf("Nanosleep %-31s ", clockstring(clockid)); fflush(stdout); length = 10; while (length <= (NSEC_PER_SEC * 10)) { ret = nanosleep_test(clockid, length); if (ret == UNSUPPORTED) { - printf("[UNSUPPORTED]\n"); + ksft_test_result_skip("%-31s\n", clockstring(clockid)); goto next; } if (ret < 0) { - printf("[FAILED]\n"); - return ksft_exit_fail(); + ksft_test_result_fail("%-31s\n", clockstring(clockid)); + ksft_exit_fail(); } length *= 100; } - printf("[OK]\n"); + ksft_test_result_pass("%-31s\n", clockstring(clockid)); next: ret = 0; } - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From 04fd937eb652fd9b908f885a373960ebad5cd72b Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:16 +0200 Subject: selftests: timers: inconsistency-check: adapt to kselftest framework So we have proper counters at the end of a test, e.g.: # Totals: pass:11 fail:0 xfail:0 xpass:0 skip:1 error:0 Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- .../testing/selftests/timers/inconsistency-check.c | 32 ++++++++++++---------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c index e6756d9c60a7..36a49fba6c9b 100644 --- a/tools/testing/selftests/timers/inconsistency-check.c +++ b/tools/testing/selftests/timers/inconsistency-check.c @@ -122,30 +122,28 @@ int consistency_test(int clock_type, unsigned long seconds) if (inconsistent >= 0) { unsigned long long delta; - printf("\%s\n", start_str); + ksft_print_msg("\%s\n", start_str); for (i = 0; i < CALLS_PER_LOOP; i++) { if (i == inconsistent) - printf("--------------------\n"); - printf("%lu:%lu\n", list[i].tv_sec, + ksft_print_msg("--------------------\n"); + ksft_print_msg("%lu:%lu\n", list[i].tv_sec, list[i].tv_nsec); if (i == inconsistent + 1) - printf("--------------------\n"); + ksft_print_msg("--------------------\n"); } delta = list[inconsistent].tv_sec * NSEC_PER_SEC; delta += list[inconsistent].tv_nsec; 
delta -= list[inconsistent+1].tv_sec * NSEC_PER_SEC; delta -= list[inconsistent+1].tv_nsec; - printf("Delta: %llu ns\n", delta); + ksft_print_msg("Delta: %llu ns\n", delta); fflush(0); /* timestamp inconsistency*/ t = time(0); - printf("%s\n", ctime(&t)); - printf("[FAILED]\n"); + ksft_print_msg("%s\n", ctime(&t)); return -1; } now = list[0].tv_sec; } - printf("[OK]\n"); return 0; } @@ -178,16 +176,22 @@ int main(int argc, char *argv[]) setbuf(stdout, NULL); + ksft_print_header(); + ksft_set_plan(maxclocks - userclock); + for (clockid = userclock; clockid < maxclocks; clockid++) { - if (clockid == CLOCK_HWSPECIFIC) + if (clockid == CLOCK_HWSPECIFIC || clock_gettime(clockid, &ts)) { + ksft_test_result_skip("%-31s\n", clockstring(clockid)); continue; + } - if (!clock_gettime(clockid, &ts)) { - printf("Consistent %-30s ", clockstring(clockid)); - if (consistency_test(clockid, runtime)) - return ksft_exit_fail(); + if (consistency_test(clockid, runtime)) { + ksft_test_result_fail("%-31s\n", clockstring(clockid)); + ksft_exit_fail(); + } else { + ksft_test_result_pass("%-31s\n", clockstring(clockid)); } } - return ksft_exit_pass(); + ksft_exit_pass(); } -- cgit v1.2.3 From 4d8f52ac5fa9eede7b7aa2f2d67c841d9eeb655f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:17 +0200 Subject: selftests: timers: clocksource-switch: fix passing errors from child The return value from system() is a waitpid-style integer. Do not return it directly because with the implicit masking in exit() it will always return 0. Access it with appropriate macros to really pass on errors. Fixes: 7290ce1423c3 ("selftests/timers: Add clocksource-switch test from timetest suite") Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/clocksource-switch.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index ef8eb3604595..b57f0a9be490 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -110,10 +110,10 @@ int run_tests(int secs) sprintf(buf, "./inconsistency-check -t %i", secs); ret = system(buf); - if (ret) - return ret; + if (WIFEXITED(ret) && WEXITSTATUS(ret)) + return WEXITSTATUS(ret); ret = system("./nanosleep"); - return ret; + return WIFEXITED(ret) ? WEXITSTATUS(ret) : 0; } -- cgit v1.2.3 From 5be1fd963f7989090b16e3c471d0910502fb9bf8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:18 +0200 Subject: selftests: timers: clocksource-switch: sort includes It is easier to check if you need to add an include if the existing ones are sorted. 
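The clocksource-switch fix above hinges on the encoding of the status word that system() returns. As a reference, here is a minimal standalone sketch (not part of any patch in this series) of why the raw status must be unpacked with the wait macros before it can be used as an exit code:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/wait.h>

	/* Run a child command and propagate its real exit code instead of
	 * the raw waitpid-style status word that system() returns.
	 */
	static int run_child(const char *cmd)
	{
		int status = system(cmd);

		if (status == -1)		/* could not spawn a shell */
			return -1;
		if (WIFEXITED(status))		/* child terminated via exit() */
			return WEXITSTATUS(status);
		if (WIFSIGNALED(status))	/* child was killed by a signal */
			fprintf(stderr, "child got signal %d\n", WTERMSIG(status));
		return -1;
	}

	int main(void)
	{
		/* 'false' exits with 1, so the raw status is 0x0100; returning
		 * that through exit() would be masked to 0, hiding the failure.
		 */
		return run_child("false");
	}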
Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/clocksource-switch.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index b57f0a9be490..ed5b71f5b37c 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -23,17 +23,17 @@ */ +#include #include -#include #include +#include +#include #include #include -#include #include -#include -#include -#include #include +#include +#include #include "../kselftest.h" -- cgit v1.2.3 From 19b6823a6e9177ca95068440d622005e4aab5543 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:19 +0200 Subject: selftests: timers: clocksource-switch: add command line switch to skip sanity check The sanity check takes a while. If you do repeated checks when debugging, this is time consuming. Add a parameter to skip it. Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- .../testing/selftests/timers/clocksource-switch.c | 40 +++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index ed5b71f5b37c..5256e6215980 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -119,12 +119,26 @@ int run_tests(int secs) char clocksource_list[10][30]; -int main(int argv, char **argc) +int main(int argc, char **argv) { char orig_clk[512]; - int count, i, status; + int count, i, status, opt; + int do_sanity_check = 1; pid_t pid; + /* Process arguments */ + while ((opt = getopt(argc, argv, "s")) != -1) { + switch (opt) { + case 's': + do_sanity_check = 0; + break; + default: + printf("Usage: %s [-s]\n", argv[0]); + printf(" -s: skip sanity checks\n"); + exit(-1); + } + } + get_cur_clocksource(orig_clk, 512); count = get_clocksources(clocksource_list); @@ -135,19 +149,21 @@ int main(int argv, char **argc) } /* Check everything is sane before we start switching asynchronously */ - for (i = 0; i < count; i++) { - printf("Validating clocksource %s\n", clocksource_list[i]); - if (change_clocksource(clocksource_list[i])) { - status = -1; - goto out; - } - if (run_tests(5)) { - status = -1; - goto out; + if (do_sanity_check) { + for (i = 0; i < count; i++) { + printf("Validating clocksource %s\n", + clocksource_list[i]); + if (change_clocksource(clocksource_list[i])) { + status = -1; + goto out; + } + if (run_tests(5)) { + status = -1; + goto out; + } } } - printf("Running Asynchronous Switching Tests...\n"); pid = fork(); if (!pid) -- cgit v1.2.3 From 248ae6f49a257bcbc57513ebc58e5ed7fa47e7fe Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Jul 2022 20:57:21 +0200 Subject: selftests: timers: clocksource-switch: add 'runtime' command line parameter So the user can decide how long the test should run. 
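The patch below implements this with a plain getopt() loop. For reference, the same boolean-switch pattern in self-contained form (an illustrative sketch mirroring the -s handling added here, not the selftest itself):

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int do_sanity_check = 1;	/* on by default, as in the patch */
		int opt;

		while ((opt = getopt(argc, argv, "s")) != -1) {
			switch (opt) {
			case 's':		/* -s skips the slow check */
				do_sanity_check = 0;
				break;
			default:
				fprintf(stderr, "Usage: %s [-s]\n", argv[0]);
				exit(EXIT_FAILURE);
			}
		}
		printf("sanity check %s\n",
		       do_sanity_check ? "enabled" : "skipped");
		return 0;
	}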
Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/clocksource-switch.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index 5256e6215980..577e4b74211a 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -124,17 +124,22 @@ int main(int argc, char **argv) char orig_clk[512]; int count, i, status, opt; int do_sanity_check = 1; + int runtime = 60; pid_t pid; /* Process arguments */ - while ((opt = getopt(argc, argv, "s")) != -1) { + while ((opt = getopt(argc, argv, "st:")) != -1) { switch (opt) { case 's': do_sanity_check = 0; break; + case 't': + runtime = atoi(optarg); + break; default: - printf("Usage: %s [-s]\n", argv[0]); + printf("Usage: %s [-s] [-t ]\n", argv[0]); printf(" -s: skip sanity checks\n"); + printf(" -t: Number of seconds to run\n"); exit(-1); } } @@ -167,7 +172,7 @@ int main(int argc, char **argv) printf("Running Asynchronous Switching Tests...\n"); pid = fork(); if (!pid) - return run_tests(60); + return run_tests(runtime); while (pid != waitpid(pid, &status, WNOHANG)) for (i = 0; i < count; i++) -- cgit v1.2.3 From ce7d101750ff8450af16d6e1f6ccba10d44ae9f3 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 13 Jul 2022 22:46:21 +0200 Subject: selftests: timers: clocksource-switch: adapt to kselftest framework So we have proper counters at the end of a test. We also print the kselftest header at the end of the test, so we don't mix with the output of the child process. There is only this one test anyhow. Signed-off-by: Wolfram Sang Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/clocksource-switch.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index 577e4b74211a..c5264594064c 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -183,7 +183,9 @@ int main(int argc, char **argv) out: change_clocksource(orig_clk); - if (status) - return ksft_exit_fail(); - return ksft_exit_pass(); + /* Print at the end to not mix output with child process */ + ksft_print_header(); + ksft_set_plan(1); + ksft_test_result(!status, "clocksource-switch\n"); + ksft_exit(!status); } -- cgit v1.2.3 From a917dd94b832c8f4ad458d0ca2bd1007466c5643 Mon Sep 17 00:00:00 2001 From: Guillaume Tucker Date: Wed, 13 Jul 2022 08:35:01 +0200 Subject: selftests/landlock: drop deprecated headers dependency The khdr make target has been removed, so drop it from the landlock Makefile dependencies as well as related include paths that are standard for headers in the kernel tree. 
Signed-off-by: Guillaume Tucker Reported-by: Anders Roxell Signed-off-by: Shuah Khan --- tools/testing/selftests/landlock/Makefile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile index 1313e44e8fb9..a6959df28eb0 100644 --- a/tools/testing/selftests/landlock/Makefile +++ b/tools/testing/selftests/landlock/Makefile @@ -11,13 +11,8 @@ TEST_GEN_PROGS_EXTENDED := true OVERRIDE_TARGETS := 1 include ../lib.mk -khdr_dir = $(top_srcdir)/usr/include - -$(khdr_dir)/linux/landlock.h: khdr - @: - $(OUTPUT)/true: true.c $(LINK.c) $< $(LDLIBS) -o $@ -static -$(OUTPUT)/%_test: %_test.c $(khdr_dir)/linux/landlock.h ../kselftest_harness.h common.h - $(LINK.c) $< $(LDLIBS) -o $@ -lcap -I$(khdr_dir) +$(OUTPUT)/%_test: %_test.c ../kselftest_harness.h common.h + $(LINK.c) $< $(LDLIBS) -o $@ -lcap -- cgit v1.2.3 From e2863a78593d638d3924a6f67900c4820034f349 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 1 Jul 2022 05:54:24 -0700 Subject: lib/bitmap: change return types to bool where appropriate Some bitmap functions return boolean results in int variables. Fix it by changing return types to bool. Signed-off-by: Yury Norov --- include/linux/bitmap.h | 8 ++++---- lib/bitmap.c | 4 ++-- tools/include/linux/bitmap.h | 8 ++++---- tools/lib/bitmap.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index c91638e507f2..e1a438bdda52 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -148,13 +148,13 @@ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits); -int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); -int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, +bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, @@ -315,7 +315,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif -static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, +static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) @@ -341,7 +341,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, __bitmap_xor(dst, src1, src2, nbits); } -static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, +static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) diff --git a/lib/bitmap.c b/lib/bitmap.c index e903e13c62e1..9bc80f5bf149 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -237,7 +237,7 @@ void 
bitmap_cut(unsigned long *dst, const unsigned long *src, } EXPORT_SYMBOL(bitmap_cut); -int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; @@ -275,7 +275,7 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_xor); -int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, +bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index afdf93bebaaf..2ae7ab8ed7d1 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -14,7 +14,7 @@ int __bitmap_weight(const unsigned long *bitmap, int bits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); -int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); @@ -45,7 +45,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); } -static inline int bitmap_empty(const unsigned long *src, unsigned nbits) +static inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -53,7 +53,7 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits) return find_first_bit(src, nbits) == nbits; } -static inline int bitmap_full(const unsigned long *src, unsigned int nbits) +static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); @@ -146,7 +146,7 @@ size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits, * @src2: operand 2 * @nbits: size of bitmap */ -static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, +static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index 354f8cdc0880..2e351d63fdba 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -57,7 +57,7 @@ size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits, return ret; } -int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; -- cgit v1.2.3 
From 7fb27a56b9ebd8a77d9dd188e8a42bff99bc3443 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jul 2022 10:23:16 +0200 Subject: selftests/bpf: Do not attach kprobe_multi bench to bpf_dispatcher_xdp_func 
Alexei reported a crash when running test_progs -j on a system with 32 CPUs. It turned out that the kprobe_multi bench test, which attaches to all ftrace-able functions, races with bpf_dispatcher_update, which calls bpf_arch_text_poke on bpf_dispatcher_xdp_func, an ftrace-able function. Ftrace is not aware of this update, so this causes ftrace_bug with: WARNING: CPU: 6 PID: 1985 at arch/x86/kernel/ftrace.c:94 ftrace_verify_code+0x27/0x50 ... 
ftrace_replace_code+0xa3/0x170 ftrace_modify_all_code+0xbd/0x150 ftrace_startup_enable+0x3f/0x50 ftrace_startup+0x98/0xf0 register_ftrace_function+0x20/0x60 register_fprobe_ips+0xbb/0xd0 bpf_kprobe_multi_link_attach+0x179/0x430 __sys_bpf+0x18a1/0x2440 ... ------------[ ftrace bug ]------------ ftrace failed to modify [] bpf_dispatcher_xdp_func+0x0/0x10 actual: ffffffe9:7b:ffffff9c:77:1e Setting ftrace call site to call ftrace function 
It looks like we need some way to hide some functions from ftrace, but meanwhile we work around this by skipping bpf_dispatcher_xdp_func in the kprobe_multi bench test. 
Reported-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220714082316.479181-1-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 335917df0614..d457a55ff408 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -364,6 +364,8 @@ static int get_syms(char ***symsp, size_t *cntp) continue; if (!strncmp(name, "rcu_", 4)) continue; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + continue; if (!strncmp(name, "__ftrace_invalid_address__", sizeof("__ftrace_invalid_address__") - 1)) continue; -- cgit v1.2.3 
From 6a4f7fcd750497cb2fa870f799e8b23270bec6e3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Jul 2022 16:41:08 +0100 Subject: KVM: arm64: selftests: Add support for GICv2 on v3 
The current vgic_init test wrongly assumes that the host cannot support multiple versions of the GIC architecture, while v2 emulation on v3 has almost always been supported (it was supported before the standalone v3 emulation). Tweak the test to support multiple GIC incarnations. 
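The shape of the fix is a common selftest pattern: probe every variant, run what is supported, and skip only when nothing is. A minimal sketch of that control flow (hypothetical probe_variant()/run_variant_tests() helpers, not the vgic code itself; 4 is the kselftest skip exit code):

	#include <stdio.h>

	#define KSFT_SKIP	4

	/* Hypothetical probe: returns 0 when the variant is supported. */
	static int probe_variant(int v)
	{
		return v == 0 ? 0 : -1;	/* pretend only variant 0 exists */
	}

	static void run_variant_tests(int v)
	{
		printf("running tests for variant %d\n", v);
	}

	int main(void)
	{
		int v, cnt_impl = 0;

		for (v = 0; v < 2; v++) {
			if (probe_variant(v))
				continue;
			run_variant_tests(v);
			cnt_impl++;	/* keep going: a host may support both */
		}
		if (!cnt_impl) {
			printf("skip: no supported variant\n");
			return KSFT_SKIP;
		}
		return 0;
	}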
Signed-off-by: Marc Zyngier Fixes: 3f4db37e203b ("KVM: arm64: selftests: Make vgic_init gic version agnostic") Reviewed-by: Ricardo Koller Link: https://lore.kernel.org/r/20220714154108.3531213-1-maz@kernel.org --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 34379c98d2f4..21ba4002fc18 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -670,7 +670,7 @@ int test_kvm_device(uint32_t gic_dev_type) if (!_kvm_create_device(v.vm, other, true, &fd)) { ret = _kvm_create_device(v.vm, other, false, &fd); - TEST_ASSERT(ret && errno == EINVAL, + TEST_ASSERT(ret && (errno == EINVAL || errno == EEXIST), "create GIC device while other version exists"); } @@ -698,6 +698,7 @@ int main(int ac, char **av) { int ret; int pa_bits; + int cnt_impl = 0; pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; max_phys_size = 1ULL << pa_bits; @@ -706,17 +707,19 @@ int main(int ac, char **av) if (!ret) { pr_info("Running GIC_v3 tests.\n"); run_tests(KVM_DEV_TYPE_ARM_VGIC_V3); - return 0; + cnt_impl++; } ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V2); if (!ret) { pr_info("Running GIC_v2 tests.\n"); run_tests(KVM_DEV_TYPE_ARM_VGIC_V2); - return 0; + cnt_impl++; } - print_skip("No GICv2 nor GICv3 support"); - exit(KSFT_SKIP); + if (!cnt_impl) { + print_skip("No GICv2 nor GICv3 support"); + exit(KSFT_SKIP); + } return 0; } -- cgit v1.2.3 
From 4dea97f8636d0514befc9fc5cf342b351b7d0e20 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 1 Jul 2022 05:54:25 -0700 Subject: lib/bitmap: change type of bitmap_weight to unsigned long 
bitmap_weight() doesn't return negative values, so change its type to unsigned long. It may help the compiler to generate better code and catch bugs. 
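A toy illustration of the point (a standalone sketch using the GCC/Clang __builtin_popcountl() builtin, not the kernel implementation): a population count can never be negative, so an unsigned long return type is the natural fit, and caller format strings must change with the prototype, which is exactly what the %d to %ld/%lu conversions in the driver hunks below do.

	#include <stdio.h>

	/* Count set bits across an array of words; the result can never
	 * be negative, so return unsigned long rather than int.
	 */
	static unsigned long count_set_bits(const unsigned long *map,
					    unsigned long nwords)
	{
		unsigned long k, w = 0;

		for (k = 0; k < nwords; k++)
			w += __builtin_popcountl(map[k]);
		return w;
	}

	int main(void)
	{
		unsigned long map[2] = { 0xffUL, 0x1UL };

		printf("weight=%lu\n", count_set_bits(map, 2));	/* prints 9 */
		return 0;
	}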
Signed-off-by: Yury Norov --- drivers/dma/ti/k3-udma.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- drivers/gpu/drm/i915/display/intel_display_power.c | 2 +- drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c | 2 +- drivers/iommu/intel/iommu.c | 2 +- drivers/net/ethernet/mellanox/mlx4/fw.c | 2 +- drivers/net/wireless/ath/ath9k/htc_drv_debug.c | 2 +- drivers/net/wireless/ath/carl9170/debug.c | 2 +- include/linux/bitmap.h | 5 +++-- lib/bitmap.c | 5 ++--- tools/include/linux/bitmap.h | 4 ++-- tools/lib/bitmap.c | 4 ++-- 12 files changed, 19 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 2f0d2c68c93c..07cb48db76ba 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -4997,7 +4997,7 @@ static int setup_resources(struct udma_dev *ud) switch (ud->match_data->type) { case DMA_TYPE_UDMA: dev_info(dev, - "Channels: %d (tchan: %u, rchan: %u, gp-rflow: %u)\n", + "Channels: %d (tchan: %lu, rchan: %lu, gp-rflow: %lu)\n", ch_count, ud->tchan_cnt - bitmap_weight(ud->tchan_map, ud->tchan_cnt), @@ -5008,7 +5008,7 @@ static int setup_resources(struct udma_dev *ud) break; case DMA_TYPE_BCDMA: dev_info(dev, - "Channels: %d (bchan: %u, tchan: %u, rchan: %u)\n", + "Channels: %d (bchan: %lu, tchan: %lu, rchan: %lu)\n", ch_count, ud->bchan_cnt - bitmap_weight(ud->bchan_map, ud->bchan_cnt), @@ -5019,7 +5019,7 @@ static int setup_resources(struct udma_dev *ud) break; case DMA_TYPE_PKTDMA: dev_info(dev, - "Channels: %d (tchan: %u, rchan: %u)\n", + "Channels: %d (tchan: %lu, rchan: %lu)\n", ch_count, ud->tchan_cnt - bitmap_weight(ud->tchan_map, ud->tchan_cnt), diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 16699158e00d..1098506ba1aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -195,7 +195,7 @@ void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev) set_bit(i, adev->gfx.mec.queue_bitmap); } - dev_dbg(adev->dev, "mec queue bitmap weight=%d\n", bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES)); + dev_dbg(adev->dev, "mec queue bitmap weight=%ld\n", bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES)); } void amdgpu_gfx_graphics_queue_acquire(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 949edc983a16..4204e8a0e1a6 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -378,7 +378,7 @@ static void print_power_domains(struct i915_power_domains *power_domains, power_domains); enum intel_display_power_domain domain; - drm_dbg(&i915->drm, "%s (%d):\n", prefix, bitmap_weight(mask->bits, POWER_DOMAIN_NUM)); + drm_dbg(&i915->drm, "%s (%ld):\n", prefix, bitmap_weight(mask->bits, POWER_DOMAIN_NUM)); for_each_power_domain(domain, mask) drm_dbg(&i915->drm, "%s use_count %d\n", intel_display_power_domain_str(domain), diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c index 56a3063545ec..3d35b3982a60 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c @@ -363,7 +363,7 @@ void mdp5_smp_dump(struct mdp5_smp *smp, struct drm_printer *p) } drm_printf(p, "TOTAL:\t%d\t(of %d)\n", total, smp->blk_cnt); - drm_printf(p, "AVAIL:\t%d\n", smp->blk_cnt - + drm_printf(p, "AVAIL:\t%ld\n", smp->blk_cnt - bitmap_weight(state->state, 
smp->blk_cnt)); if (drm_can_sleep()) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 44016594831d..2f1b09f34706 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3946,7 +3946,7 @@ static ssize_t domains_used_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); - return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, + return sprintf(buf, "%ld\n", bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))); } static DEVICE_ATTR_RO(domains_used); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 42c96c9d7fb1..af054a3808ca 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -463,7 +463,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, field = min( bitmap_weight(actv_ports.ports, dev->caps.num_ports), - dev->caps.num_ports); + (unsigned long)dev->caps.num_ports); MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); size = dev->caps.function_caps; /* set PF behaviours */ diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c index b3ed65e5c4da..d32056c4ee1d 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c @@ -296,7 +296,7 @@ static ssize_t read_file_slot(struct file *file, char __user *user_buf, spin_lock_bh(&priv->tx.tx_lock); len = scnprintf(buf, sizeof(buf), "TX slot bitmap : %*pb\n" - "Used slots : %d\n", + "Used slots : %ld\n", MAX_TX_BUF_NUM, priv->tx.tx_slot, bitmap_weight(priv->tx.tx_slot, MAX_TX_BUF_NUM)); spin_unlock_bh(&priv->tx.tx_lock); diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c index bb40889d7c72..7e88882b87d9 100644 --- a/drivers/net/wireless/ath/carl9170/debug.c +++ b/drivers/net/wireless/ath/carl9170/debug.c @@ -221,7 +221,7 @@ static char *carl9170_debugfs_mem_usage_read(struct ar9170 *ar, char *buf, ADD(buf, *len, bufsize, "jar: [%*pb]\n", ar->fw.mem_blocks, ar->mem_bitmap); - ADD(buf, *len, bufsize, "cookies: used:%3d / total:%3d, allocs:%d\n", + ADD(buf, *len, bufsize, "cookies: used:%3ld / total:%3d, allocs:%d\n", bitmap_weight(ar->mem_bitmap, ar->fw.mem_blocks), ar->fw.mem_blocks, atomic_read(&ar->mem_allocs)); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index e1a438bdda52..035d4ac66641 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -163,7 +163,7 @@ bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); -int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); +unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); @@ -431,7 +431,8 @@ static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) return find_first_zero_bit(src, nbits) == nbits; } -static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits) +static __always_inline +unsigned long bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/lib/bitmap.c b/lib/bitmap.c index 
9bc80f5bf149..2b67cd657692 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -333,10 +333,9 @@ bool __bitmap_subset(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_subset); -int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) +unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { - unsigned int k, lim = bits/BITS_PER_LONG; - int w = 0; + unsigned long k, w = 0, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; k++) w += hweight_long(bitmap[k]); diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 2ae7ab8ed7d1..ae1852e39142 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -11,7 +11,7 @@ #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] -int __bitmap_weight(const unsigned long *bitmap, int bits); +unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, @@ -61,7 +61,7 @@ static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) return find_first_zero_bit(src, nbits) == nbits; } -static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) +static inline unsigned long bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index 2e351d63fdba..e1fafc131a49 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -5,9 +5,9 @@ */ #include -int __bitmap_weight(const unsigned long *bitmap, int bits) +unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { - int k, w = 0, lim = bits/BITS_PER_LONG; + unsigned long k, w = 0, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; k++) w += hweight_long(bitmap[k]); -- cgit v1.2.3 
From b3f6c43d008c5ab103e00361671c456d45012547 Mon Sep 17 00:00:00 2001 From: Todd Brandt Date: Thu, 7 Jul 2022 15:54:51 -0700 Subject: pm-graph v5.9 
bootgraph: - fix parsing of /proc/version to be much more flexible - check kernel version to disallow ftrace on anything older than 4.10 
sleepgraph: - include fix to bugzilla 212761 in case it regresses - fix for -proc bug: https://github.com/intel/pm-graph/pull/20 - add -debugtiming arg to get timestamps on prints - allow use of the netfix tool hosted in the github repo - read s0ix data from pmc_core for better debug - include more system data in the output log - do a better job testing input file usability - flag more error data from dmesg in the timeline - pre-parse the trace log to fix any ordering issues - add new parser to process dmesg-only timelines - remove superfluous sleep(5) in multitest mode 
config/custom-timeline-functions.cfg: - change some names to keep up to date 
README: - new version, small wording changes 
Signed-off-by: Todd Brandt Signed-off-by: Rafael J. 
Wysocki --- tools/power/pm-graph/README | 6 +- tools/power/pm-graph/bootgraph.py | 20 +- .../pm-graph/config/custom-timeline-functions.cfg | 2 +- tools/power/pm-graph/sleepgraph.py | 518 ++++++++++++++------- 4 files changed, 360 insertions(+), 186 deletions(-) (limited to 'tools') diff --git a/tools/power/pm-graph/README b/tools/power/pm-graph/README index da468bd510ca..e6020c0d59ec 100644 --- a/tools/power/pm-graph/README +++ b/tools/power/pm-graph/README @@ -6,7 +6,7 @@ |_| |___/ |_| pm-graph: suspend/resume/boot timing analysis tools - Version: 5.8 + Version: 5.9 Author: Todd Brandt Home Page: https://01.org/pm-graph @@ -97,8 +97,8 @@ (kernel/pre-3.15/enable_trace_events_suspend_resume.patch) (kernel/pre-3.15/enable_trace_events_device_pm_callback.patch) - If you're using a kernel older than 3.15.0, the following - additional kernel parameters are required: + If you're using bootgraph, or sleepgraph with a kernel older than 3.15.0, + the following additional kernel parameters are required: (e.g. in file /etc/default/grub) GRUB_CMDLINE_LINUX_DEFAULT="... initcall_debug log_buf_len=32M ..." diff --git a/tools/power/pm-graph/bootgraph.py b/tools/power/pm-graph/bootgraph.py index 2823cd3122f7..f96f50e0c336 100755 --- a/tools/power/pm-graph/bootgraph.py +++ b/tools/power/pm-graph/bootgraph.py @@ -69,22 +69,24 @@ class SystemValues(aslib.SystemValues): bootloader = 'grub' blexec = [] def __init__(self): - self.hostname = platform.node() + self.kernel, self.hostname = 'unknown', platform.node() self.testtime = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') if os.path.exists('/proc/version'): fp = open('/proc/version', 'r') - val = fp.read().strip() + self.kernel = self.kernelVersion(fp.read().strip()) fp.close() - self.kernel = self.kernelVersion(val) - else: - self.kernel = 'unknown' self.testdir = datetime.now().strftime('boot-%y%m%d-%H%M%S') def kernelVersion(self, msg): - return msg.split()[2] + m = re.match('^[Ll]inux *[Vv]ersion *(?P\S*) .*', msg) + if m: + return m.group('v') + return 'unknown' def checkFtraceKernelVersion(self): - val = tuple(map(int, self.kernel.split('-')[0].split('.'))) - if val >= (4, 10, 0): - return True + m = re.match('^(?P[0-9]*)\.(?P[0-9]*)\.(?P[0-9]*).*', self.kernel) + if m: + val = tuple(map(int, m.groups())) + if val >= (4, 10, 0): + return True return False def kernelParams(self): cmdline = 'initcall_debug log_buf_len=32M' diff --git a/tools/power/pm-graph/config/custom-timeline-functions.cfg b/tools/power/pm-graph/config/custom-timeline-functions.cfg index 962e5768681c..4f80ad7d7275 100644 --- a/tools/power/pm-graph/config/custom-timeline-functions.cfg +++ b/tools/power/pm-graph/config/custom-timeline-functions.cfg @@ -125,7 +125,7 @@ acpi_suspend_begin: suspend_console: acpi_pm_prepare: syscore_suspend: -arch_thaw_secondary_cpus_end: +arch_enable_nonboot_cpus_end: syscore_resume: acpi_pm_finish: resume_console: diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index ffd50953a024..33981adcdd68 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -66,8 +66,13 @@ from threading import Thread from subprocess import call, Popen, PIPE import base64 +debugtiming = False +mystarttime = time.time() def pprint(msg): - print(msg) + if debugtiming: + print('[%09.3f] %s' % (time.time()-mystarttime, msg)) + else: + print(msg) sys.stdout.flush() def ascii(text): @@ -81,13 +86,14 @@ def ascii(text): # store system values and test parameters class SystemValues: title = 'SleepGraph' - version = 
'5.8' + version = '5.9' ansi = False rs = 0 display = '' gzip = False sync = False wifi = False + netfix = False verbose = False testlog = True dmesglog = True @@ -108,6 +114,7 @@ class SystemValues: cpucount = 0 memtotal = 204800 memfree = 204800 + osversion = '' srgap = 0 cgexp = False testdir = '' @@ -116,6 +123,7 @@ class SystemValues: fpdtpath = '/sys/firmware/acpi/tables/FPDT' epath = '/sys/kernel/debug/tracing/events/power/' pmdpath = '/sys/power/pm_debug_messages' + s0ixpath = '/sys/module/intel_pmc_core/parameters/warn_on_s0ix_failures' acpipath='/sys/module/acpi/parameters/debug_level' traceevents = [ 'suspend_resume', @@ -156,6 +164,7 @@ class SystemValues: ftop = False usetraceevents = False usetracemarkers = True + useftrace = True usekprobes = True usedevsrc = False useprocmon = False @@ -279,10 +288,16 @@ class SystemValues: 'intel_fbdev_set_suspend': {}, } infocmds = [ + [0, 'sysinfo', 'uname', '-a'], + [0, 'cpuinfo', 'head', '-7', '/proc/cpuinfo'], [0, 'kparams', 'cat', '/proc/cmdline'], [0, 'mcelog', 'mcelog'], [0, 'pcidevices', 'lspci', '-tv'], - [0, 'usbdevices', 'lsusb', '-t'], + [0, 'usbdevices', 'lsusb', '-tv'], + [0, 'acpidevices', 'sh', '-c', 'ls -l /sys/bus/acpi/devices/*/physical_node'], + [0, 's0ix_require', 'cat', '/sys/kernel/debug/pmc_core/substate_requirements'], + [0, 's0ix_debug', 'cat', '/sys/kernel/debug/pmc_core/slp_s0_debug_status'], + [1, 's0ix_residency', 'cat', '/sys/kernel/debug/pmc_core/slp_s0_residency_usec'], [1, 'interrupts', 'cat', '/proc/interrupts'], [1, 'wakeups', 'cat', '/sys/kernel/debug/wakeup_sources'], [2, 'gpecounts', 'sh', '-c', 'grep -v invalid /sys/firmware/acpi/interrupts/*'], @@ -358,8 +373,19 @@ class SystemValues: self.outputResult({'error':msg}) sys.exit(1) return False - def usable(self, file): - return (os.path.exists(file) and os.path.getsize(file) > 0) + def usable(self, file, ishtml=False): + if not os.path.exists(file) or os.path.getsize(file) < 1: + return False + if ishtml: + try: + fp = open(file, 'r') + res = fp.read(1000) + fp.close() + except: + return False + if '' not in res: + return False + return True def getExec(self, cmd): try: fp = Popen(['which', cmd], stdout=PIPE, stderr=PIPE).stdout @@ -413,12 +439,16 @@ class SystemValues: r = info['bios-release-date'] if 'bios-release-date' in info else '' self.sysstamp = '# sysinfo | man:%s | plat:%s | cpu:%s | bios:%s | biosdate:%s | numcpu:%d | memsz:%d | memfr:%d' % \ (m, p, c, b, r, self.cpucount, self.memtotal, self.memfree) + if self.osversion: + self.sysstamp += ' | os:%s' % self.osversion def printSystemInfo(self, fatal=False): self.rootCheck(True) out = dmidecode(self.mempath, fatal) if len(out) < 1: return fmt = '%-24s: %s' + if self.osversion: + print(fmt % ('os-version', self.osversion)) for name in sorted(out): print(fmt % (name, out[name])) print(fmt % ('cpucount', ('%d' % self.cpucount))) @@ -426,20 +456,25 @@ class SystemValues: print(fmt % ('memfree', ('%d kB' % self.memfree))) def cpuInfo(self): self.cpucount = 0 - fp = open('/proc/cpuinfo', 'r') - for line in fp: - if re.match('^processor[ \t]*:[ \t]*[0-9]*', line): - self.cpucount += 1 - fp.close() - fp = open('/proc/meminfo', 'r') - for line in fp: - m = re.match('^MemTotal:[ \t]*(?P[0-9]*) *kB', line) - if m: - self.memtotal = int(m.group('sz')) - m = re.match('^MemFree:[ \t]*(?P[0-9]*) *kB', line) - if m: - self.memfree = int(m.group('sz')) - fp.close() + if os.path.exists('/proc/cpuinfo'): + with open('/proc/cpuinfo', 'r') as fp: + for line in fp: + if re.match('^processor[ \t]*:[ \t]*[0-9]*', 
line): + self.cpucount += 1 + if os.path.exists('/proc/meminfo'): + with open('/proc/meminfo', 'r') as fp: + for line in fp: + m = re.match('^MemTotal:[ \t]*(?P[0-9]*) *kB', line) + if m: + self.memtotal = int(m.group('sz')) + m = re.match('^MemFree:[ \t]*(?P[0-9]*) *kB', line) + if m: + self.memfree = int(m.group('sz')) + if os.path.exists('/etc/os-release'): + with open('/etc/os-release', 'r') as fp: + for line in fp: + if line.startswith('PRETTY_NAME='): + self.osversion = line[12:].strip().replace('"', '') def initTestOutput(self, name): self.prefix = self.hostname v = open('/proc/version', 'r').read().strip() @@ -698,6 +733,8 @@ class SystemValues: return False return True def fsetVal(self, val, path): + if not self.useftrace: + return False return self.setVal(val, self.tpath+path) def getVal(self, file): res = '' @@ -711,9 +748,11 @@ class SystemValues: pass return res def fgetVal(self, path): + if not self.useftrace: + return '' return self.getVal(self.tpath+path) def cleanupFtrace(self): - if(self.usecallgraph or self.usetraceevents or self.usedevsrc): + if self.useftrace: self.fsetVal('0', 'events/kprobes/enable') self.fsetVal('', 'kprobe_events') self.fsetVal('1024', 'buffer_size_kb') @@ -734,13 +773,14 @@ class SystemValues: return True return False def initFtrace(self, quiet=False): + if not self.useftrace: + return if not quiet: sysvals.printSystemInfo(False) pprint('INITIALIZING FTRACE...') # turn trace off self.fsetVal('0', 'tracing_on') self.cleanupFtrace() - self.testVal(self.pmdpath, 'basic', '1') # set the trace clock to global self.fsetVal('global', 'trace_clock') self.fsetVal('nop', 'current_tracer') @@ -766,6 +806,10 @@ class SystemValues: # set trace type self.fsetVal('function_graph', 'current_tracer') self.fsetVal('', 'set_ftrace_filter') + # temporary hack to fix https://bugzilla.kernel.org/show_bug.cgi?id=212761 + fp = open(self.tpath+'set_ftrace_notrace', 'w') + fp.write('native_queued_spin_lock_slowpath\ndev_driver_string') + fp.close() # set trace format options self.fsetVal('print-parent', 'trace_options') self.fsetVal('funcgraph-abstime', 'trace_options') @@ -846,6 +890,8 @@ class SystemValues: fp.write('# turbostat %s\n' % test['turbo']) if 'wifi' in test: fp.write('# wifi %s\n' % test['wifi']) + if 'netfix' in test: + fp.write('# netfix %s\n' % test['netfix']) if test['error'] or len(testdata) > 1: fp.write('# enter_sleep_error %s\n' % test['error']) return fp @@ -865,6 +911,8 @@ class SystemValues: fp.write('error%s: %s\n' % (n, testdata['error'])) else: fp.write('result%s: pass\n' % n) + if 'mode' in testdata: + fp.write('mode%s: %s\n' % (n, testdata['mode'])) for v in ['suspend', 'resume', 'boot', 'lastinit']: if v in testdata: fp.write('%s%s: %.3f\n' % (v, n, testdata[v])) @@ -901,6 +949,8 @@ class SystemValues: fp.write(text) fp.close() def dlog(self, text): + if not self.dmesgfile: + return self.putlog(self.dmesgfile, '# %s\n' % text) def flog(self, text): self.putlog(self.ftracefile, text) @@ -954,34 +1004,31 @@ class SystemValues: dirname = props[dev].syspath if not dirname or not os.path.exists(dirname): continue - with open(dirname+'/power/async') as fp: - text = fp.read() - props[dev].isasync = False - if 'enabled' in text: + props[dev].isasync = False + if os.path.exists(dirname+'/power/async'): + fp = open(dirname+'/power/async') + if 'enabled' in fp.read(): props[dev].isasync = True + fp.close() fields = os.listdir(dirname) - if 'product' in fields: - with open(dirname+'/product', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 
'name' in fields: - with open(dirname+'/name', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'model' in fields: - with open(dirname+'/model', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'description' in fields: - with open(dirname+'/description', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'id' in fields: - with open(dirname+'/id', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'idVendor' in fields and 'idProduct' in fields: - idv, idp = '', '' - with open(dirname+'/idVendor', 'rb') as fp: - idv = ascii(fp.read()).strip() - with open(dirname+'/idProduct', 'rb') as fp: - idp = ascii(fp.read()).strip() - props[dev].altname = '%s:%s' % (idv, idp) + for file in ['product', 'name', 'model', 'description', 'id', 'idVendor']: + if file not in fields: + continue + try: + with open(os.path.join(dirname, file), 'rb') as fp: + props[dev].altname = ascii(fp.read()) + except: + continue + if file == 'idVendor': + idv, idp = props[dev].altname.strip(), '' + try: + with open(os.path.join(dirname, 'idProduct'), 'rb') as fp: + idp = ascii(fp.read()).strip() + except: + props[dev].altname = '' + break + props[dev].altname = '%s:%s' % (idv, idp) + break if props[dev].altname: out = props[dev].altname.strip().replace('\n', ' ')\ .replace(',', ' ').replace(';', ' ') @@ -1047,7 +1094,7 @@ class SystemValues: self.cmd1[name] = self.dictify(info, delta) elif not debug and delta and name in self.cmd1: before, after = self.cmd1[name], self.dictify(info, delta) - dinfo = ('\t%s\n' % before['@']) if '@' in before else '' + dinfo = ('\t%s\n' % before['@']) if '@' in before and len(before) > 1 else '' prefix = self.commonPrefix(list(before.keys())) for key in sorted(before): if key in after and before[key] != after[key]: @@ -1128,6 +1175,22 @@ class SystemValues: val = valline[idx] out.append('%s=%s' % (key, val)) return '|'.join(out) + def netfixon(self, net='both'): + cmd = self.getExec('netfix') + if not cmd: + return '' + fp = Popen([cmd, '-s', net, 'on'], stdout=PIPE, stderr=PIPE).stdout + out = ascii(fp.read()).strip() + fp.close() + return out + def wifiRepair(self): + out = self.netfixon('wifi') + if not out or 'error' in out.lower(): + return '' + m = re.match('WIFI \S* ONLINE (?P\S*)', out) + if not m: + return 'dead' + return m.group('action') def wifiDetails(self, dev): try: info = open('/sys/class/net/%s/device/uevent' % dev, 'r').read().strip() @@ -1144,12 +1207,12 @@ class SystemValues: except: return '' for line in reversed(w.split('\n')): - m = re.match(' *(?P.*): (?P[0-9a-f]*) .*', w.split('\n')[-1]) + m = re.match(' *(?P.*): (?P[0-9a-f]*) .*', line) if not m or (dev and dev != m.group('dev')): continue return m.group('dev') return '' - def pollWifi(self, dev, timeout=60): + def pollWifi(self, dev, timeout=10): start = time.time() while (time.time() - start) < timeout: w = self.checkWifi(dev) @@ -1157,6 +1220,11 @@ class SystemValues: return '%s reconnected %.2f' % \ (self.wifiDetails(dev), max(0, time.time() - start)) time.sleep(0.01) + if self.netfix: + res = self.wifiRepair() + if res: + timeout = max(0, time.time() - start) + return '%s %s %d' % (self.wifiDetails(dev), res, timeout) return '%s timeout %d' % (self.wifiDetails(dev), timeout) def errorSummary(self, errinfo, msg): found = False @@ -1283,10 +1351,10 @@ sysvals = SystemValues() switchvalues = ['enable', 'disable', 'on', 'off', 'true', 'false', '1', '0'] switchoff = ['disable', 'off', 'false', '0'] suspendmodename = { - 'freeze': 'Freeze (S0)', - 'standby': 'Standby (S1)', - 
'mem': 'Suspend (S3)', - 'disk': 'Hibernate (S4)' + 'standby': 'standby (S1)', + 'freeze': 'freeze (S2idle)', + 'mem': 'suspend (S3)', + 'disk': 'hibernate (S4)' } # Class: DevProps @@ -1376,6 +1444,7 @@ class Data: 'INVALID' : r'(?i).*\bINVALID\b.*', 'CRASH' : r'(?i).*\bCRASHED\b.*', 'TIMEOUT' : r'(?i).*\bTIMEOUT\b.*', + 'ABORT' : r'(?i).*\bABORT\b.*', 'IRQ' : r'.*\bgenirq: .*', 'TASKFAIL': r'.*Freezing of tasks *.*', 'ACPI' : r'.*\bACPI *(?P[A-Za-z]*) *Error[: ].*', @@ -1724,9 +1793,9 @@ class Data: if 'waking' in self.dmesg[lp]: tCnt = self.dmesg[lp]['waking'][0] if self.dmesg[lp]['waking'][1] >= 0.001: - tTry = '-%.0f' % (round(self.dmesg[lp]['waking'][1] * 1000)) + tTry = '%.0f' % (round(self.dmesg[lp]['waking'][1] * 1000)) else: - tTry = '-%.3f' % (self.dmesg[lp]['waking'][1] * 1000) + tTry = '%.3f' % (self.dmesg[lp]['waking'][1] * 1000) text = '%.0f (%s ms waking %d times)' % (tL * 1000, tTry, tCnt) else: text = '%.0f' % (tL * 1000) @@ -2107,6 +2176,30 @@ class Data: # set resume complete to end at end marker if 'resume_complete' in dm: dm['resume_complete']['end'] = time + def initcall_debug_call(self, line, quick=False): + m = re.match('.*(\[ *)(?P[0-9\.]*)(\]) .* (?P.*)\: '+\ + 'PM: *calling .* @ (?P.*), parent: (?P

<p>.*)', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) .* (?P<f>.*)\: '+\ + 'calling .* @ (?P<n>.*), parent: (?P<p>.*)', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) calling '+\ + '(?P<f>.*)\+ @ (?P<n>.*), parent: (?P<p>.*)', line) + if m: + return True if quick else m.group('t', 'f', 'n', 'p') + return False if quick else ('', '', '', '') + def initcall_debug_return(self, line, quick=False): + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) .* (?P<f>.*)\: PM: '+\ + '.* returned (?P<r>[0-9]*) after (?P<dt>[0-9]*) usecs', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) .* (?P<f>.*)\: '+\ + '.* returned (?P<r>[0-9]*) after (?P<dt>[0-9]*) usecs', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) call '+\ + '(?P<f>.*)\+ returned .* after (?P<dt>
.*) usecs', line) + if m: + return True if quick else m.group('t', 'f', 'dt') + return False if quick else ('', '', '') def debugPrint(self): for p in self.sortedPhases(): list = self.dmesg[p]['list'] @@ -2880,10 +2973,11 @@ class TestProps: cmdlinefmt = '^# command \| (?P.*)' kparamsfmt = '^# kparams \| (?P.*)' devpropfmt = '# Device Properties: .*' - pinfofmt = '# platform-(?P[a-z,A-Z,0-9]*): (?P.*)' + pinfofmt = '# platform-(?P[a-z,A-Z,0-9,_]*): (?P.*)' tracertypefmt = '# tracer: (?P.*)' firmwarefmt = '# fwsuspend (?P[0-9]*) fwresume (?P[0-9]*)$' procexecfmt = 'ps - (?P.*)$' + procmultifmt = '@(?P[0-9]*)\|(?P.*)$' ftrace_line_fmt_fg = \ '^ *(?P
.*) usecs', last) - if(mc and mr and (mc.group('t') == mr.group('t')) and - (mc.group('f') == mr.group('f'))): + ct, cf, n, p = data.initcall_debug_call(line) + rt, rf, l = data.initcall_debug_return(last) + if ct and rt and ct == rt and cf == rf: i = data.dmesgtext.index(last) j = data.dmesgtext.index(line) data.dmesgtext[i] = line @@ -3777,7 +3935,6 @@ def loadKernelLog(): # Function: parseKernelLog # Description: -# [deprecated for kernel 3.15.0 or newer] # Analyse a dmesg log output file generated from this app during # the execution phase. Create a set of device structures in memory # for subsequent formatting in the html output file @@ -3796,30 +3953,30 @@ def parseKernelLog(data): # dmesg phase match table dm = { - 'suspend_prepare': ['PM: Syncing filesystems.*'], - 'suspend': ['PM: Entering [a-z]* sleep.*', 'Suspending console.*'], - 'suspend_late': ['PM: suspend of devices complete after.*'], - 'suspend_noirq': ['PM: late suspend of devices complete after.*'], - 'suspend_machine': ['PM: noirq suspend of devices complete after.*'], - 'resume_machine': ['ACPI: Low-level resume complete.*'], - 'resume_noirq': ['ACPI: Waking up from system sleep state.*'], - 'resume_early': ['PM: noirq resume of devices complete after.*'], - 'resume': ['PM: early resume of devices complete after.*'], - 'resume_complete': ['PM: resume of devices complete after.*'], + 'suspend_prepare': ['PM: Syncing filesystems.*', 'PM: suspend entry.*'], + 'suspend': ['PM: Entering [a-z]* sleep.*', 'Suspending console.*', + 'PM: Suspending system .*'], + 'suspend_late': ['PM: suspend of devices complete after.*', + 'PM: freeze of devices complete after.*'], + 'suspend_noirq': ['PM: late suspend of devices complete after.*', + 'PM: late freeze of devices complete after.*'], + 'suspend_machine': ['PM: suspend-to-idle', + 'PM: noirq suspend of devices complete after.*', + 'PM: noirq freeze of devices complete after.*'], + 'resume_machine': ['PM: Timekeeping suspended for.*', + 'ACPI: Low-level resume complete.*', + 'ACPI: resume from mwait', + 'Suspended for [0-9\.]* seconds'], + 'resume_noirq': ['PM: resume from suspend-to-idle', + 'ACPI: Waking up from system sleep state.*'], + 'resume_early': ['PM: noirq resume of devices complete after.*', + 'PM: noirq restore of devices complete after.*'], + 'resume': ['PM: early resume of devices complete after.*', + 'PM: early restore of devices complete after.*'], + 'resume_complete': ['PM: resume of devices complete after.*', + 'PM: restore of devices complete after.*'], 'post_resume': ['.*Restarting tasks \.\.\..*'], } - if(sysvals.suspendmode == 'standby'): - dm['resume_machine'] = ['PM: Restoring platform NVS memory'] - elif(sysvals.suspendmode == 'disk'): - dm['suspend_late'] = ['PM: freeze of devices complete after.*'] - dm['suspend_noirq'] = ['PM: late freeze of devices complete after.*'] - dm['suspend_machine'] = ['PM: noirq freeze of devices complete after.*'] - dm['resume_machine'] = ['PM: Restoring platform NVS memory'] - dm['resume_early'] = ['PM: noirq restore of devices complete after.*'] - dm['resume'] = ['PM: early restore of devices complete after.*'] - dm['resume_complete'] = ['PM: restore of devices complete after.*'] - elif(sysvals.suspendmode == 'freeze'): - dm['resume_machine'] = ['ACPI: resume from mwait'] # action table (expected events that occur and show up in dmesg) at = { @@ -3867,12 +4024,13 @@ def parseKernelLog(data): for s in dm[p]: if(re.match(s, msg)): phasechange, phase = True, p + dm[p] = [s] break # hack for determining resume_machine end for 
freeze if(not sysvals.usetraceevents and sysvals.suspendmode == 'freeze' \ and phase == 'resume_machine' and \ - re.match('calling (?P<f>.*)\+ @ .*, parent: .*', msg)): + data.initcall_debug_call(line, True)): data.setPhase(phase, ktime, False) phase = 'resume_noirq' data.setPhase(phase, ktime, True) @@ -3945,26 +4103,18 @@ def parseKernelLog(data): # -- device callbacks -- if(phase in data.sortedPhases()): # device init call - if(re.match('calling (?P<f>.*)\+ @ .*, parent: .*', msg)): - sm = re.match('calling (?P<f>.*)\+ @ '+\ - '(?P<n>.*), parent: (?P<p>.*)', msg); - f = sm.group('f') - n = sm.group('n') - p = sm.group('p') - if(f and n and p): - data.newAction(phase, f, int(n), p, ktime, -1, '') - # device init return - elif(re.match('call (?P<f>.*)\+ returned .* after '+\ - '(?P<t>.*) usecs', msg)): - sm = re.match('call (?P<f>.*)\+ returned .* after '+\ - '(?P<t>.*) usecs(?P<a>.*)', msg); - f = sm.group('f') - t = sm.group('t') - list = data.dmesg[phase]['list'] - if(f in list): - dev = list[f] - dev['length'] = int(t) - dev['end'] = ktime + t, f, n, p = data.initcall_debug_call(line) + if t and f and n and p: + data.newAction(phase, f, int(n), p, ktime, -1, '') + else: + # device init return + t, f, l = data.initcall_debug_return(line) + if t and f and l: + list = data.dmesg[phase]['list'] + if(f in list): + dev = list[f] + dev['length'] = int(l) + dev['end'] = ktime # if trace events are not available, these are better than nothing if(not sysvals.usetraceevents): @@ -4006,6 +4156,8 @@ def parseKernelLog(data): # fill in any missing phases phasedef = data.phasedef terr, lp = '', 'suspend_prepare' + if lp not in data.dmesg: + doError('dmesg log format has changed, could not find start of suspend') for p in sorted(phasedef, key=lambda k:phasedef[k]['order']): if p not in data.dmesg: if not terr: @@ -5302,7 +5454,7 @@ def executeSuspend(quiet=False): sv.dlog('read dmesg') sv.initdmesg() # start ftrace - if(sv.usecallgraph or sv.usetraceevents): + if sv.useftrace: if not quiet: pprint('START TRACING') sv.dlog('start ftrace tracing') @@ -5334,8 +5486,7 @@ def executeSuspend(quiet=False): sv.dlog('enable RTC wake alarm') sv.rtcWakeAlarmOn() # start of suspend trace marker - if(sv.usecallgraph or sv.usetraceevents): - sv.fsetVal(datetime.now().strftime(sv.tmstart), 'trace_marker') + sv.fsetVal(datetime.now().strftime(sv.tmstart), 'trace_marker') # predelay delay if(count == 1 and sv.predelay > 0): sv.fsetVal('WAIT %d' % sv.predelay, 'trace_marker') @@ -5384,11 +5535,17 @@ def executeSuspend(quiet=False): sv.fsetVal('WAIT END', 'trace_marker') # return from suspend pprint('RESUME COMPLETE') - if(sv.usecallgraph or sv.usetraceevents): - sv.fsetVal(datetime.now().strftime(sv.tmend), 'trace_marker') + sv.fsetVal(datetime.now().strftime(sv.tmend), 'trace_marker') if sv.wifi and wifi: tdata['wifi'] = sv.pollWifi(wifi) sv.dlog('wifi check, %s' % tdata['wifi']) + if sv.netfix: + netfixout = sv.netfixon('wired') + elif sv.netfix: + netfixout = sv.netfixon() + if sv.netfix and netfixout: + tdata['netfix'] = netfixout + sv.dlog('netfix, %s' % tdata['netfix']) if(sv.suspendmode == 'mem' or sv.suspendmode == 'command'): sv.dlog('read the ACPI FPDT') tdata['fw'] = getFPDT(False) @@ -5396,7 +5553,7 @@ def executeSuspend(quiet=False): sv.dlog('run the cmdinfo list after') cmdafter = sv.cmdinfo(False) # stop ftrace - if(sv.usecallgraph or sv.usetraceevents): + if sv.useftrace: if sv.useprocmon: sv.dlog('stop the process monitor') pm.stop() @@ -5407,7 +5564,7 @@ def executeSuspend(quiet=False): sysvals.dlog('EXECUTION TRACE END') sv.getdmesg(testdata) # grab a copy of the ftrace output - if(sv.usecallgraph or sv.usetraceevents): + if sv.useftrace: if not quiet: pprint('CAPTURING TRACE') op = sv.writeDatafileHeader(sv.ftracefile, testdata) @@ -5838,13 +5995,19 @@ def statusCheck(probecheck=False): pprint(' please choose one with -m') # check if ftrace is available - res = sysvals.colorText('NO') - ftgood = sysvals.verifyFtrace() - if(ftgood): - res = 'YES' - elif(sysvals.usecallgraph): - status = 'ftrace is not properly supported' - pprint(' is ftrace supported: %s' % res) + 
if sysvals.useftrace: + res = sysvals.colorText('NO') + sysvals.useftrace = sysvals.verifyFtrace() + efmt = '"{0}" uses ftrace, and it is not properly supported' + if sysvals.useftrace: + res = 'YES' + elif sysvals.usecallgraph: + status = efmt.format('-f') + elif sysvals.usedevsrc: + status = efmt.format('-dev') + elif sysvals.useprocmon: + status = efmt.format('-proc') + pprint(' is ftrace supported: %s' % res) # check if kprobes are available if sysvals.usekprobes: @@ -5857,8 +6020,8 @@ def statusCheck(probecheck=False): pprint(' are kprobes supported: %s' % res) # what data source are we using - res = 'DMESG' - if(ftgood): + res = 'DMESG (very limited, ftrace is preferred)' + if sysvals.useftrace: sysvals.usetraceevents = True for e in sysvals.traceevents: if not os.path.exists(sysvals.epath+e): @@ -5879,7 +6042,7 @@ def statusCheck(probecheck=False): pprint(' optional commands this tool may use for info:') no = sysvals.colorText('MISSING') yes = sysvals.colorText('FOUND', 32) - for c in ['turbostat', 'mcelog', 'lspci', 'lsusb']: + for c in ['turbostat', 'mcelog', 'lspci', 'lsusb', 'netfix']: if c == 'turbostat': res = yes if sysvals.haveTurbostat() else no else: @@ -5971,7 +6134,7 @@ def processData(live=False, quiet=False): if not sysvals.stamp: pprint('ERROR: data does not include the expected stamp') return (testruns, {'error': 'timeline generation failed'}) - shown = ['bios', 'biosdate', 'cpu', 'host', 'kernel', 'man', 'memfr', + shown = ['os', 'bios', 'biosdate', 'cpu', 'host', 'kernel', 'man', 'memfr', 'memsz', 'mode', 'numcpu', 'plat', 'time', 'wifi'] sysvals.vprint('System Info:') for key in sorted(sysvals.stamp): @@ -6052,6 +6215,8 @@ def runTest(n=0, quiet=False): if sysvals.display: ret = sysvals.displayControl('init') sysvals.dlog('xset display init, ret = %d' % ret) + sysvals.testVal(sysvals.pmdpath, 'basic', '1') + sysvals.testVal(sysvals.s0ixpath, 'basic', 'Y') sysvals.dlog('initialize ftrace') sysvals.initFtrace(quiet) @@ -6145,9 +6310,12 @@ def data_from_html(file, outpath, issues, fulldetail=False): elist[err[0]] += 1 for i in elist: ilist.append('%sx%d' % (i, elist[i]) if elist[i] > 1 else i) - wifi = find_in_html(html, 'Wifi Resume: ', '') - if wifi: - extra['wifi'] = wifi + line = find_in_html(log, '# wifi ', '\n') + if line: + extra['wifi'] = line + line = find_in_html(log, '# netfix ', '\n') + if line: + extra['netfix'] = line low = find_in_html(html, 'freeze time: ', ' ms') for lowstr in ['waking', '+']: if not low: @@ -6243,7 +6411,7 @@ def genHtml(subdir, force=False): sysvals.ftracefile = file sysvals.setOutputFile() if (sysvals.dmesgfile or sysvals.ftracefile) and sysvals.htmlfile and \ - (force or not sysvals.usable(sysvals.htmlfile)): + (force or not sysvals.usable(sysvals.htmlfile, True)): pprint('FTRACE: %s' % sysvals.ftracefile) if sysvals.dmesgfile: pprint('DMESG : %s' % sysvals.dmesgfile) @@ -6533,6 +6701,7 @@ def printHelp(): ' -skiphtml Run the test and capture the trace logs, but skip the timeline (default: disabled)\n'\ ' -result fn Export a results table to a text file for parsing.\n'\ ' -wifi If a wifi connection is available, check that it reconnects after resume.\n'\ + ' -netfix Use netfix to reset the network in the event it fails to resume.\n'\ ' [testprep]\n'\ ' -sync Sync the filesystems before starting the test\n'\ ' -rs on/off Enable/disable runtime suspend for all devices, restore all after test\n'\ @@ -6615,6 +6784,8 @@ if __name__ == '__main__': elif(arg == '-v'): pprint("Version %s" % sysvals.version) sys.exit(0) + elif(arg == 
'-debugtiming'): + debugtiming = True elif(arg == '-x2'): sysvals.execcount = 2 elif(arg == '-x2delay'): @@ -6657,6 +6828,8 @@ if __name__ == '__main__': sysvals.sync = True elif(arg == '-wifi'): sysvals.wifi = True + elif(arg == '-netfix'): + sysvals.netfix = True elif(arg == '-gzip'): sysvals.gzip = True elif(arg == '-info'): @@ -6819,7 +6992,7 @@ if __name__ == '__main__': sysvals.outdir = val sysvals.notestrun = True if(os.path.isdir(val) == False): - doError('%s is not accesible' % val) + doError('%s is not accessible' % val) elif(arg == '-filter'): try: val = next(args) @@ -6942,12 +7115,11 @@ if __name__ == '__main__': time.sleep(sysvals.multitest['delay']) fmt = 'suspend-%y%m%d-%H%M%S' sysvals.testdir = os.path.join(sysvals.outdir, datetime.now().strftime(fmt)) - ret = runTest(i+1, True) + ret = runTest(i+1, not sysvals.verbose) failcnt = 0 if not ret else failcnt + 1 if sysvals.maxfail > 0 and failcnt >= sysvals.maxfail: pprint('Maximum fail count of %d reached, aborting multitest' % (sysvals.maxfail)) break - time.sleep(5) sysvals.resetlog() sysvals.multistat(False, i, finish) if 'time' in sysvals.multitest and datetime.now() >= finish: -- cgit v1.2.3 From 92c005a1176288c98a0dc49f37376da35bbea071 Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Wed, 15 Jun 2022 21:19:06 +0000 Subject: LSM: SafeSetID: fix userns bug in selftest Not sure how this bug got in here but it's been there since the original merge. I think I tested the code on a system that wouldn't let me clone() with the CLONE_NEWUSER flag set, so I had to comment out these test_userns invocations. Trying to map UID 0 inside the userns to UID 0 outside will never work, even with CAP_SETUID. The code is supposed to test whether we can map UID 0 in the userns to the UID of the parent process (the one with CAP_SETUID that is writing the /proc/[pid]/uid_map file). Signed-off-by: Micah Morton --- tools/testing/selftests/safesetid/safesetid-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index 4b809c93ba36..111dcbcc0491 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -194,7 +194,7 @@ static bool test_userns(bool expect_success) printf("preparing file name string failed"); return false; } - success = write_file(map_file_name, "0 0 1", uid); + success = write_file(map_file_name, "0 %d 1", uid); return success == expect_success; } -- cgit v1.2.3 From b2927170d4fbdf30545eb482133425477625a665 Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Wed, 15 Jun 2022 22:14:14 +0000 Subject: LSM: SafeSetID: selftest cleanup and prepare for GIDs Add some notes on how to run the test, update the policy file paths to reflect recent upstream changes, and prepare the test for adding GID testing. Signed-off-by: Micah Morton --- tools/testing/selftests/safesetid/Makefile | 2 +- tools/testing/selftests/safesetid/safesetid-test.c | 93 ++++++++++++---------- 2 files changed, 51 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/safesetid/Makefile b/tools/testing/selftests/safesetid/Makefile index fa02c4d5ec13..e815bbf2d0f4 100644 --- a/tools/testing/selftests/safesetid/Makefile +++ b/tools/testing/selftests/safesetid/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Makefile for mount selftests. +# Makefile for SafeSetID selftest. 
CFLAGS = -Wall -O2 LDLIBS = -lcap diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index 111dcbcc0491..c16977e4b913 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -16,17 +16,28 @@ #include #include +/* + * NOTES about this test: + * - requires libcap-dev to be installed on the test system + * - requires securityfs to be mounted at /sys/kernel/security, e.g.: + * mount -n -t securityfs -o nodev,noexec,nosuid securityfs /sys/kernel/security + * - needs CONFIG_SECURITYFS and CONFIG_SAFESETID to be enabled + */ + #ifndef CLONE_NEWUSER # define CLONE_NEWUSER 0x10000000 #endif -#define ROOT_USER 0 -#define RESTRICTED_PARENT 1 -#define ALLOWED_CHILD1 2 -#define ALLOWED_CHILD2 3 -#define NO_POLICY_USER 4 +#define ROOT_UGID 0 +#define RESTRICTED_PARENT_UGID 1 +#define ALLOWED_CHILD1_UGID 2 +#define ALLOWED_CHILD2_UGID 3 +#define NO_POLICY_UGID 4 + +#define UGID_POLICY_STRING "1:2\n1:3\n2:2\n3:3\n" -char* add_whitelist_policy_file = "/sys/kernel/security/safesetid/add_whitelist_policy"; +char* add_uid_whitelist_policy_file = "/sys/kernel/security/safesetid/uid_allowlist_policy"; +char* add_gid_whitelist_policy_file = "/sys/kernel/security/safesetid/gid_allowlist_policy"; static void die(char *fmt, ...) { @@ -106,7 +117,7 @@ static void ensure_user_exists(uid_t uid) die("couldn't open file\n"); if (fseek(fd, 0, SEEK_END)) die("couldn't fseek\n"); - snprintf(name_str, 10, "%d", uid); + snprintf(name_str, 10, "user %d", uid); p.pw_name=name_str; p.pw_uid=uid; p.pw_gecos="Test account"; p.pw_dir="/dev/null"; p.pw_shell="/bin/false"; @@ -122,7 +133,7 @@ static void ensure_user_exists(uid_t uid) static void ensure_securityfs_mounted(void) { - int fd = open(add_whitelist_policy_file, O_WRONLY); + int fd = open(add_uid_whitelist_policy_file, O_WRONLY); if (fd < 0) { if (errno == ENOENT) { // Need to mount securityfs } else { if (close(fd) != 0) { die("close of %s failed: %s\n", - add_whitelist_policy_file, strerror(errno)); + add_uid_whitelist_policy_file, strerror(errno)); } } } -static void write_policies(void) +static void write_uid_policies() { - static char *policy_str = - "1:2\n" - "1:3\n" - "2:2\n" - "3:3\n"; + static char *policy_str = UGID_POLICY_STRING; ssize_t written; int fd; - fd = open(add_whitelist_policy_file, O_WRONLY); + fd = open(add_uid_whitelist_policy_file, O_WRONLY); if (fd < 0) - die("can't open add_whitelist_policy file\n"); + die("can't open add_uid_whitelist_policy file\n"); written = write(fd, policy_str, strlen(policy_str)); if (written != strlen(policy_str)) { if (written >= 0) { - die("short write to %s\n", add_whitelist_policy_file); + die("short write to %s\n", add_uid_whitelist_policy_file); } else { die("write to %s failed: %s\n", - add_whitelist_policy_file, strerror(errno)); + add_uid_whitelist_policy_file, strerror(errno)); } } if (close(fd) != 0) { die("close of %s failed: %s\n", - add_whitelist_policy_file, strerror(errno)); + add_uid_whitelist_policy_file, strerror(errno)); } } @@ -260,11 +267,11 @@ static void test_setuid(uid_t child_uid, bool expect_success) static void ensure_users_exist(void) { - ensure_user_exists(ROOT_USER); - ensure_user_exists(RESTRICTED_PARENT); - ensure_user_exists(ALLOWED_CHILD1); - ensure_user_exists(ALLOWED_CHILD2); - ensure_user_exists(NO_POLICY_USER); + ensure_user_exists(ROOT_UGID); + ensure_user_exists(RESTRICTED_PARENT_UGID); + ensure_user_exists(ALLOWED_CHILD1_UGID); + 
ensure_user_exists(ALLOWED_CHILD2_UGID); + ensure_user_exists(NO_POLICY_UGID); } static void drop_caps(bool setid_retained) @@ -285,39 +292,38 @@ int main(int argc, char **argv) { ensure_users_exist(); ensure_securityfs_mounted(); - write_policies(); + write_uid_policies(); if (prctl(PR_SET_KEEPCAPS, 1L)) die("Error with set keepcaps\n"); - // First test to make sure we can write userns mappings from a user - // that doesn't have any restrictions (as long as it has CAP_SETUID); - if (setuid(NO_POLICY_USER) < 0) - die("Error with set uid(%d)\n", NO_POLICY_USER); - if (setgid(NO_POLICY_USER) < 0) - die("Error with set gid(%d)\n", NO_POLICY_USER); - + // First test to make sure we can write userns mappings from a non-root + // user that doesn't have any restrictions (as long as it has + // CAP_SETUID); + if (setgid(NO_POLICY_UGID) < 0) + die("Error with set gid(%d)\n", NO_POLICY_UGID); + if (setuid(NO_POLICY_UGID) < 0) + die("Error with set uid(%d)\n", NO_POLICY_UGID); // Take away all but setid caps drop_caps(true); - // Need PR_SET_DUMPABLE flag set so we can write /proc/[pid]/uid_map // from non-root parent process. if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0)) die("Error with set dumpable\n"); - if (!test_userns(true)) { die("test_userns failed when it should work\n"); } - if (setuid(RESTRICTED_PARENT) < 0) - die("Error with set uid(%d)\n", RESTRICTED_PARENT); - if (setgid(RESTRICTED_PARENT) < 0) - die("Error with set gid(%d)\n", RESTRICTED_PARENT); + // Now switch to a user/group with restrictions + if (setgid(RESTRICTED_PARENT_UGID) < 0) + die("Error with set gid(%d)\n", RESTRICTED_PARENT_UGID); + if (setuid(RESTRICTED_PARENT_UGID) < 0) + die("Error with set uid(%d)\n", RESTRICTED_PARENT_UGID); - test_setuid(ROOT_USER, false); - test_setuid(ALLOWED_CHILD1, true); - test_setuid(ALLOWED_CHILD2, true); - test_setuid(NO_POLICY_USER, false); + test_setuid(ROOT_UGID, false); + test_setuid(ALLOWED_CHILD1_UGID, true); + test_setuid(ALLOWED_CHILD2_UGID, true); + test_setuid(NO_POLICY_UGID, false); if (!test_userns(false)) { die("test_userns worked when it should fail\n"); @@ -331,5 +337,6 @@ int main(int argc, char **argv) // NOTE: this test doesn't clean up users that were created in // /etc/passwd or flush policies that were added to the LSM. + printf("test successful!\n"); return EXIT_SUCCESS; } -- cgit v1.2.3 From a1732d6898ced0523cb4073c7f02d236edf312b1 Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Wed, 15 Jun 2022 22:17:40 +0000 Subject: LSM: SafeSetID: add GID testing to selftest GID security policies were added back in v5.10, update the selftest to reflect this. 
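As a rough illustration of what the "parent:child" entries in UGID_POLICY_STRING mean in practice, the rule set can be exercised by hand with a sketch like the one below. This is illustrative only and not part of the patch; it assumes CONFIG_SAFESETID and CONFIG_SECURITYFS are enabled, securityfs is mounted, and the caller is root so it holds CAP_SETGID. The policy path is the one this test writes.

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		const char *policy = "1:2\n";	/* GID 1 may transition to GID 2 */
		int fd = open("/sys/kernel/security/safesetid/gid_allowlist_policy",
			      O_WRONLY);

		if (fd < 0 || write(fd, policy, strlen(policy)) < 0)
			return 1;
		close(fd);

		if (setgid(1))		/* become the restricted parent GID */
			return 1;
		/* allowed by the rule above; e.g. setgid(4) would be denied */
		return setgid(2) ? 1 : 0;
	}
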
Signed-off-by: Micah Morton --- tools/testing/selftests/safesetid/safesetid-test.c | 131 +++++++++++++++++++++ 1 file changed, 131 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index c16977e4b913..a653c47a4ab5 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -120,6 +121,7 @@ static void ensure_user_exists(uid_t uid) snprintf(name_str, 10, "user %d", uid); p.pw_name=name_str; p.pw_uid=uid; + p.pw_gid=uid; p.pw_gecos="Test account"; p.pw_dir="/dev/null"; p.pw_shell="/bin/false"; @@ -131,6 +133,33 @@ static void ensure_user_exists(uid_t uid) } } +static void ensure_group_exists(gid_t gid) +{ + struct group g; + + FILE *fd; + char name_str[10]; + + if (getgrgid(gid) == NULL) { + memset(&g,0x00,sizeof(g)); + fd=fopen("/etc/group","a"); + if (fd == NULL) + die("couldn't open group file\n"); + if (fseek(fd, 0, SEEK_END)) + die("couldn't fseek group file\n"); + snprintf(name_str, 10, "group %d", gid); + g.gr_name=name_str; + g.gr_gid=gid; + g.gr_passwd=NULL; + g.gr_mem=NULL; + int value = putgrent(&g,fd); + if (value != 0) + die("putgrent failed\n"); + if (fclose(fd)) + die("fclose failed\n"); + } +} + static void ensure_securityfs_mounted(void) { int fd = open(add_uid_whitelist_policy_file, O_WRONLY); @@ -175,6 +204,31 @@ static void write_uid_policies() } } +static void write_gid_policies() +{ + static char *policy_str = UGID_POLICY_STRING; + ssize_t written; + int fd; + + fd = open(add_gid_whitelist_policy_file, O_WRONLY); + if (fd < 0) + die("can't open add_gid_whitelist_policy file\n"); + written = write(fd, policy_str, strlen(policy_str)); + if (written != strlen(policy_str)) { + if (written >= 0) { + die("short write to %s\n", add_gid_whitelist_policy_file); + } else { + die("write to %s failed: %s\n", + add_gid_whitelist_policy_file, strerror(errno)); + } + } + if (close(fd) != 0) { + die("close of %s failed: %s\n", + add_gid_whitelist_policy_file, strerror(errno)); + } +} + + static bool test_userns(bool expect_success) { uid_t uid; @@ -265,6 +319,63 @@ static void test_setuid(uid_t child_uid, bool expect_success) die("should not reach here\n"); } +static void test_setgid(gid_t child_gid, bool expect_success) +{ + pid_t cpid, w; + int wstatus; + + cpid = fork(); + if (cpid == -1) { + die("fork\n"); + } + + if (cpid == 0) { /* Code executed by child */ + if (setgid(child_gid) < 0) + exit(EXIT_FAILURE); + if (getgid() == child_gid) + exit(EXIT_SUCCESS); + else + exit(EXIT_FAILURE); + } else { /* Code executed by parent */ + do { + w = waitpid(cpid, &wstatus, WUNTRACED | WCONTINUED); + if (w == -1) { + die("waitpid\n"); + } + + if (WIFEXITED(wstatus)) { + if (WEXITSTATUS(wstatus) == EXIT_SUCCESS) { + if (expect_success) { + return; + } else { + die("unexpected success\n"); + } + } else { + if (expect_success) { + die("unexpected failure\n"); + } else { + return; + } + } + } else if (WIFSIGNALED(wstatus)) { + if (WTERMSIG(wstatus) == 9) { + if (expect_success) + die("killed unexpectedly\n"); + else + return; + } else { + die("unexpected signal: %d\n", wstatus); + } + } else { + die("unexpected status: %d\n", wstatus); + } + } while (!WIFEXITED(wstatus) && !WIFSIGNALED(wstatus)); + } + + die("should not reach here\n"); +} + + static void ensure_users_exist(void) { ensure_user_exists(ROOT_UGID); @@ -274,6 +385,15 @@ static void 
ensure_users_exist(void) ensure_user_exists(NO_POLICY_UGID); } +static void ensure_groups_exist(void) +{ + ensure_group_exists(ROOT_UGID); + ensure_group_exists(RESTRICTED_PARENT_UGID); + ensure_group_exists(ALLOWED_CHILD1_UGID); + ensure_group_exists(ALLOWED_CHILD2_UGID); + ensure_group_exists(NO_POLICY_UGID); +} + static void drop_caps(bool setid_retained) { cap_value_t cap_values[] = {CAP_SETUID, CAP_SETGID}; @@ -290,9 +410,11 @@ static void drop_caps(bool setid_retained) int main(int argc, char **argv) { + ensure_groups_exist(); ensure_users_exist(); ensure_securityfs_mounted(); write_uid_policies(); + write_gid_policies(); if (prctl(PR_SET_KEEPCAPS, 1L)) die("Error with set keepcaps\n"); @@ -325,6 +447,12 @@ int main(int argc, char **argv) test_setuid(ALLOWED_CHILD2_UGID, true); test_setuid(NO_POLICY_UGID, false); + test_setgid(ROOT_UGID, false); + test_setgid(ALLOWED_CHILD1_UGID, true); + test_setgid(ALLOWED_CHILD2_UGID, true); + test_setgid(NO_POLICY_UGID, false); + + if (!test_userns(false)) { die("test_userns worked when it should fail\n"); } @@ -334,6 +462,9 @@ int main(int argc, char **argv) test_setuid(2, false); test_setuid(3, false); test_setuid(4, false); + test_setgid(2, false); + test_setgid(3, false); + test_setgid(4, false); // NOTE: this test doesn't clean up users that were created in // /etc/passwd or flush policies that were added to the LSM. -- cgit v1.2.3 From 64b634830c919979de4b18163e15d30df66e64a8 Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Thu, 16 Jun 2022 17:09:55 +0000 Subject: LSM: SafeSetID: add setgroups() testing to selftest Selftest already has support for testing UID and GID transitions. Signed-off-by: Micah Morton --- tools/testing/selftests/safesetid/safesetid-test.c | 69 ++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c index a653c47a4ab5..eb9bf0aee951 100644 --- a/tools/testing/selftests/safesetid/safesetid-test.c +++ b/tools/testing/selftests/safesetid/safesetid-test.c @@ -375,6 +375,71 @@ static void test_setgid(gid_t child_gid, bool expect_success) die("should not reach here\n"); } +static void test_setgroups(gid_t* child_groups, size_t len, bool expect_success) +{ + pid_t cpid, w; + int wstatus; + gid_t groupset[len]; + int i, j; + + cpid = fork(); + if (cpid == -1) { + die("fork\n"); + } + + if (cpid == 0) { /* Code executed by child */ + if (setgroups(len, child_groups) != 0) + exit(EXIT_FAILURE); + if (getgroups(len, groupset) != len) + exit(EXIT_FAILURE); + for (i = 0; i < len; i++) { + for (j = 0; j < len; j++) { + if (child_groups[i] == groupset[j]) + break; + if (j == len - 1) + exit(EXIT_FAILURE); + } + } + exit(EXIT_SUCCESS); + } else { /* Code executed by parent */ + do { + w = waitpid(cpid, &wstatus, WUNTRACED | WCONTINUED); + if (w == -1) { + die("waitpid\n"); + } + + if (WIFEXITED(wstatus)) { + if (WEXITSTATUS(wstatus) == EXIT_SUCCESS) { + if (expect_success) { + return; + } else { + die("unexpected success\n"); + } + } else { + if (expect_success) { + die("unexpected failure\n"); + } else { + return; + } + } + } else if (WIFSIGNALED(wstatus)) { + if (WTERMSIG(wstatus) == 9) { + if (expect_success) + die("killed unexpectedly\n"); + else + return; + } else { + die("unexpected signal: %d\n", wstatus); + } + } else { + die("unexpected status: %d\n", wstatus); + } + } while (!WIFEXITED(wstatus) && !WIFSIGNALED(wstatus)); + } + + die("should not reach here\n"); +} + static void 
ensure_users_exist(void) { @@ -452,6 +517,10 @@ int main(int argc, char **argv) test_setgid(ALLOWED_CHILD2_UGID, true); test_setgid(NO_POLICY_UGID, false); + gid_t allowed_supp_groups[2] = {ALLOWED_CHILD1_UGID, ALLOWED_CHILD2_UGID}; + gid_t disallowed_supp_groups[2] = {ROOT_UGID, NO_POLICY_UGID}; + test_setgroups(allowed_supp_groups, 2, true); + test_setgroups(disallowed_supp_groups, 2, false); if (!test_userns(false)) { die("test_userns worked when it should fail\n"); -- cgit v1.2.3 From 3a2a58c4479a6acd4421db41e1e1ffd804763a5a Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 14 Jul 2022 10:46:11 +0800 Subject: tools: runqslower: Build and use lightweight bootstrap version of bpftool tools/runqslower uses bpftool for vmlinux.h, skeleton, and static linking only, so we can use the lightweight bootstrap version of bpftool to handle these, and it will be faster. Suggested-by: Andrii Nakryiko Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220714024612.944071-3-pulehui@huawei.com --- tools/bpf/runqslower/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index da6de16a3dfb..8b3d87b82b7a 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -4,7 +4,7 @@ include ../../scripts/Makefile.include OUTPUT ?= $(abspath .output)/ BPFTOOL_OUTPUT := $(OUTPUT)bpftool/ -DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)bpftool +DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)bootstrap/bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) LIBBPF_SRC := $(abspath ../../lib/bpf) BPFOBJ_OUTPUT := $(OUTPUT)libbpf/ @@ -86,6 +86,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OU $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) \ DESTDIR=$(BPFOBJ_OUTPUT) prefix= $(abspath $@) install_headers -$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) \ - ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) +$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) bootstrap -- cgit v1.2.3 From 9ff5efdeb0897547cffc275e88d2a55462d9b08e Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Fri, 15 Jul 2022 21:11:22 +0300 Subject: libbpf: perfbuf: Add API to get the ring buffer Add support for writing a custom event reader, by exposing the ring buffer. With the new API perf_buffer__buffer() you will get access to the raw mmap()'ed per-cpu underlying memory of the ring buffer. This region contains both the perf buffer data and header (struct perf_event_mmap_page), which manages the ring buffer state (head/tail positions; when accessing them it's important to take SMP effects into consideration). With this type of low-level access one can implement different types of consumers; here are a few simple examples where this API helps: 1. perf_event_read_simple allocates using malloc; perhaps you want to handle the wrap-around in some other way. 2. Since the perf buffer is per-cpu, the order of the events is not guaranteed, for example: Given 3 events where each event has a timestamp t0 < t1 < t2, and the events are spread on more than 1 CPU, then we can end up with the following state in the ring buf: CPU[0] => [t0, t2] CPU[1] => [t1] When you consume the events from CPU[0], you can tell that t1 is missing (assuming there are no drops, and your event data contains a sequential index). 
So now one can simply do the following: for CPU[0], store the addresses of t0 and t2 in an array (without moving the tail, so the data is not lost), then move on to CPU[1] and store the address of t1 in the same array. You end up with something like: void **arr[] = [&t0, &t1, &t2]; now you can consume it in order and move the tails as you process. 3. Assuming there are multiple CPUs and we want to start draining the messages from them, we can "pick" which one to start with according to the remaining free space in the ring buffer. Signed-off-by: Jon Doron Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220715181122.149224-1-arilou@gmail.com --- tools/lib/bpf/libbpf.c | 16 ++++++++++++++++ tools/lib/bpf/libbpf.h | 16 ++++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 68da1aca406c..77ae83308199 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11734,6 +11734,22 @@ int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx) return cpu_buf->fd; } +int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, size_t *buf_size) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + *buf = cpu_buf->base; + *buf_size = pb->mmap_size; + return 0; +} + /* * Consume data from perf ring buffer corresponding to slot *buf_idx* in * PERF_EVENT_ARRAY BPF map without waiting/polling. If there is no data to diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e4d5353f757b..c51d6e6b3066 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1053,6 +1053,22 @@ LIBBPF_API int perf_buffer__consume(struct perf_buffer *pb); LIBBPF_API int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx); LIBBPF_API size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb); LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx); +/** + * @brief **perf_buffer__buffer()** returns the per-cpu raw mmap()'ed underlying + * memory region of the ring buffer. + * This ring buffer can be used to implement a custom events consumer. + * The ring buffer starts with the *struct perf_event_mmap_page*, which + * holds the ring buffer management fields; when accessing the header + * structure it's important to be SMP aware. + * You can refer to *perf_event_read_simple* for a simple example. 
+ * @param pb the perf buffer structure + * @param buf_idx the buffer index to retrieve + * @param buf (out) gets the base pointer of the mmap()'ed memory + * @param buf_size (out) gets the size of the mmap()'ed region + * @return 0 on success, negative error code for failure + */ +LIBBPF_API int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, + size_t *buf_size); struct bpf_prog_linfo; struct bpf_prog_info; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 94b589ecfeaa..4c4c40b8f935 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -362,4 +362,5 @@ LIBBPF_1.0.0 { libbpf_bpf_link_type_str; libbpf_bpf_map_type_str; libbpf_bpf_prog_type_str; + perf_buffer__buffer; }; -- cgit v1.2.3 From 0ea7b0a454ca1839acddc37c4cf802f0e0d5fb5f Mon Sep 17 00:00:00 2001 From: Jaehee Park Date: Wed, 13 Jul 2022 16:40:49 -0700 Subject: selftests: net: arp_ndisc_untracked_subnets: test for arp_accept and accept_untracked_na ipv4 arp_accept has a new option '2' to create new neighbor entries only if the src ip is in the same subnet as an address configured on the interface that received the garp message. This selftest tests all options in arp_accept. ipv6 has a sysctl endpoint, accept_untracked_na, that defines the behavior for accepting untracked neighbor advertisements. A new option similar to that of arp_accept for learning only from the same subnet is added to accept_untracked_na. This selftest tests this new feature. Signed-off-by: Jaehee Park Suggested-by: Roopa Prabhu Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/arp_ndisc_untracked_subnets.sh | 308 +++++++++++++++++++++ 2 files changed, 309 insertions(+) create mode 100755 tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index db05b3764b77..80628bf8413a 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -38,6 +38,7 @@ TEST_PROGS += srv6_end_dt6_l3vpn_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh +TEST_PROGS += arp_ndisc_untracked_subnets.sh TEST_PROGS += stress_reuseport_listen.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh diff --git a/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh b/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh new file mode 100755 index 000000000000..c899b446acb6 --- /dev/null +++ b/tools/testing/selftests/net/arp_ndisc_untracked_subnets.sh @@ -0,0 +1,308 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# 2 namespaces: one host and one router. Use arping from the host to send a +# garp to the router. Router accepts or ignores based on its arp_accept +# or accept_untracked_na configuration. 
+ +TESTS="arp ndisc" + +ROUTER_NS="ns-router" +ROUTER_NS_V6="ns-router-v6" +ROUTER_INTF="veth-router" +ROUTER_ADDR="10.0.10.1" +ROUTER_ADDR_V6="2001:db8:abcd:0012::1" + +HOST_NS="ns-host" +HOST_NS_V6="ns-host-v6" +HOST_INTF="veth-host" +HOST_ADDR="10.0.10.2" +HOST_ADDR_V6="2001:db8:abcd:0012::2" + +SUBNET_WIDTH=24 +PREFIX_WIDTH_V6=64 + +cleanup() { + ip netns del ${HOST_NS} + ip netns del ${ROUTER_NS} +} + +cleanup_v6() { + ip netns del ${HOST_NS_V6} + ip netns del ${ROUTER_NS_V6} +} + +setup() { + set -e + local arp_accept=$1 + + # Set up two namespaces + ip netns add ${ROUTER_NS} + ip netns add ${HOST_NS} + + # Set up interfaces veth0 and veth1, which are pairs in separate + # namespaces. veth0 is veth-router, veth1 is veth-host. + # first, set up the interface's link to the namespace + # then, set the interface "up" + ip netns exec ${ROUTER_NS} ip link add name ${ROUTER_INTF} \ + type veth peer name ${HOST_INTF} + + ip netns exec ${ROUTER_NS} ip link set dev ${ROUTER_INTF} up + ip netns exec ${ROUTER_NS} ip link set dev ${HOST_INTF} netns ${HOST_NS} + + ip netns exec ${HOST_NS} ip link set dev ${HOST_INTF} up + ip netns exec ${ROUTER_NS} ip addr add ${ROUTER_ADDR}/${SUBNET_WIDTH} \ + dev ${ROUTER_INTF} + + ip netns exec ${HOST_NS} ip addr add ${HOST_ADDR}/${SUBNET_WIDTH} \ + dev ${HOST_INTF} + ip netns exec ${HOST_NS} ip route add default via ${HOST_ADDR} \ + dev ${HOST_INTF} + ip netns exec ${ROUTER_NS} ip route add default via ${ROUTER_ADDR} \ + dev ${ROUTER_INTF} + + ROUTER_CONF=net.ipv4.conf.${ROUTER_INTF} + ip netns exec ${ROUTER_NS} sysctl -w \ + ${ROUTER_CONF}.arp_accept=${arp_accept} >/dev/null 2>&1 + set +e +} + +setup_v6() { + set -e + local accept_untracked_na=$1 + + # Set up two namespaces + ip netns add ${ROUTER_NS_V6} + ip netns add ${HOST_NS_V6} + + # Set up interfaces veth0 and veth1, which are pairs in separate + # namespaces. veth0 is veth-router, veth1 is veth-host. 
+ # first, set up the interface's link to the namespace + # then, set the interface "up" + ip -6 -netns ${ROUTER_NS_V6} link add name ${ROUTER_INTF} \ + type veth peer name ${HOST_INTF} + + ip -6 -netns ${ROUTER_NS_V6} link set dev ${ROUTER_INTF} up + ip -6 -netns ${ROUTER_NS_V6} link set dev ${HOST_INTF} netns \ + ${HOST_NS_V6} + + ip -6 -netns ${HOST_NS_V6} link set dev ${HOST_INTF} up + ip -6 -netns ${ROUTER_NS_V6} addr add \ + ${ROUTER_ADDR_V6}/${PREFIX_WIDTH_V6} dev ${ROUTER_INTF} nodad + + HOST_CONF=net.ipv6.conf.${HOST_INTF} + ip netns exec ${HOST_NS_V6} sysctl -qw ${HOST_CONF}.ndisc_notify=1 + ip netns exec ${HOST_NS_V6} sysctl -qw ${HOST_CONF}.disable_ipv6=0 + ip -6 -netns ${HOST_NS_V6} addr add ${HOST_ADDR_V6}/${PREFIX_WIDTH_V6} \ + dev ${HOST_INTF} + + ROUTER_CONF=net.ipv6.conf.${ROUTER_INTF} + + ip netns exec ${ROUTER_NS_V6} sysctl -w \ + ${ROUTER_CONF}.forwarding=1 >/dev/null 2>&1 + ip netns exec ${ROUTER_NS_V6} sysctl -w \ + ${ROUTER_CONF}.drop_unsolicited_na=0 >/dev/null 2>&1 + ip netns exec ${ROUTER_NS_V6} sysctl -w \ + ${ROUTER_CONF}.accept_untracked_na=${accept_untracked_na} \ + >/dev/null 2>&1 + set +e +} + +verify_arp() { + local arp_accept=$1 + local same_subnet=$2 + + neigh_show_output=$(ip netns exec ${ROUTER_NS} ip neigh get \ + ${HOST_ADDR} dev ${ROUTER_INTF} 2>/dev/null) + + if [ ${arp_accept} -eq 1 ]; then + # Neighbor entries expected + [[ ${neigh_show_output} ]] + elif [ ${arp_accept} -eq 2 ]; then + if [ ${same_subnet} -eq 1 ]; then + # Neighbor entries expected + [[ ${neigh_show_output} ]] + else + [[ -z "${neigh_show_output}" ]] + fi + else + [[ -z "${neigh_show_output}" ]] + fi + } + +arp_test_gratuitous() { + set -e + local arp_accept=$1 + local same_subnet=$2 + + if [ ${arp_accept} -eq 2 ]; then + test_msg=("test_arp: " + "accept_arp=$1 " + "same_subnet=$2") + if [ ${same_subnet} -eq 0 ]; then + HOST_ADDR=10.0.11.3 + else + HOST_ADDR=10.0.10.3 + fi + else + test_msg=("test_arp: " + "accept_arp=$1") + fi + # Supply arp_accept option to set up which sets it in sysctl + setup ${arp_accept} + ip netns exec ${HOST_NS} arping -A -U ${HOST_ADDR} -c1 2>&1 >/dev/null + + if verify_arp $1 $2; then + printf " TEST: %-60s [ OK ]\n" "${test_msg[*]}" + else + printf " TEST: %-60s [FAIL]\n" "${test_msg[*]}" + fi + cleanup + set +e +} + +arp_test_gratuitous_combinations() { + arp_test_gratuitous 0 + arp_test_gratuitous 1 + arp_test_gratuitous 2 0 # Second entry indicates subnet or not + arp_test_gratuitous 2 1 +} + +cleanup_tcpdump() { + set -e + [[ ! -z ${tcpdump_stdout} ]] && rm -f ${tcpdump_stdout} + [[ ! 
-z ${tcpdump_stderr} ]] && rm -f ${tcpdump_stderr} + tcpdump_stdout= + tcpdump_stderr= + set +e +} + +start_tcpdump() { + set -e + tcpdump_stdout=`mktemp` + tcpdump_stderr=`mktemp` + ip netns exec ${ROUTER_NS_V6} timeout 15s \ + tcpdump --immediate-mode -tpni ${ROUTER_INTF} -c 1 \ + "icmp6 && icmp6[0] == 136 && src ${HOST_ADDR_V6}" \ + > ${tcpdump_stdout} 2> /dev/null + set +e +} + +verify_ndisc() { + local accept_untracked_na=$1 + local same_subnet=$2 + + neigh_show_output=$(ip -6 -netns ${ROUTER_NS_V6} neigh show \ + to ${HOST_ADDR_V6} dev ${ROUTER_INTF} nud stale) + + if [ ${accept_untracked_na} -eq 1 ]; then + # Neighbour entry expected to be present + [[ ${neigh_show_output} ]] + elif [ ${accept_untracked_na} -eq 2 ]; then + if [ ${same_subnet} -eq 1 ]; then + [[ ${neigh_show_output} ]] + else + [[ -z "${neigh_show_output}" ]] + fi + else + # Neighbour entry expected to be absent for all other cases + [[ -z "${neigh_show_output}" ]] + fi +} + +ndisc_test_untracked_advertisements() { + set -e + test_msg=("test_ndisc: " + "accept_untracked_na=$1") + + local accept_untracked_na=$1 + local same_subnet=$2 + if [ ${accept_untracked_na} -eq 2 ]; then + test_msg=("test_ndisc: " + "accept_untracked_na=$1 " + "same_subnet=$2") + if [ ${same_subnet} -eq 0 ]; then + # Not same subnet + HOST_ADDR_V6=2000:db8:abcd:0013::4 + else + HOST_ADDR_V6=2001:db8:abcd:0012::3 + fi + fi + setup_v6 $1 $2 + start_tcpdump + + if verify_ndisc $1 $2; then + printf " TEST: %-60s [ OK ]\n" "${test_msg[*]}" + else + printf " TEST: %-60s [FAIL]\n" "${test_msg[*]}" + fi + + cleanup_tcpdump + cleanup_v6 + set +e +} + +ndisc_test_untracked_combinations() { + ndisc_test_untracked_advertisements 0 + ndisc_test_untracked_advertisements 1 + ndisc_test_untracked_advertisements 2 0 + ndisc_test_untracked_advertisements 2 1 +} + +################################################################################ +# usage + +usage() +{ + cat <<EOF + -t <test> Test(s) to run (default: all) + (options: $TESTS) +EOF +} + +################################################################################ +# main + +while getopts ":t:h" opt; do + case $opt in + t) TESTS=$OPTARG;; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip; +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v tcpdump)" ]; then + echo "SKIP: Could not run test without tcpdump tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v arping)" ]; then + echo "SKIP: Could not run test without arping tool" + exit $ksft_skip +fi + +# start clean +cleanup &> /dev/null +cleanup_v6 &> /dev/null + +for t in $TESTS +do + case $t in + arp_test_gratuitous_combinations|arp) arp_test_gratuitous_combinations;; + ndisc_test_untracked_combinations|ndisc) \ + ndisc_test_untracked_combinations;; + help) echo "Test names: $TESTS"; exit 0;; +esac +done -- cgit v1.2.3 From eee51fe38e372b89317f3950d2dc3e3ea7bace12 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: 1b870fa5573e260b ("kvm: stats: tell userspace which values are boolean") That just rebuilds perf, as these patches don't add any new KVM ioctl to be harvested for the 'perf trace' ioctl syscall argument beautifiers. This is also by now used by tools/testing/selftests/kvm/, a simple test build succeeded. 
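For context on what the new unit means to a consumer of the binary stats interface, decoding might look like the sketch below. This is illustrative only, not part of the patch; KVM_STATS_UNIT_MASK and KVM_STATS_UNIT_NONE come from the same uapi header but sit outside the hunk shown here.

	#include <linux/kvm.h>

	/* Sketch: map a stats descriptor's unit bits to a printable name. */
	static const char *kvm_stats_unit_str(__u32 flags)
	{
		switch (flags & KVM_STATS_UNIT_MASK) {
		case KVM_STATS_UNIT_BOOLEAN:	/* the unit this sync adds */
			return "boolean";
		case KVM_STATS_UNIT_CYCLES:
			return "cycles";
		case KVM_STATS_UNIT_SECONDS:
			return "seconds";
		case KVM_STATS_UNIT_BYTES:
			return "bytes";
		default:
			return "none";
		}
	}
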
This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Link: http://lore.kernel.org/lkml/YtQLDvQrBhJNl3n5@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5088bd9f1922..811897dadcae 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -2083,6 +2083,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 -- cgit v1.2.3 From f098addbdb44c8a565367f5162f3ab170ed9404a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: tools headers cpufeatures: Sync with the kernel sources To pick the changes from: f43b9876e857c739 ("x86/retbleed: Add fine grained Kconfig knobs") a149180fbcf336e9 ("x86: Add magic AMD return-thunk") 15e67227c49a5783 ("x86: Undo return-thunk damage") 369ae6ffc41a3c11 ("x86/retpoline: Cleanup some #ifdefery") 4ad3278df6fe2b08 x86/speculation: Disable RRSBA behavior 26aae8ccbc197223 x86/cpu/amd: Enumerate BTC_NO 9756bba28470722d x86/speculation: Fill RSB on vmexit for IBRS 3ebc170068885b6f x86/bugs: Add retbleed=ibpb 2dbb887e875b1de3 x86/entry: Add kernel IBRS implementation 6b80b59b35557065 x86/bugs: Report AMD retbleed vulnerability a149180fbcf336e9 x86: Add magic AMD return-thunk 15e67227c49a5783 x86: Undo return-thunk damage a883d624aed463c8 x86/cpufeatures: Move RETPOLINE flags to word 11 51802186158c74a0 x86/speculation/mmio: Enumerate Processor MMIO Stale Data bug This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Warning: Kernel ABI header at 'tools/arch/x86/include/asm/disabled-features.h' differs from latest version at 'arch/x86/include/asm/disabled-features.h' diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h Cc: Adrian Hunter Cc: Borislav Petkov Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra --- tools/arch/x86/include/asm/cpufeatures.h | 12 ++++++++++-- tools/arch/x86/include/asm/disabled-features.h | 21 ++++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 03acc823838a..00f5227c8459 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -203,8 +203,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table 
Isolation enabled */ -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ @@ -296,6 +296,12 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -316,6 +322,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ @@ -447,5 +454,6 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 36369e76cc63..33d2cd04d254 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -50,6 +50,25 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif +#ifdef CONFIG_RETPOLINE +# define DISABLE_RETPOLINE 0 +#else +# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ + (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) +#endif + +#ifdef CONFIG_RETHUNK +# define DISABLE_RETHUNK 0 +#else +# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) +#endif + +#ifdef CONFIG_CPU_UNRET_ENTRY +# define DISABLE_UNRET 0 +#else +# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -82,7 +101,7 @@ #define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 0 +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) #define DISABLED_MASK12 0 #define 
DISABLED_MASK13 0 #define DISABLED_MASK14 0 -- cgit v1.2.3 From 91d248c3b903b46a58cbc7e8d38d684d3e4007c2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:32:18 -0300 Subject: tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: 4ad3278df6fe2b08 ("x86/speculation: Disable RRSBA behavior") d7caac991feeef1b ("x86/cpu/amd: Add Spectral Chicken") These cause no changes to tooling: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after $ Just silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Adrian Hunter Cc: Borislav Petkov Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Pawan Gupta Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/YtQTm9wsB3hxQWvy@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 2eab6a3a8a8c..cc615be27a54 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -95,6 +95,7 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */ #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO BIT(4) /* * Not susceptible to Speculative Store Bypass @@ -576,6 +577,9 @@ /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 -- cgit v1.2.3 From 498c7a54f169b2699104d3060604d840424f15d2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 13 Jul 2022 15:34:58 +0300 Subject: perf tests: Stop Convert perf time to TSC test opening events twice Do not call evlist__open() twice. 
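Condensed, the open-once flow the fix moves to looks like this (a hedged sketch using perf-internal names this test already relies on — evlist__open(), pr_debug() and the TEST_* result codes):

	/* Sketch: open the evlist exactly once; -ENOENT means the event is
	 * not supported on this machine, which is a skip rather than a
	 * failure. */
	static int open_events_once(struct evlist *evlist)
	{
		int ret = evlist__open(evlist);

		if (ret == -ENOENT)
			return TEST_SKIP;
		if (ret < 0) {
			pr_debug("evlist__open() failed\n");
			return TEST_FAIL;
		}
		return TEST_OK;		/* opened once, no second call */
	}
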
Fixes: 5bb017d4b97a0f13 ("perf test: Fix error message for test case 71 on s390, where it is not supported") Reviewed-by: Kan Liang Signed-off-by: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Thomas Richter Link: https://lore.kernel.org/r/20220713123459.24145-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-time-to-tsc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index 4ad0dfbc8b21..8d6d60173693 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -123,11 +123,14 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su evsel->core.attr.enable_on_exec = 0; } - if (evlist__open(evlist) == -ENOENT) { - err = TEST_SKIP; + ret = evlist__open(evlist); + if (ret < 0) { + if (ret == -ENOENT) + err = TEST_SKIP; + else + pr_debug("evlist__open() failed\n"); goto out_err; } - CHECK__(evlist__open(evlist)); CHECK__(evlist__mmap(evlist, UINT_MAX)); -- cgit v1.2.3 From deb44a6249f696106645c63c0603eab08a6122af Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 13 Jul 2022 15:34:59 +0300 Subject: perf tests: Fix Convert perf time to TSC test for hybrid The test does not always correctly determine the number of events for hybrids, nor allow for more than 1 evsel when parsing. Fix by iterating the events actually created and getting the correct evsel for the events processed. Fixes: d9da6f70eb235110 ("perf tests: Support 'Convert perf time to TSC' test for hybrid") Reviewed-by: Kan Liang Signed-off-by: Adrian Hunter Cc: Ian Rogers Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Thomas Richter Link: https://lore.kernel.org/r/20220713123459.24145-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-time-to-tsc.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index 8d6d60173693..7c7d20fc503a 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -20,8 +20,6 @@ #include "tsc.h" #include "mmap.h" #include "tests.h" -#include "pmu.h" -#include "pmu-hybrid.h" /* * Except x86_64/i386 and Arm64, other archs don't support TSC in perf. Just @@ -106,18 +104,8 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su evlist__config(evlist, &opts, NULL); - evsel = evlist__first(evlist); - - evsel->core.attr.comm = 1; - evsel->core.attr.disabled = 1; - evsel->core.attr.enable_on_exec = 0; - - /* - * For hybrid "cycles:u", it creates two events. - * Init the second evsel here. 
- */ - if (perf_pmu__has_hybrid() && perf_pmu__hybrid_mounted("cpu_atom")) { - evsel = evsel__next(evsel); + /* For hybrid "cycles:u", it creates two events */ + evlist__for_each_entry(evlist, evsel) { evsel->core.attr.comm = 1; evsel->core.attr.disabled = 1; evsel->core.attr.enable_on_exec = 0; @@ -170,10 +158,12 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su goto next_event; if (strcmp(event->comm.comm, comm1) == 0) { + CHECK_NOT_NULL__(evsel = evlist__event2evsel(evlist, event)); CHECK__(evsel__parse_sample(evsel, event, &sample)); comm1_time = sample.time; } if (strcmp(event->comm.comm, comm2) == 0) { + CHECK_NOT_NULL__(evsel = evlist__event2evsel(evlist, event)); CHECK__(evsel__parse_sample(evsel, event, &sample)); comm2_time = sample.time; } -- cgit v1.2.3 From 4b335e1e0d6f8fa91dac615a44b123c9f26e93d3 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 7 Jul 2022 14:39:00 +0530 Subject: perf trace: Fix SIGSEGV when processing syscall args On powerpc, 'perf trace' is crashing with a SIGSEGV when trying to process a perf.data file created with 'perf trace record -p': #0 0x00000001225b8988 in syscall_arg__scnprintf_augmented_string at builtin-trace.c:1492 #1 syscall_arg__scnprintf_filename at builtin-trace.c:1492 #2 syscall_arg__scnprintf_filename at builtin-trace.c:1486 #3 0x00000001225bdd9c in syscall_arg_fmt__scnprintf_val at builtin-trace.c:1973 #4 syscall__scnprintf_args at builtin-trace.c:2041 #5 0x00000001225bff04 in trace__sys_enter at builtin-trace.c:2319 That points to the below code in tools/perf/builtin-trace.c: /* * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible * arguments, even if the syscall being handled, say "openat", uses only 4 arguments * this breaks syscall__augmented_args() check for augmented args, as we calculate * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file, * so when handling, say the openat syscall, we end up getting 6 args for the * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly * thinking that the extra 2 u64 args are the augmented filename, so just check * here and avoid using augmented syscalls when the evsel is the raw_syscalls one. */ if (evsel != trace->syscalls.events.sys_enter) augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); As the comment points out, we should not be trying to augment the args for raw_syscalls. However, when processing a perf.data file, we are not initializing those properly. Fix the same. Reported-by: Claudio Carvalho Signed-off-by: Naveen N. 
Rao Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20220707090900.572584-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 897fc504918b..f075cf37a65e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4280,6 +4280,7 @@ static int trace__replay(struct trace *trace) goto out; evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter"); + trace->syscalls.events.sys_enter = evsel; /* older kernels have syscalls tp versus raw_syscalls */ if (evsel == NULL) evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter"); @@ -4292,6 +4293,7 @@ static int trace__replay(struct trace *trace) } evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit"); + trace->syscalls.events.sys_exit = evsel; if (evsel == NULL) evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit"); if (evsel && -- cgit v1.2.3 From 30f6f8614a1d1f8ccae6642eaad629997aa33b3b Mon Sep 17 00:00:00 2001 From: Kalpana Shetty Date: Tue, 31 May 2022 15:55:56 +0530 Subject: selftests/vm: add protection_keys tests to run_vmtests Adding the "protection_keys" tests to "run_vmtests.sh" helps run all VM-related tests from a single shell script. [kalpana.shetty@amd.com: Shuah Khan's review comments incorporated, added -x executable check] Link: https://lkml.kernel.org/r/20220617202931.357-1-kalpana.shetty@amd.com Link: https://lkml.kernel.org/r/20220610090704.296-1-kalpana.shetty@amd.com Link: https://lkml.kernel.org/r/20220531102556.388-1-kalpana.shetty@amd.com Signed-off-by: Kalpana Shetty Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/run_vmtests.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 27c01c35c7a9..2af563a9652e 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -179,4 +179,15 @@ run_test ./ksm_tests -N -m 1 # KSM test with 2 NUMA nodes and merge_across_nodes = 0 run_test ./ksm_tests -N -m 0 +# protection_keys tests +if [ -x ./protection_keys_32 ] +then + run_test ./protection_keys_32 +fi + +if [ -x ./protection_keys_64 ] +then + run_test ./protection_keys_64 +fi + exit $exitcode -- cgit v1.2.3 From f70dab3c015153ad4837f305e6d65cfaff573dac Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:18 -0500 Subject: tools: update hmm-test to support device coherent type Test cases such as migrate_fault and migrate_multiple were modified to explicitly migrate from device to sys memory without the need of page faults, when using device coherent type. The snapshot test case was updated to read the memory device type first and, based on that, get the proper returned results. A migrate_ping_pong test case was added to test explicit migration from device to sys memory for both private and coherent zone types. Helpers to migrate from device to sys memory and vice versa were also added. 
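The round trip those helpers enable looks roughly like this (a sketch built from the selftest names in the diff below; error checking elided):

	/* Sketch: ping-pong a buffer between system and device memory.  For
	 * COHERENT zones the second step must be explicit, since coherent
	 * device memory stays CPU-accessible and never faults back. */
	static void migrate_ping_pong(int fd, struct hmm_buffer *buffer,
				      unsigned long npages)
	{
		/* system memory -> device memory */
		hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
		/* device memory -> system memory */
		hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
	}
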
Link: https://lkml.kernel.org/r/20220715150521.18165-12-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 121 +++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 203323967b50..4b547188ec40 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -46,6 +46,13 @@ struct hmm_buffer { uint64_t faults; }; +enum { + HMM_PRIVATE_DEVICE_ONE, + HMM_PRIVATE_DEVICE_TWO, + HMM_COHERENCE_DEVICE_ONE, + HMM_COHERENCE_DEVICE_TWO, +}; + #define TWOMEG (1 << 21) #define HMM_BUFFER_SIZE (1024 << 12) #define HMM_PATH_MAX 64 @@ -60,6 +67,21 @@ FIXTURE(hmm) unsigned int page_shift; }; +FIXTURE_VARIANT(hmm) +{ + int device_number; +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_private) +{ + .device_number = HMM_PRIVATE_DEVICE_ONE, +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) +{ + .device_number = HMM_COHERENCE_DEVICE_ONE, +}; + FIXTURE(hmm2) { int fd0; @@ -68,6 +90,24 @@ FIXTURE(hmm2) unsigned int page_shift; }; +FIXTURE_VARIANT(hmm2) +{ + int device_number0; + int device_number1; +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) +{ + .device_number0 = HMM_PRIVATE_DEVICE_ONE, + .device_number1 = HMM_PRIVATE_DEVICE_TWO, +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) +{ + .device_number0 = HMM_COHERENCE_DEVICE_ONE, + .device_number1 = HMM_COHERENCE_DEVICE_TWO, +}; + static int hmm_open(int unit) { char pathname[HMM_PATH_MAX]; @@ -81,12 +121,19 @@ static int hmm_open(int unit) return fd; } +static bool hmm_is_coherent_type(int dev_num) +{ + return (dev_num >= HMM_COHERENCE_DEVICE_ONE); +} + FIXTURE_SETUP(hmm) { self->page_size = sysconf(_SC_PAGE_SIZE); self->page_shift = ffs(self->page_size) - 1; - self->fd = hmm_open(0); + self->fd = hmm_open(variant->device_number); + if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd, 0); } @@ -95,9 +142,11 @@ FIXTURE_SETUP(hmm2) self->page_size = sysconf(_SC_PAGE_SIZE); self->page_shift = ffs(self->page_size) - 1; - self->fd0 = hmm_open(0); + self->fd0 = hmm_open(variant->device_number0); + if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd0, 0); - self->fd1 = hmm_open(1); + self->fd1 = hmm_open(variant->device_number1); ASSERT_GE(self->fd1, 0); } @@ -211,6 +260,20 @@ static void hmm_nanosleep(unsigned int n) nanosleep(&t, NULL); } +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + /* * Simple NULL test of device open/close. */ @@ -875,7 +938,7 @@ TEST_F(hmm, migrate) ptr[i] = i; /* Migrate memory to device. 
*/ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -923,7 +986,7 @@ TEST_F(hmm, migrate_fault) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -936,7 +999,7 @@ TEST_F(hmm, migrate_fault) ASSERT_EQ(ptr[i], i); /* Migrate memory to the device again. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -976,7 +1039,7 @@ TEST_F(hmm, migrate_shared) ASSERT_NE(buffer->ptr, MAP_FAILED); /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, -ENOENT); hmm_buffer_free(buffer); @@ -1015,7 +1078,7 @@ TEST_F(hmm2, migrate_mixed) p = buffer->ptr; /* Migrating a protected area should be an error. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); ASSERT_EQ(ret, -EINVAL); /* Punch a hole after the first page address. */ @@ -1023,7 +1086,7 @@ TEST_F(hmm2, migrate_mixed) ASSERT_EQ(ret, 0); /* We expect an error if the vma doesn't cover the range. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); ASSERT_EQ(ret, -EINVAL); /* Page 2 will be a read-only zero page. */ @@ -1055,13 +1118,13 @@ TEST_F(hmm2, migrate_mixed) /* Now try to migrate pages 2-5 to device 1. */ buffer->ptr = p + 2 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 4); /* Page 5 won't be migrated to device 0 because it's on device 1. */ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, -ENOENT); buffer->ptr = p; @@ -1070,8 +1133,12 @@ TEST_F(hmm2, migrate_mixed) } /* - * Migrate anonymous memory to device private memory and fault it back to system - * memory multiple times. + * Migrate anonymous memory to device memory and back to system memory + * multiple times. In case of private zone configuration, this is done + * through fault pages accessed by CPU. In case of coherent zone configuration, + * the pages from the device should be explicitly migrated back to system memory. + * The reason is Coherent device zone has coherent access by CPU, therefore + * it will not generate any page fault. */ TEST_F(hmm, migrate_multiple) { @@ -1107,8 +1174,7 @@ TEST_F(hmm, migrate_multiple) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, - npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -1116,7 +1182,13 @@ TEST_F(hmm, migrate_multiple) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - /* Fault pages back to system memory and check them. */ + /* Migrate back to system memory and check them. 
*/ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); @@ -1354,13 +1426,13 @@ TEST_F(hmm2, snapshot) /* Page 5 will be migrated to device 0. */ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); /* Page 6 will be migrated to device 1. */ buffer->ptr = p + 6 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); @@ -1377,9 +1449,16 @@ TEST_F(hmm2, snapshot) ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } hmm_buffer_free(buffer); } -- cgit v1.2.3 From e6474b1aeb2a0bb01c925f79cafa829b4b5e05c2 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:19 -0500 Subject: tools: update test_hmm script to support SP config Add two more parameters to set spm_addr_dev0 & spm_addr_dev1 addresses. These two parameters configure the start SP addresses for each device in test_hmm driver. Consequently, this configures zone device type as coherent. Link: https://lkml.kernel.org/r/20220715150521.18165-13-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/test_hmm.sh | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 0647b525a625..539c9371e592 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -40,11 +40,26 @@ check_test_requirements() load_driver() { - modprobe $DRIVER > /dev/null 2>&1 + if [ $# -eq 0 ]; then + modprobe $DRIVER > /dev/null 2>&1 + else + if [ $# -eq 2 ]; then + modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 + > /dev/null 2>&1 + else + echo "Missing module parameters. Make sure pass"\ + "spm_addr_dev0 and spm_addr_dev1" + usage + fi + fi if [ $? == 0 ]; then major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) mknod /dev/hmm_dmirror0 c $major 0 mknod /dev/hmm_dmirror1 c $major 1 + if [ $# -eq 2 ]; then + mknod /dev/hmm_dmirror2 c $major 2 + mknod /dev/hmm_dmirror3 c $major 3 + fi fi } @@ -58,7 +73,7 @@ run_smoke() { echo "Running smoke test. Note, this test provides basic coverage." 
- load_driver + load_driver $1 $2 $(dirname "${BASH_SOURCE[0]}")/hmm-tests unload_driver } @@ -75,6 +90,9 @@ usage() echo "# Smoke testing" echo "./${TEST_NAME}.sh smoke" echo + echo "# Smoke testing with SPM enabled" + echo "./${TEST_NAME}.sh smoke " + echo exit 0 } @@ -84,7 +102,7 @@ function run_test() usage else if [ "$1" = "smoke" ]; then - run_smoke + run_smoke $2 $3 else usage fi -- cgit v1.2.3 From 9e09b705fdb8f3892ec1efc2382f64adabc14c50 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:20 -0500 Subject: tools: add hmm gup tests for device coherent type The intention is to test hmm device coherent type under different get user pages paths. Also, test gup with FOLL_LONGTERM flag set in device coherent pages. These pages should get migrated back to system memory. Link: https://lkml.kernel.org/r/20220715150521.18165-14-alex.sierra@amd.com Signed-off-by: Alex Sierra Reviewed-by: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 110 +++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 4b547188ec40..bb38b9777610 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -36,6 +36,7 @@ * in the usual include/uapi/... directory. */ #include "../../../../lib/test_hmm_uapi.h" +#include "../../../../mm/gup_test.h" struct hmm_buffer { void *ptr; @@ -59,6 +60,9 @@ enum { #define NTIMES 10 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +/* Just the flags we need, copied from mm.h: */ +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ FIXTURE(hmm) { @@ -1764,4 +1768,110 @@ TEST_F(hmm, exclusive_cow) hmm_buffer_free(buffer); } +static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, + int npages, int size, int flags) +{ + struct gup_test gup = { + .nr_pages_per_call = npages, + .addr = addr, + .gup_flags = FOLL_WRITE | flags, + .size = size, + }; + + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl on error\n"); + return errno; + } + + return 0; +} + +/* + * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. + * This should trigger a migration back to system memory for both, private + * and coherent type pages. + * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added + * to your configuration before you run it. + */ +TEST_F(hmm, hmm_gup_test) +{ + struct hmm_buffer *buffer; + int gup_fd; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) + SKIP(return, "Skipping test, could not find gup_test driver"); + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. 
*/ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr, + GUP_BASIC_TEST, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 1 * self->page_size, + GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 2 * self->page_size, + PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 3 * self->page_size, + PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + if (hmm_is_coherent_type(variant->device_number)) { + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]); + } else { + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]); + } + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]); + /* + * Check again the content on the pages. Make sure there's no + * corrupted data. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + close(gup_fd); + hmm_buffer_free(buffer); +} TEST_HARNESS_MAIN -- cgit v1.2.3 From 96c0657383fefdee04d4053bcc266bfa620d073d Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:21 -0500 Subject: tools: add selftests to hmm for COW in device memory The objective is to test device migration mechanism in pages marked as COW, for private and coherent device type. In case of writing to COW private page(s), a page fault will migrate pages back to system memory first. Then, these pages will be duplicated. In case of COW device coherent type, pages are duplicated directly from device memory. Link: https://lkml.kernel.org/r/20220715150521.18165-15-alex.sierra@amd.com Signed-off-by: Alex Sierra Acked-by: Felix Kuehling Cc: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 80 ++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index bb38b9777610..716b62c05e3d 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1874,4 +1874,84 @@ TEST_F(hmm, hmm_gup_test) close(gup_fd); hmm_buffer_free(buffer); } + +/* + * Test copy-on-write in device pages. + * In case of writing to COW private page(s), a page fault will migrate pages + * back to system memory first. Then, these pages will be duplicated. In case + * of COW device coherent type, pages are duplicated directly from device + * memory. 
+ */ +TEST_F(hmm, hmm_cow_in_device) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + pid_t pid; + int status; + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (!pid) { + /* Child process waitd for SIGTERM from the parent. */ + while (1) { + } + perror("Should not reach this\n"); + exit(0); + } + /* Parent process writes to COW pages(s) and gets a + * new copy in system. In case of device private pages, + * this write causes a migration to system mem first. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Terminate child and wait */ + EXPECT_EQ(0, kill(pid, SIGTERM)); + EXPECT_EQ(pid, waitpid(pid, &status, 0)); + EXPECT_NE(0, WIFSIGNALED(status)); + EXPECT_EQ(SIGTERM, WTERMSIG(status)); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + for (i = 0; i < npages; i++) + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); + + hmm_buffer_free(buffer); +} TEST_HARNESS_MAIN -- cgit v1.2.3 From 3adb2d87238dea5e05bab747238bb47306b9cb56 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 12 Jul 2022 17:51:45 +0300 Subject: proc: fix test for "vsyscall=xonly" boot option Booting with vsyscall=xonly results in the following vsyscall VMA: ffffffffff600000-ffffffffff601000 --xp ... [vsyscall] Test does read from fixed vsyscall address to determine if kernel supports vsyscall page but it doesn't work because, well, vsyscall page is execute only. Fix test by trying to execute from the first byte of the page which contains gettimeofday() stub. This should work because vsyscall entry points have stable addresses by design. Alexey, avoiding parsing .config, /proc/config.gz and /proc/cmdline at all costs. 
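Stripped of the test scaffolding, the execute-probe described above amounts to forking a child that calls the fixed gettimeofday() entry and classifying how the child exits. A minimal x86-64 sketch (vsyscall_page_executable() is a hypothetical name; the real test in the diff below additionally suppresses core dumps, installs a SIGSEGV handler, and repeats the probe with a plain load to tell r-xp from --xp):

#include <stddef.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Returns 1 if the vsyscall page exists and is executable. */
static int vsyscall_page_executable(void)
{
        int wstatus;
        pid_t pid = fork();

        if (pid == 0) {
                /* gettimeofday(NULL, NULL) via the fixed vsyscall entry;
                 * the entry points have stable addresses by design. */
                asm volatile (
                        "call %P0"
                        :
                        : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL)
                        : "rax", "rcx", "r11"
                );
                _exit(0);       /* reached only if the call executed */
        }
        waitpid(pid, &wstatus, 0);
        /* A child killed by SIGSEGV means vsyscall=none. */
        return WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0;
}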
Link: https://lkml.kernel.org/r/Ys2KgeiEMboU8Ytu@localhost.localdomain Signed-off-by: Alexey Dobriyan Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-pid-vm.c | 75 +++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index 28604c9f805c..e5962f4794f5 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -211,10 +211,19 @@ static int make_exe(const uint8_t *payload, size_t len) } #endif -static bool g_vsyscall = false; +/* + * 0: vsyscall VMA doesn't exist vsyscall=none + * 1: vsyscall VMA is r-xp vsyscall=emulate + * 2: vsyscall VMA is --xp vsyscall=xonly + */ +static int g_vsyscall; +static const char *str_vsyscall; -static const char str_vsyscall[] = +static const char str_vsyscall_0[] = ""; +static const char str_vsyscall_1[] = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; +static const char str_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) @@ -223,13 +232,47 @@ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) } /* - * vsyscall page can't be unmapped, probe it with memory load. + * vsyscall page can't be unmapped, probe it directly. */ static void vsyscall(void) { pid_t pid; int wstatus; + pid = fork(); + if (pid < 0) { + fprintf(stderr, "fork, errno %d\n", errno); + exit(1); + } + if (pid == 0) { + struct rlimit rlim = {0, 0}; + (void)setrlimit(RLIMIT_CORE, &rlim); + + /* Hide "segfault at ffffffffff600000" messages. */ + struct sigaction act; + memset(&act, 0, sizeof(struct sigaction)); + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV; + (void)sigaction(SIGSEGV, &act, NULL); + + /* gettimeofday(NULL, NULL); */ + asm volatile ( + "call %P0" + : + : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) + : "rax", "rcx", "r11" + ); + exit(0); + } + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { + /* vsyscall page exists and is executable. */ + } else { + /* vsyscall page doesn't exist. */ + g_vsyscall = 0; + return; + } + pid = fork(); if (pid < 0) { fprintf(stderr, "fork, errno %d\n", errno); @@ -251,8 +294,13 @@ static void vsyscall(void) } waitpid(pid, &wstatus, 0); if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - g_vsyscall = true; + /* vsyscall page is readable and executable. */ + g_vsyscall = 1; + return; } + + /* vsyscall page is executable but unreadable. */ + g_vsyscall = 2; } int main(void) @@ -261,6 +309,19 @@ int main(void) int exec_fd; vsyscall(); + switch (g_vsyscall) { + case 0: + str_vsyscall = str_vsyscall_0; + break; + case 1: + str_vsyscall = str_vsyscall_1; + break; + case 2: + str_vsyscall = str_vsyscall_2; + break; + default: + abort(); + } atexit(ate); @@ -314,7 +375,7 @@ int main(void) /* Test /proc/$PID/maps */ { - const size_t len = strlen(buf0) + (g_vsyscall ? 
strlen(str_vsyscall) : 0); + const size_t len = strlen(buf0) + strlen(str_vsyscall); char buf[256]; ssize_t rv; int fd; @@ -327,7 +388,7 @@ int main(void) rv = read(fd, buf, sizeof(buf)); assert(rv == len); assert(memcmp(buf, buf0, strlen(buf0)) == 0); - if (g_vsyscall) { + if (g_vsyscall > 0) { assert(memcmp(buf + strlen(buf0), str_vsyscall, strlen(str_vsyscall)) == 0); } } @@ -374,7 +435,7 @@ int main(void) assert(memmem(buf, rv, S[i], strlen(S[i]))); } - if (g_vsyscall) { + if (g_vsyscall > 0) { assert(memmem(buf, rv, str_vsyscall, strlen(str_vsyscall))); } } -- cgit v1.2.3 From 9592eef7c16ec5fb9f36c4d9abe8eeffc2e1d2f3 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 5 Jul 2022 20:48:41 +0200 Subject: random: remove CONFIG_ARCH_RANDOM When RDRAND was introduced, there was much discussion on whether it should be trusted and how the kernel should handle that. Initially, two mechanisms cropped up, CONFIG_ARCH_RANDOM, a compile time switch, and "nordrand", a boot-time switch. Later the thinking evolved. With a properly designed RNG, using RDRAND values alone won't harm anything, even if the outputs are malicious. Rather, the issue is whether those values are being *trusted* to be good or not. And so a new set of options was introduced as the real ones that people use -- CONFIG_RANDOM_TRUST_CPU and "random.trust_cpu". With these options, RDRAND is used, but it's not always credited. So in the worst case, it does nothing, and in the best case, maybe it helps. Along the way, CONFIG_ARCH_RANDOM's meaning got sort of pulled into the center and became something certain platforms force-select. The old options don't really help with much, and it's a bit odd to have special handling for these instructions when the kernel can deal fine with the existence or untrusted existence or broken existence or non-existence of that CPU capability. Simplify the situation by removing CONFIG_ARCH_RANDOM and using the ordinary asm-generic fallback pattern instead, keeping the two options that are actually used. It leaves "nordrand" in place for now, as the removal of that will take a different route. Acked-by: Michael Ellerman Acked-by: Catalin Marinas Acked-by: Borislav Petkov Acked-by: Heiko Carstens Acked-by: Greg Kroah-Hartman Signed-off-by: Jason A.
Donenfeld --- arch/arm/include/asm/archrandom.h | 2 ++ arch/arm64/Kconfig | 8 ------- arch/arm64/include/asm/archrandom.h | 10 --------- arch/arm64/kernel/cpufeature.c | 2 -- arch/powerpc/Kconfig | 3 --- arch/powerpc/include/asm/archrandom.h | 3 --- arch/powerpc/include/asm/machdep.h | 2 -- arch/powerpc/platforms/microwatt/Kconfig | 1 - arch/powerpc/platforms/powernv/Kconfig | 1 - arch/powerpc/platforms/pseries/Kconfig | 1 - arch/s390/Kconfig | 15 ------------- arch/s390/configs/zfcpdump_defconfig | 1 - arch/s390/crypto/Makefile | 2 +- arch/s390/include/asm/archrandom.h | 3 --- arch/s390/kernel/setup.c | 2 -- arch/x86/Kconfig | 9 -------- arch/x86/include/asm/archrandom.h | 14 ++++-------- arch/x86/kernel/cpu/rdrand.c | 2 -- drivers/char/Kconfig | 1 - drivers/char/hw_random/s390-trng.c | 9 -------- include/asm-generic/Kbuild | 1 + include/asm-generic/archrandom.h | 25 ++++++++++++++++++++++ include/linux/random.h | 9 +------- .../testing/selftests/wireguard/qemu/kernel.config | 1 - 24 files changed, 34 insertions(+), 93 deletions(-) create mode 100644 include/asm-generic/archrandom.h (limited to 'tools') diff --git a/arch/arm/include/asm/archrandom.h b/arch/arm/include/asm/archrandom.h index a8e84ca5c2ee..cc4714eb1a75 100644 --- a/arch/arm/include/asm/archrandom.h +++ b/arch/arm/include/asm/archrandom.h @@ -7,4 +7,6 @@ static inline bool __init smccc_probe_trng(void) return false; } +#include + #endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1652a9800ebe..1880f71c2547 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1858,14 +1858,6 @@ config ARM64_E0PD This option enables E0PD for TTBR1 where available. -config ARCH_RANDOM - bool "Enable support for random number generation" - default y - help - Random number generation (part of the ARMv8.5 Extensions) - provides a high bandwidth, cryptographically secure - hardware random number generator. - config ARM64_AS_HAS_MTE # Initial support for MTE went in binutils 2.32.0, checked with # ".arch armv8.5-a+memtag" below. 
However, this was incomplete diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index 3a6b6d38c5b8..c3b9fa56af67 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -2,8 +2,6 @@ #ifndef _ASM_ARCHRANDOM_H #define _ASM_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include #include #include @@ -167,12 +165,4 @@ arch_get_random_seed_long_early(unsigned long *v) } #define arch_get_random_seed_long_early arch_get_random_seed_long_early -#else /* !CONFIG_ARCH_RANDOM */ - -static inline bool __init smccc_probe_trng(void) -{ - return false; -} - -#endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d88433de81d..0e9462abeb77 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2416,7 +2416,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_e0pd, }, #endif -#ifdef CONFIG_ARCH_RANDOM { .desc = "Random Number Generator", .capability = ARM64_HAS_RNG, @@ -2428,7 +2427,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sign = FTR_UNSIGNED, .min_field_value = 1, }, -#endif #ifdef CONFIG_ARM64_BTI { .desc = "Branch Target Identification", diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7aa12e88c580..623deb5bedf6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1252,9 +1252,6 @@ config PHYSICAL_START default "0x00000000" endif -config ARCH_RANDOM - def_bool n - config PPC_LIB_RHEAP bool diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 9a53e29680f4..25ba65df6b1a 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -2,8 +2,6 @@ #ifndef _ASM_POWERPC_ARCHRANDOM_H #define _ASM_POWERPC_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include static inline bool __must_check arch_get_random_long(unsigned long *v) @@ -35,7 +33,6 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) return rc; } -#endif /* CONFIG_ARCH_RANDOM */ #ifdef CONFIG_PPC_POWERNV int powernv_hwrng_present(void); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 358d171ae8e0..6c1002043367 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -200,9 +200,7 @@ struct machdep_calls { ssize_t (*cpu_release)(const char *, size_t); #endif -#ifdef CONFIG_ARCH_RANDOM int (*get_random_seed)(unsigned long *v); -#endif }; extern void e500_idle(void); diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig index 5e320f49583a..6af443a1db99 100644 --- a/arch/powerpc/platforms/microwatt/Kconfig +++ b/arch/powerpc/platforms/microwatt/Kconfig @@ -6,7 +6,6 @@ config PPC_MICROWATT select PPC_ICS_NATIVE select PPC_ICP_NATIVE select PPC_UDBG_16550 - select ARCH_RANDOM help This option enables support for FPGA-based Microwatt implementations. 
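For reference as the remaining force-selects go away below: the "ordinary asm-generic fallback pattern" means every caller can probe unconditionally and cope with failure, with no Kconfig gate in between. A sketch of the calling convention (in-kernel context, assuming linux/random.h; seed_from_cpu() is an illustrative name, arch_get_random_long() is the real interface):

static unsigned long seed_from_cpu(void)
{
        unsigned long v;

        /* Arches without a CPU RNG now get the asm-generic stub,
         * which simply returns false; no config option needed. */
        if (arch_get_random_long(&v))
                return v;
        return 0;
}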
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 161dfe024085..e1a05c5a9004 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -12,7 +12,6 @@ config PPC_POWERNV select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 - select ARCH_RANDOM select CPU_FREQ select PPC_DOORBELL select MMU_NOTIFIER diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f7fd91d153a4..f4a647c1f0b2 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -19,7 +19,6 @@ config PPC_PSERIES select PPC_UDBG_16550 select PPC_DOORBELL select HOTPLUG_CPU - select ARCH_RANDOM select FORCE_SMP select SWIOTLB default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8cd9e56c629b..9b6e4e7cb17b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -507,21 +507,6 @@ config KEXEC_SIG verification for the corresponding kernel image type being loaded in order for this to work. -config ARCH_RANDOM - def_bool y - prompt "s390 architectural random number generation API" - help - Enable the s390 architectural random number generation API - to provide random data for all consumers within the Linux - kernel. - - When enabled the arch_random_* functions declared in linux/random.h - are implemented. The implementation is based on the s390 CPACF - instruction subfunction TRNG which provides a real true random - number generator. - - If unsure, say Y. - config KERNEL_NOBP def_bool n prompt "Enable modified branch prediction for the kernel by default" diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index a87fcc45e307..f4976f611b94 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -15,7 +15,6 @@ CONFIG_TUNE_ZEC12=y # CONFIG_COMPAT is not set CONFIG_NR_CPUS=2 CONFIG_HZ_100=y -# CONFIG_ARCH_RANDOM is not set # CONFIG_RELOCATABLE is not set # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index c63abfeb6d17..1b1cc478fa94 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o -obj-$(CONFIG_ARCH_RANDOM) += arch_random.o +obj-y += arch_random.o crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index 2c6e1c6ecbe7..0a1c2e66c709 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -11,8 +11,6 @@ #ifndef _ASM_S390_ARCHRANDOM_H #define _ASM_S390_ARCHRANDOM_H -#ifdef CONFIG_ARCH_RANDOM - #include #include #include @@ -50,5 +48,4 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) return false; } -#endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 0a37f5de2863..ebad41afe355 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -876,10 +876,8 @@ static void __init setup_randomness(void) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); -#ifdef CONFIG_ARCH_RANDOM if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) 
static_branch_enable(&s390_arch_random_available); -#endif } /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e58798f636d4..ba13749c09c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1810,15 +1810,6 @@ config ARCH_USES_PG_UNCACHED def_bool y depends on X86_PAT -config ARCH_RANDOM - def_bool y - prompt "x86 architectural random number generator" if EXPERT - help - Enable the x86 architectural RDRAND instruction - (Intel Bull Mountain technology) to generate random numbers. - If supported, this is a high bandwidth, cryptographically - secure hardware random number generator. - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index ebc248e49549..fb235b696175 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -65,10 +65,8 @@ static inline bool __must_check rdseed_int(unsigned int *v) /* * These are the generic interfaces; they must not be declared if the - * stubs in are to be invoked, - * i.e. CONFIG_ARCH_RANDOM is not defined. + * stubs in are to be invoked. */ -#ifdef CONFIG_ARCH_RANDOM static inline bool __must_check arch_get_random_long(unsigned long *v) { @@ -90,12 +88,8 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false; } -extern void x86_init_rdrand(struct cpuinfo_x86 *c); - -#else /* !CONFIG_ARCH_RANDOM */ - -static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { } - -#endif /* !CONFIG_ARCH_RANDOM */ +#ifndef CONFIG_UML +void x86_init_rdrand(struct cpuinfo_x86 *c); +#endif #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index c4be62058dd9..8f216669ecb8 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -26,7 +26,6 @@ __setup("nordrand", x86_rdrand_setup); */ #define SANITY_CHECK_LOOPS 8 -#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { unsigned int changed = 0; @@ -63,4 +62,3 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) "RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); } -#endif diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 0b6c03643ddc..30192e123e5f 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -431,7 +431,6 @@ config ADI config RANDOM_TRUST_CPU bool "Initialize RNG using CPU RNG instructions" default y - depends on ARCH_RANDOM help Initialize the RNG using random numbers supplied by the CPU's RNG instructions (e.g. RDRAND), if supported and available. 
These diff --git a/drivers/char/hw_random/s390-trng.c b/drivers/char/hw_random/s390-trng.c index 2beaa35c0d74..488808dc17a2 100644 --- a/drivers/char/hw_random/s390-trng.c +++ b/drivers/char/hw_random/s390-trng.c @@ -108,7 +108,6 @@ static ssize_t trng_counter_show(struct device *dev, { u64 dev_counter = atomic64_read(&trng_dev_counter); u64 hwrng_counter = atomic64_read(&trng_hwrng_counter); -#if IS_ENABLED(CONFIG_ARCH_RANDOM) u64 arch_counter = atomic64_read(&s390_arch_random_counter); return sysfs_emit(buf, @@ -118,14 +117,6 @@ static ssize_t trng_counter_show(struct device *dev, "total: %llu\n", dev_counter, hwrng_counter, arch_counter, dev_counter + hwrng_counter + arch_counter); -#else - return sysfs_emit(buf, - "trng: %llu\n" - "hwrng: %llu\n" - "total: %llu\n", - dev_counter, hwrng_counter, - dev_counter + hwrng_counter); -#endif } static DEVICE_ATTR(byte_counter, 0444, trng_counter_show, NULL); diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 8e47d483b524..36db8b9eb68a 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -5,6 +5,7 @@ # asm headers from the host architecutre.) mandatory-y += atomic.h +mandatory-y += archrandom.h mandatory-y += barrier.h mandatory-y += bitops.h mandatory-y += bug.h diff --git a/include/asm-generic/archrandom.h b/include/asm-generic/archrandom.h new file mode 100644 index 000000000000..3a5ee202dd86 --- /dev/null +++ b/include/asm-generic/archrandom.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_GENERIC_ARCHRANDOM_H__ +#define __ASM_GENERIC_ARCHRANDOM_H__ + +static inline bool __must_check arch_get_random_long(unsigned long *v) +{ + return false; +} + +static inline bool __must_check arch_get_random_int(unsigned int *v) +{ + return false; +} + +static inline bool __must_check arch_get_random_seed_long(unsigned long *v) +{ + return false; +} + +static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +{ + return false; +} + +#endif diff --git a/include/linux/random.h b/include/linux/random.h index 20e389a14e5c..865770e29f3e 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -106,14 +106,7 @@ declare_get_random_var_wait(long, unsigned long) */ #include -#ifdef CONFIG_ARCH_RANDOM -# include -#else -static inline bool __must_check arch_get_random_long(unsigned long *v) { return false; } -static inline bool __must_check arch_get_random_int(unsigned int *v) { return false; } -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) { return false; } -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { return false; } -#endif +#include /* * Called from the boot CPU during startup; not valid to call once diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index bad88f4b0a03..e1858ce7003f 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -58,7 +58,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n CONFIG_HZ_PERIODIC=n CONFIG_HIGH_RES_TIMERS=y -CONFIG_ARCH_RANDOM=y CONFIG_FILE_LOCKING=y CONFIG_POSIX_TIMERS=y CONFIG_DEVTMPFS=y -- cgit v1.2.3 From a6bd98c45d1aeec59493617b02a86de39d384535 Mon Sep 17 00:00:00 2001 From: Blake Jones Date: Wed, 29 Jun 2022 14:36:32 -0700 Subject: perf buildid-list: Add a "-m" option to show kernel and modules build-ids This new option displays all of the information needed to do external BuildID-based symbolization of kernel stack traces, such 
as those collected by bpf_get_stackid(). For each kernel module plus the main kernel, it displays the BuildID, the start and end virtual addresses of that module's text range (rounded out to page boundaries), and the pathname of the module. When run as a non-privileged user, the actual addresses of the modules' text ranges are not available, so the tools displays "0, " for kernel modules and "0, 0xffffffffffffffff" for the kernel itself. Sample output: root# perf buildid-list -m cf6df852fd4da122d616153353cc8f560fd12fe0 ffffffffa5400000 ffffffffa6001e27 [kernel.kallsyms] 1aa7209aa2acb067d66ed6cf7676d65066384d61 ffffffffc0087000 ffffffffc008b000 /lib/modules/5.15.15-1rodete2-amd64/kernel/crypto/sha512_generic.ko 3857815b5bf0183697b68f8fe0ea06121644041e ffffffffc008c000 ffffffffc0098000 /lib/modules/5.15.15-1rodete2-amd64/kernel/arch/x86/crypto/sha512-ssse3.ko 4081fde0bca2bc097cb3e9d1efcb836047d485f1 ffffffffc0099000 ffffffffc009f000 /lib/modules/5.15.15-1rodete2-amd64/kernel/drivers/acpi/button.ko 1ef81ba4890552ea6b0314f9635fc43fc8cef568 ffffffffc00a4000 ffffffffc00aa000 /lib/modules/5.15.15-1rodete2-amd64/kernel/crypto/cryptd.ko cc5c985506cb240d7d082b55ed260cbb851f983e ffffffffc00af000 ffffffffc00b6000 /lib/modules/5.15.15-1rodete2-amd64/kernel/drivers/i2c/busses/i2c-piix4.ko [...] Committer notes: u64 formatter should be PRIx64 for printing as hex numbers, fix this: 28 5.28 debian:experimental-x-mips : FAIL gcc version 11.2.0 (Debian 11.2.0-18) builtin-buildid-list.c: In function 'buildid__map_cb': builtin-buildid-list.c:32:24: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'u64' {aka 'long long unsigned int'} [-Werror=format=] 32 | printf("%s %16lx %16lx", bid_buf, map->start, map->end); | ~~~~^ ~~~~~~~~~~ | | | | long unsigned int u64 {aka long long unsigned int} | %16llx builtin-buildid-list.c:32:30: error: format '%lx' expects argument of type 'long unsigned int', but argument 4 has type 'u64' {aka 'long long unsigned int'} [-Werror=format=] 32 | printf("%s %16lx %16lx", bid_buf, map->start, map->end); | ~~~~^ ~~~~~~~~ | | | | long unsigned int u64 {aka long long unsigned int} | %16llx cc1: all warnings being treated as errors Signed-off-by: Blake Jones Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220629213632.3899212-1-blakejones@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-buildid-list.txt | 4 +++ tools/perf/builtin-buildid-list.c | 39 +++++++++++++++++++++++++- tools/perf/util/machine.c | 15 ++++++++++ tools/perf/util/machine.h | 5 ++++ 4 files changed, 62 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt index 25c52efcc7f0..e1e8fdbe06b9 100644 --- a/tools/perf/Documentation/perf-buildid-list.txt +++ b/tools/perf/Documentation/perf-buildid-list.txt @@ -33,6 +33,10 @@ OPTIONS -k:: --kernel:: Show running kernel build id. +-m:: +--kernel-maps:: + Show buildid, start/end text address, and path of running kernel and + its modules. -v:: --verbose:: Be more verbose. 
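To make the intended use concrete before the implementation diff: an external symbolizer that has parsed this output into a table can resolve a kernel PC from a stack trace with a simple range lookup. A sketch (struct kmod_range and find_module() are illustrative names, not part of perf; the table must come from a privileged run, since unprivileged output carries placeholder addresses):

#include <stddef.h>

struct kmod_range {
        char bid[41];           /* 40 hex chars of build-id plus NUL */
        unsigned long start;    /* text start, rounded to page */
        unsigned long end;      /* text end, rounded to page */
        const char *path;       /* module path or [kernel.kallsyms] */
};

/* Map a kernel stack PC to the module (or main kernel) covering it. */
static const struct kmod_range *
find_module(const struct kmod_range *tab, size_t n, unsigned long pc)
{
        for (size_t i = 0; i < n; i++)
                if (pc >= tab[i].start && pc < tab[i].end)
                        return &tab[i];
        return NULL;    /* PC outside every recorded text range */
}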
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index cebadd632234..00bfe89f0b5d 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -12,14 +12,44 @@ #include "util/build-id.h" #include "util/debug.h" #include "util/dso.h" +#include "util/map.h" #include #include #include "util/session.h" #include "util/symbol.h" #include "util/data.h" #include +#include #include +static int buildid__map_cb(struct map *map, void *arg __maybe_unused) +{ + const struct dso *dso = map->dso; + char bid_buf[SBUILD_ID_SIZE]; + + memset(bid_buf, 0, sizeof(bid_buf)); + if (dso->has_build_id) + build_id__sprintf(&dso->bid, bid_buf); + printf("%s %16" PRIx64 " %16" PRIx64, bid_buf, map->start, map->end); + if (dso->long_name != NULL) { + printf(" %s", dso->long_name); + } else if (dso->short_name != NULL) { + printf(" %s", dso->short_name); + } + printf("\n"); + + return 0; +} + +static void buildid__show_kernel_maps(void) +{ + struct machine *machine; + + machine = machine__new_host(); + machine__for_each_kernel_map(machine, buildid__map_cb, NULL); + machine__delete(machine); +} + static int sysfs__fprintf_build_id(FILE *fp) { char sbuild_id[SBUILD_ID_SIZE]; @@ -99,6 +129,7 @@ out: int cmd_buildid_list(int argc, const char **argv) { bool show_kernel = false; + bool show_kernel_maps = false; bool with_hits = false; bool force = false; const struct option options[] = { @@ -106,6 +137,8 @@ int cmd_buildid_list(int argc, const char **argv) OPT_STRING('i', "input", &input_name, "file", "input file name"), OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_BOOLEAN('k', "kernel", &show_kernel, "Show current kernel build id"), + OPT_BOOLEAN('m', "kernel-maps", &show_kernel_maps, + "Show build id of current kernel + modules"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_END() }; @@ -117,8 +150,12 @@ int cmd_buildid_list(int argc, const char **argv) argc = parse_options(argc, argv, options, buildid_list_usage, 0); setup_pager(); - if (show_kernel) + if (show_kernel) { return !(sysfs__fprintf_build_id(stdout) > 0); + } else if (show_kernel_maps) { + buildid__show_kernel_maps(); + return 0; + } return perf_session__list_build_ids(force, with_hits); } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 009061852808..16d225149b93 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -3327,3 +3327,18 @@ int machine__for_each_dso(struct machine *machine, machine__dso_t fn, void *priv } return err; } + +int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn, void *priv) +{ + struct maps *maps = machine__kernel_maps(machine); + struct map *map; + int err = 0; + + for (map = maps__first(maps); map != NULL; map = map__next(map)) { + err = fn(map, priv); + if (err != 0) { + break; + } + } + return err; +} diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 5d7daf7cb7bc..e1476343cbb2 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -262,6 +262,11 @@ typedef int (*machine__dso_t)(struct dso *dso, struct machine *machine, void *pr int machine__for_each_dso(struct machine *machine, machine__dso_t fn, void *priv); + +typedef int (*machine__map_t)(struct map *map, void *priv); +int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn, + void *priv); + int machine__for_each_thread(struct machine *machine, int (*fn)(struct thread *thread, void *p), void *priv); -- cgit v1.2.3 From 
e923b0537d28e15c9d31ce8b38f810b325816903 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2022 10:08:30 +0800 Subject: KVM: selftests: Fix target thread to be migrated in rseq_test In rseq_test, there are two threads, which are vCPU thread and migration worker separately. Unfortunately, the test has the wrong PID passed to sched_setaffinity() in the migration worker. It forces migration on the migration worker because zeroed PID represents the calling thread, which is the migration worker itself. It means the vCPU thread is never enforced to migration and it can migrate at any time, which eventually leads to failure as the following logs show. host# uname -r 5.19.0-rc6-gavin+ host# # cat /proc/cpuinfo | grep processor | tail -n 1 processor : 223 host# pwd /home/gavin/sandbox/linux.main/tools/testing/selftests/kvm host# for i in `seq 1 100`; do \ echo "--------> $i"; ./rseq_test; done --------> 1 --------> 2 --------> 3 --------> 4 --------> 5 --------> 6 ==== Test Assertion Failure ==== rseq_test.c:265: rseq_cpu == cpu pid=3925 tid=3925 errno=4 - Interrupted system call 1 0x0000000000401963: main at rseq_test.c:265 (discriminator 2) 2 0x0000ffffb044affb: ?? ??:0 3 0x0000ffffb044b0c7: ?? ??:0 4 0x0000000000401a6f: _start at ??:? rseq CPU = 4, sched CPU = 27 Fix the issue by passing correct parameter, TID of the vCPU thread, to sched_setaffinity() in the migration worker. Fixes: 61e52f1630f5 ("KVM: selftests: Add a test for KVM_RUN+rseq to detect task migration bugs") Suggested-by: Sean Christopherson Signed-off-by: Gavin Shan Reviewed-by: Oliver Upton Message-Id: <20220719020830.3479482-1-gshan@redhat.com> Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/rseq_test.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 4158da0da2bb..2237d1aac801 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -82,8 +82,9 @@ static int next_cpu(int cpu) return cpu; } -static void *migration_worker(void *ign) +static void *migration_worker(void *__rseq_tid) { + pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid; cpu_set_t allowed_mask; int r, i, cpu; @@ -106,7 +107,7 @@ static void *migration_worker(void *ign) * stable, i.e. while changing affinity is in-progress. 
*/ smp_wmb(); - r = sched_setaffinity(0, sizeof(allowed_mask), &allowed_mask); + r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask); TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)", errno, strerror(errno)); smp_wmb(); @@ -231,7 +232,8 @@ int main(int argc, char *argv[]) vm = vm_create_default(VCPU_ID, 0, guest_code); ucall_init(vm, NULL); - pthread_create(&migration_thread, NULL, migration_worker, 0); + pthread_create(&migration_thread, NULL, migration_worker, + (void *)(unsigned long)gettid()); for (i = 0; !done; i++) { vcpu_run(vm, VCPU_ID); -- cgit v1.2.3 From dc951e22a1a2a6a11b29648c3c8b191bc8f3e5df Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 19 Jul 2022 09:16:53 -0400 Subject: tools headers UAPI: Sync linux/kvm.h with the kernel sources Silence this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Reported-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Paolo Bonzini --- tools/include/uapi/linux/kvm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5088bd9f1922..860f867c50c0 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -2083,7 +2083,8 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) -#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES +#define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_BOOLEAN #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) -- cgit v1.2.3 From 55d00c37ebc396539b10ab73203bfd5e189c3660 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 00:07:51 -0700 Subject: libbpf: generalize virtual __kconfig externs and use it for USDT Libbpf currently supports a single virtual __kconfig extern: LINUX_KERNEL_VERSION. LINUX_KERNEL_VERSION doesn't come from /proc/config.gz and is instead filled out by libbpf itself. This patch generalizes that approach to support more such virtual __kconfig externs. One such extern added in this patch is LINUX_HAS_BPF_COOKIE, which is used for the BPF-side USDT supporting code in usdt.bpf.h instead of the CO-RE-based enum detection approach for detecting the bpf_get_attach_cookie() BPF helper. This allows removing an otherwise unneeded CO-RE dependency and keeps the user-space and BPF-side parts of libbpf's USDT support strictly in sync in terms of their feature detection. We'll use a similar approach for syscall wrapper detection for the BPF_KSYSCALL() BPF-side macro in a follow-up patch. Generally, libbpf currently reserves the CONFIG_ prefix for Kconfig values and LINUX_ for virtual libbpf-backed externs. In the future we might extend the set of prefixes that are supported. This can be done without any breaking changes, as currently any __kconfig extern with an unrecognized name is rejected. For LINUX_xxx externs we support the normal "weak rule": if libbpf doesn't recognize a given LINUX_xxx extern but such an extern is marked as __weak, it is not rejected and defaults to zero.
This follows CONFIG_xxx handling logic and will allow BPF applications to opportunistically use newer libbpf virtual externs without breaking on older libbpf versions unnecessarily. Tested-by: Alan Maguire Reviewed-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220714070755.3235561-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 95 ++++++++++++++++++++++++++++++++---------------- tools/lib/bpf/usdt.bpf.h | 16 +------- 2 files changed, 66 insertions(+), 45 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 77ae83308199..2df8b98d97bd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1694,7 +1694,7 @@ static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, switch (ext->kcfg.type) { case KCFG_BOOL: if (value == 'm') { - pr_warn("extern (kcfg) %s=%c should be tristate or char\n", + pr_warn("extern (kcfg) '%s': value '%c' implies tristate or char type\n", ext->name, value); return -EINVAL; } @@ -1715,7 +1715,7 @@ static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, case KCFG_INT: case KCFG_CHAR_ARR: default: - pr_warn("extern (kcfg) %s=%c should be bool, tristate, or char\n", + pr_warn("extern (kcfg) '%s': value '%c' implies bool, tristate, or char type\n", ext->name, value); return -EINVAL; } @@ -1729,7 +1729,8 @@ static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, size_t len; if (ext->kcfg.type != KCFG_CHAR_ARR) { - pr_warn("extern (kcfg) %s=%s should be char array\n", ext->name, value); + pr_warn("extern (kcfg) '%s': value '%s' implies char array type\n", + ext->name, value); return -EINVAL; } @@ -1743,7 +1744,7 @@ static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, /* strip quotes */ len -= 2; if (len >= ext->kcfg.sz) { - pr_warn("extern (kcfg) '%s': long string config %s of (%zu bytes) truncated to %d bytes\n", + pr_warn("extern (kcfg) '%s': long string '%s' of (%zu bytes) truncated to %d bytes\n", ext->name, value, len, ext->kcfg.sz - 1); len = ext->kcfg.sz - 1; } @@ -1800,13 +1801,20 @@ static bool is_kcfg_value_in_range(const struct extern_desc *ext, __u64 v) static int set_kcfg_value_num(struct extern_desc *ext, void *ext_val, __u64 value) { - if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR) { - pr_warn("extern (kcfg) %s=%llu should be integer\n", + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR && + ext->kcfg.type != KCFG_BOOL) { + pr_warn("extern (kcfg) '%s': value '%llu' implies integer, char, or boolean type\n", ext->name, (unsigned long long)value); return -EINVAL; } + if (ext->kcfg.type == KCFG_BOOL && value > 1) { + pr_warn("extern (kcfg) '%s': value '%llu' isn't boolean compatible\n", + ext->name, (unsigned long long)value); + return -EINVAL; + + } if (!is_kcfg_value_in_range(ext, value)) { - pr_warn("extern (kcfg) %s=%llu value doesn't fit in %d bytes\n", + pr_warn("extern (kcfg) '%s': value '%llu' doesn't fit in %d bytes\n", ext->name, (unsigned long long)value, ext->kcfg.sz); return -ERANGE; } @@ -1870,16 +1878,19 @@ static int bpf_object__process_kconfig_line(struct bpf_object *obj, /* assume integer */ err = parse_u64(value, &num); if (err) { - pr_warn("extern (kcfg) %s=%s should be integer\n", - ext->name, value); + pr_warn("extern (kcfg) '%s': value '%s' isn't a valid integer\n", ext->name, value); return err; } + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR) { + pr_warn("extern (kcfg) '%s': value '%s' implies integer 
type\n", ext->name, value); + return -EINVAL; + } err = set_kcfg_value_num(ext, ext_val, num); break; } if (err) return err; - pr_debug("extern (kcfg) %s=%s\n", ext->name, value); + pr_debug("extern (kcfg) '%s': set to %s\n", ext->name, value); return 0; } @@ -3687,7 +3698,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj) ext->kcfg.type = find_kcfg_type(obj->btf, t->type, &ext->kcfg.is_signed); if (ext->kcfg.type == KCFG_UNKNOWN) { - pr_warn("extern (kcfg) '%s' type is unsupported\n", ext_name); + pr_warn("extern (kcfg) '%s': type is unsupported\n", ext_name); return -ENOTSUP; } } else if (strcmp(sec_name, KSYMS_SEC) == 0) { @@ -7287,14 +7298,14 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, return 0; if (ext->is_set && ext->ksym.addr != sym_addr) { - pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n", + pr_warn("extern (ksym) '%s': resolution is ambiguous: 0x%llx or 0x%llx\n", sym_name, ext->ksym.addr, sym_addr); return -EINVAL; } if (!ext->is_set) { ext->is_set = true; ext->ksym.addr = sym_addr; - pr_debug("extern (ksym) %s=0x%llx\n", sym_name, sym_addr); + pr_debug("extern (ksym) '%s': set to 0x%llx\n", sym_name, sym_addr); } return 0; } @@ -7498,28 +7509,50 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; - if (ext->type == EXT_KCFG && - strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { - void *ext_val = kcfg_data + ext->kcfg.data_off; - __u32 kver = get_kernel_version(); + if (ext->type == EXT_KSYM) { + if (ext->ksym.type_id) + need_vmlinux_btf = true; + else + need_kallsyms = true; + continue; + } else if (ext->type == EXT_KCFG) { + void *ext_ptr = kcfg_data + ext->kcfg.data_off; + __u64 value = 0; - if (!kver) { - pr_warn("failed to get kernel version\n"); + /* Kconfig externs need actual /proc/config.gz */ + if (str_has_pfx(ext->name, "CONFIG_")) { + need_config = true; + continue; + } + + /* Virtual kcfg externs are customly handled by libbpf */ + if (strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { + value = get_kernel_version(); + if (!value) { + pr_warn("extern (kcfg) '%s': failed to get kernel version\n", ext->name); + return -EINVAL; + } + } else if (strcmp(ext->name, "LINUX_HAS_BPF_COOKIE") == 0) { + value = kernel_supports(obj, FEAT_BPF_COOKIE); + } else if (!str_has_pfx(ext->name, "LINUX_") || !ext->is_weak) { + /* Currently libbpf supports only CONFIG_ and LINUX_ prefixed + * __kconfig externs, where LINUX_ ones are virtual and filled out + * customly by libbpf (their values don't come from Kconfig). + * If LINUX_xxx variable is not recognized by libbpf, but is marked + * __weak, it defaults to zero value, just like for CONFIG_xxx + * externs. 
+ */ + pr_warn("extern (kcfg) '%s': unrecognized virtual extern\n", ext->name); return -EINVAL; } - err = set_kcfg_value_num(ext, ext_val, kver); + + err = set_kcfg_value_num(ext, ext_ptr, value); if (err) return err; - pr_debug("extern (kcfg) %s=0x%x\n", ext->name, kver); - } else if (ext->type == EXT_KCFG && str_has_pfx(ext->name, "CONFIG_")) { - need_config = true; - } else if (ext->type == EXT_KSYM) { - if (ext->ksym.type_id) - need_vmlinux_btf = true; - else - need_kallsyms = true; + pr_debug("extern (kcfg) '%s': set to 0x%llx\n", + ext->name, (long long)value); } else { - pr_warn("unrecognized extern '%s'\n", ext->name); + pr_warn("extern '%s': unrecognized extern kind\n", ext->name); return -EINVAL; } } @@ -7555,10 +7588,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, ext = &obj->externs[i]; if (!ext->is_set && !ext->is_weak) { - pr_warn("extern %s (strong) not resolved\n", ext->name); + pr_warn("extern '%s' (strong): not resolved\n", ext->name); return -ESRCH; } else if (!ext->is_set) { - pr_debug("extern %s (weak) not resolved, defaulting to zero\n", + pr_debug("extern '%s' (weak): not resolved, defaulting to zero\n", ext->name); } } diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 4181fddb3687..4f2adc0bd6ca 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -6,7 +6,6 @@ #include #include #include -#include /* Below types and maps are internal implementation details of libbpf's USDT * support and are subjects to change. Also, bpf_usdt_xxx() API helpers should * @@ -30,14 +29,6 @@ #ifndef BPF_USDT_MAX_IP_CNT #define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT) #endif -/* We use BPF CO-RE to detect support for BPF cookie from BPF side. This is - * the only dependency on CO-RE, so if it's undesirable, user can override - * BPF_USDT_HAS_BPF_COOKIE to specify whether to BPF cookie is supported or not. - */ -#ifndef BPF_USDT_HAS_BPF_COOKIE -#define BPF_USDT_HAS_BPF_COOKIE \ - bpf_core_enum_value_exists(enum bpf_func_id___usdt, BPF_FUNC_get_attach_cookie___usdt) -#endif enum __bpf_usdt_arg_type { BPF_USDT_ARG_CONST, @@ -83,15 +74,12 @@ struct { __type(value, __u32); } __bpf_usdt_ip_to_spec_id SEC(".maps") __weak; -/* don't rely on user's BPF code to have latest definition of bpf_func_id */ -enum bpf_func_id___usdt { - BPF_FUNC_get_attach_cookie___usdt = 0xBAD, /* value doesn't matter */ -}; +extern const _Bool LINUX_HAS_BPF_COOKIE __kconfig; static __always_inline int __bpf_usdt_spec_id(struct pt_regs *ctx) { - if (!BPF_USDT_HAS_BPF_COOKIE) { + if (!LINUX_HAS_BPF_COOKIE) { long ip = PT_REGS_IP(ctx); int *spec_id_ptr; -- cgit v1.2.3 From ce6dc74a0a4a22047fdaf893047483b303b64898 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 00:07:52 -0700 Subject: selftests/bpf: add test of __weak unknown virtual __kconfig extern Exercise libbpf's logic for unknown __weak virtual __kconfig externs. USDT selftests already exercise the non-weak, known virtual extern (LINUX_HAS_BPF_COOKIE), so there is no need to add explicit tests for it.
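For illustration, a minimal sketch of the BPF-side pattern being exercised; the extern name here is hypothetical (the selftest itself uses LINUX_UNKNOWN_VIRTUAL_EXTERN, as shown in the diff below). Any unrecognized LINUX_-prefixed extern marked __weak is simply defaulted to zero by libbpf:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    /* Hypothetical name: libbpf does not recognize it, but because it is
     * marked __weak the object still loads and the value defaults to 0.
     */
    extern int LINUX_SOME_UNKNOWN_FEATURE __kconfig __weak;

    int unknown_virt_val = -1;

    SEC("raw_tp/sys_enter")
    int probe(const void *ctx)
    {
            unknown_virt_val = LINUX_SOME_UNKNOWN_FEATURE; /* reads 0 */
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";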
Tested-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220714070755.3235561-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/core_extern.c | 17 +++++++---------- tools/testing/selftests/bpf/progs/test_core_extern.c | 3 +++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_extern.c b/tools/testing/selftests/bpf/prog_tests/core_extern.c index 1931a158510e..63a51e9f3630 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_extern.c +++ b/tools/testing/selftests/bpf/prog_tests/core_extern.c @@ -39,6 +39,7 @@ static struct test_case { "CONFIG_STR=\"abracad\"\n" "CONFIG_MISSING=0", .data = { + .unkn_virt_val = 0, .bpf_syscall = false, .tristate_val = TRI_MODULE, .bool_val = true, @@ -121,7 +122,7 @@ static struct test_case { void test_core_extern(void) { const uint32_t kern_ver = get_kernel_version(); - int err, duration = 0, i, j; + int err, i, j; struct test_core_extern *skel = NULL; uint64_t *got, *exp; int n = sizeof(*skel->data) / sizeof(uint64_t); @@ -136,19 +137,17 @@ void test_core_extern(void) continue; skel = test_core_extern__open_opts(&opts); - if (CHECK(!skel, "skel_open", "skeleton open failed\n")) + if (!ASSERT_OK_PTR(skel, "skel_open")) goto cleanup; err = test_core_extern__load(skel); if (t->fails) { - CHECK(!err, "skel_load", - "shouldn't succeed open/load of skeleton\n"); + ASSERT_ERR(err, "skel_load_should_fail"); goto cleanup; - } else if (CHECK(err, "skel_load", - "failed to open/load skeleton\n")) { + } else if (!ASSERT_OK(err, "skel_load")) { goto cleanup; } err = test_core_extern__attach(skel); - if (CHECK(err, "attach_raw_tp", "failed attach: %d\n", err)) + if (!ASSERT_OK(err, "attach_raw_tp")) goto cleanup; usleep(1); @@ -158,9 +157,7 @@ void test_core_extern(void) got = (uint64_t *)skel->data; exp = (uint64_t *)&t->data; for (j = 0; j < n; j++) { - CHECK(got[j] != exp[j], "check_res", - "result #%d: expected %llx, but got %llx\n", - j, (__u64)exp[j], (__u64)got[j]); + ASSERT_EQ(got[j], exp[j], "result"); } cleanup: test_core_extern__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_core_extern.c b/tools/testing/selftests/bpf/progs/test_core_extern.c index 3ac3603ad53d..a3c7c1042f35 100644 --- a/tools/testing/selftests/bpf/progs/test_core_extern.c +++ b/tools/testing/selftests/bpf/progs/test_core_extern.c @@ -11,6 +11,7 @@ static int (*bpf_missing_helper)(const void *arg1, int arg2) = (void *) 999; extern int LINUX_KERNEL_VERSION __kconfig; +extern int LINUX_UNKNOWN_VIRTUAL_EXTERN __kconfig __weak; extern bool CONFIG_BPF_SYSCALL __kconfig; /* strong */ extern enum libbpf_tristate CONFIG_TRISTATE __kconfig __weak; extern bool CONFIG_BOOL __kconfig __weak; @@ -22,6 +23,7 @@ extern const char CONFIG_STR[8] __kconfig __weak; extern uint64_t CONFIG_MISSING __kconfig __weak; uint64_t kern_ver = -1; +uint64_t unkn_virt_val = -1; uint64_t bpf_syscall = -1; uint64_t tristate_val = -1; uint64_t bool_val = -1; @@ -38,6 +40,7 @@ int handle_sys_enter(struct pt_regs *ctx) int i; kern_ver = LINUX_KERNEL_VERSION; + unkn_virt_val = LINUX_UNKNOWN_VIRTUAL_EXTERN; bpf_syscall = CONFIG_BPF_SYSCALL; tristate_val = CONFIG_TRISTATE; bool_val = CONFIG_BOOL; -- cgit v1.2.3 From 6f5d467d55f0d8fc2143179dd4869d408ae3bd25 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 00:07:53 -0700 Subject: libbpf: improve BPF_KPROBE_SYSCALL macro and rename it to BPF_KSYSCALL Improve BPF_KPROBE_SYSCALL (and 
rename it to the shorter BPF_KSYSCALL to match libbpf's SEC("ksyscall") section name, added in the next patch) to use a __kconfig variable to determine how to properly fetch syscall arguments. Instead of relying on hard-coded knowledge of whether the kernel's architecture uses a syscall wrapper or not (which only reflects the latest kernel versions, is not necessarily true for older kernels, and won't necessarily hold for later kernel versions on some particular host architecture), determine this at runtime by attempting to create a perf_event (with a fallback to kprobe event creation through tracefs on legacy kernels, just like the kprobe attachment code does) for the kernel function that would correspond to the bpf() syscall on a system that has CONFIG_ARCH_HAS_SYSCALL_WRAPPER set (e.g., for x86-64 it would try '__x64_sys_bpf'). If the host kernel uses a syscall wrapper, the syscall kernel function's first argument is a pointer to struct pt_regs that in turn contains the syscall arguments. In that case we need to use bpf_probe_read_kernel() to fetch the actual arguments (which we do through the BPF_CORE_READ() macro) from the inner pt_regs. But if the kernel doesn't use the syscall wrapper approach, input arguments can be read from struct pt_regs directly, with no probe reading. All this feature detection is done without requiring /proc/config.gz to exist or be parsed, and the BPF-side helper code uses the newly added LINUX_HAS_SYSCALL_WRAPPER virtual __kconfig extern to stay in sync with libbpf's user-side feature detection. The BPF_KSYSCALL() macro can be used both with SEC("kprobe") programs that name the syscall function explicitly (e.g., SEC("kprobe/__x64_sys_bpf")) and with the SEC("ksyscall") programs added in the next patch (which are the same kprobe programs, with the added benefit of libbpf determining the correct kernel function name automatically). Kretprobe and kretsyscall (added in the next patch) programs don't need BPF_KSYSCALL, as they don't provide access to input arguments. The normal BPF_KRETPROBE is completely sufficient and is recommended.
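As a usage sketch (hypothetical program; assumes an x86-64 host for the explicit wrapper function name):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    SEC("kprobe/__x64_sys_unlinkat")
    int BPF_KSYSCALL(handle_unlinkat, int dfd, const char *pathname, int flag)
    {
            char path[64];

            /* pathname is a user-space pointer regardless of whether the
             * kernel uses the syscall wrapper; the macro only hides how
             * it is fetched from pt_regs.
             */
            bpf_probe_read_user_str(path, sizeof(path), pathname);
            bpf_printk("unlinkat(%d, %s)", dfd, path);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";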
Tested-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220714070755.3235561-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_tracing.h | 51 +++++++++++++++++++++++++++++++++------------ tools/lib/bpf/libbpf.c | 2 ++ 2 files changed, 40 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 11f9096407fc..f4d3e1e2abe2 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -2,6 +2,8 @@ #ifndef __BPF_TRACING_H__ #define __BPF_TRACING_H__ +#include + /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) #define bpf_target_x86 @@ -140,7 +142,7 @@ struct pt_regs___s390 { #define __PT_RC_REG gprs[2] #define __PT_SP_REG gprs[15] #define __PT_IP_REG psw.addr -#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma("GCC error \"use PT_REGS_PARM1_CORE_SYSCALL() instead\""); 0l; }) +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) #define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___s390 *)(x), orig_gpr2) #elif defined(bpf_target_arm) @@ -174,7 +176,7 @@ struct pt_regs___arm64 { #define __PT_RC_REG regs[0] #define __PT_SP_REG sp #define __PT_IP_REG pc -#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma("GCC error \"use PT_REGS_PARM1_CORE_SYSCALL() instead\""); 0l; }) +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) #define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___arm64 *)(x), orig_x0) #elif defined(bpf_target_mips) @@ -493,39 +495,62 @@ typeof(name(0)) name(struct pt_regs *ctx) \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) +/* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ #define ___bpf_syscall_args0() ctx -#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) -#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) -#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) -#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) -#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) #define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) +/* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ +#define ___bpf_syswrap_args0() ctx +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) 
___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) + /* - * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE, which is intended for + * BPF_KSYSCALL is a variant of BPF_KPROBE, which is intended for * tracing syscall functions, like __x64_sys_close. It hides the underlying * platform-specific low-level way of getting syscall input arguments from * struct pt_regs, and provides a familiar typed and named function arguments * syntax and semantics of accessing syscall input parameters. * - * Original struct pt_regs* context is preserved as 'ctx' argument. This might + * Original struct pt_regs * context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). * - * This macro relies on BPF CO-RE support. + * At the moment BPF_KSYSCALL does not handle all the calling convention + * quirks for mmap(), clone() and compat syscalls transparrently. This may or + * may not change in the future. User needs to take extra measures to handle + * such quirks explicitly, if necessary. + * + * This macro relies on BPF CO-RE support and virtual __kconfig externs. */ -#define BPF_KPROBE_SYSCALL(name, args...) \ +#define BPF_KSYSCALL(name, args...) \ name(struct pt_regs *ctx); \ +extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig; \ static __attribute__((always_inline)) typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ - struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx); \ + struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER \ + ? (struct pt_regs *)PT_REGS_PARM1(ctx) \ + : ctx; \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_syscall_args(args)); \ + if (LINUX_HAS_SYSCALL_WRAPPER) \ + return ____##name(___bpf_syswrap_args(args)); \ + else \ + return ____##name(___bpf_syscall_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ static __attribute__((always_inline)) typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) +#define BPF_KPROBE_SYSCALL BPF_KSYSCALL + #endif diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2df8b98d97bd..b868f9d9679b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7534,6 +7534,8 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, } } else if (strcmp(ext->name, "LINUX_HAS_BPF_COOKIE") == 0) { value = kernel_supports(obj, FEAT_BPF_COOKIE); + } else if (strcmp(ext->name, "LINUX_HAS_SYSCALL_WRAPPER") == 0) { + value = kernel_supports(obj, FEAT_SYSCALL_WRAPPER); } else if (!str_has_pfx(ext->name, "LINUX_") || !ext->is_weak) { /* Currently libbpf supports only CONFIG_ and LINUX_ prefixed * __kconfig externs, where LINUX_ ones are virtual and filled out -- cgit v1.2.3 From 708ac5bea0ce51fa1d70a44a1f817c8b02787c1e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 00:07:54 -0700 Subject: libbpf: add ksyscall/kretsyscall sections support for syscall kprobes Add SEC("ksyscall")/SEC("ksyscall/<syscall>") and corresponding kretsyscall variants (for return kprobes) to allow users to kprobe syscall functions in the kernel.
These special sections allow users to ignore the complexities and differences between kernel versions and host architectures when it comes to the syscall wrapper and the corresponding __<arch>_sys_<syscall> vs __se_sys_<syscall> differences, depending on whether the host kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER (though libbpf itself doesn't rely on /proc/config.gz for detecting this; see the BPF_KSYSCALL patch for how it's done internally). Combined with the use of the BPF_KSYSCALL() macro, this allows users to just specify the intended syscall name and expected input arguments and leave dealing with all the variations to libbpf. In addition to SEC("ksyscall+") and SEC("kretsyscall+"), add a bpf_program__attach_ksyscall() API which allows specifying the syscall name at runtime and providing an associated BPF cookie value. At the moment SEC("ksyscall") and bpf_program__attach_ksyscall() do not handle all the calling convention quirks for mmap(), clone() and compat syscalls. They also only attach to "native" syscall interfaces. If the host system supports compat syscalls or defines 32-bit syscalls in a 64-bit kernel, such syscall interfaces won't be attached to by libbpf. These limitations may or may not change in the future. Therefore it is recommended to use SEC("kprobe") for these syscalls, or whenever working with compat and 32-bit interfaces is required. Tested-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220714070755.3235561-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 117 ++++++++++++++++++++++++++++++++++++---- tools/lib/bpf/libbpf.h | 46 ++++++++++++++++ tools/lib/bpf/libbpf.map | 1 + tools/lib/bpf/libbpf_internal.h | 2 + 4 files changed, 157 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b868f9d9679b..431409a2336b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4670,6 +4670,8 @@ static int probe_kern_btf_enum64(void) strs, sizeof(strs))); } +static int probe_kern_syscall_wrapper(void); + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -4738,6 +4740,9 @@ static struct kern_feature_desc { [FEAT_BTF_ENUM64] = { "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, }, + [FEAT_SYSCALL_WRAPPER] = { + "Kernel using syscall wrapper", probe_kern_syscall_wrapper, + }, }; bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) @@ -8421,6 +8426,7 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -8441,6 +8447,8 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("ksyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("kretsyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt),
SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), @@ -9797,7 +9805,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, { struct perf_event_attr attr = {}; char errmsg[STRERR_BUFSIZE]; - int type, pfd, err; + int type, pfd; if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) return -EINVAL; @@ -9833,14 +9841,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, pid < 0 ? -1 : pid /* pid */, pid == -1 ? 0 : -1 /* cpu */, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); - if (pfd < 0) { - err = -errno; - pr_warn("%s perf_event_open() failed: %s\n", - uprobe ? "uprobe" : "kprobe", - libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - return err; - } - return pfd; + return pfd >= 0 ? pfd : -errno; } static int append_to_file(const char *file, const char *fmt, ...) @@ -9945,6 +9946,60 @@ err_clean_legacy: return err; } +static const char *arch_specific_syscall_pfx(void) +{ +#if defined(__x86_64__) + return "x64"; +#elif defined(__i386__) + return "ia32"; +#elif defined(__s390x__) + return "s390x"; +#elif defined(__s390__) + return "s390"; +#elif defined(__arm__) + return "arm"; +#elif defined(__aarch64__) + return "arm64"; +#elif defined(__mips__) + return "mips"; +#elif defined(__riscv) + return "riscv"; +#else + return NULL; +#endif +} + +static int probe_kern_syscall_wrapper(void) +{ + char syscall_name[64]; + const char *ksys_pfx; + + ksys_pfx = arch_specific_syscall_pfx(); + if (!ksys_pfx) + return 0; + + snprintf(syscall_name, sizeof(syscall_name), "__%s_sys_bpf", ksys_pfx); + + if (determine_kprobe_perf_type() >= 0) { + int pfd; + + pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0); + if (pfd >= 0) + close(pfd); + + return pfd >= 0 ? 
1 : 0; + } else { /* legacy mode */ + char probe_name[128]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), syscall_name, 0); + if (add_kprobe_event_legacy(probe_name, false, syscall_name, 0) < 0) + return 0; + + (void)remove_kprobe_event_legacy(probe_name, false); + return 1; + } +} + struct bpf_link * bpf_program__attach_kprobe_opts(const struct bpf_program *prog, const char *func_name, @@ -10030,6 +10085,29 @@ struct bpf_link *bpf_program__attach_kprobe(const struct bpf_program *prog, return bpf_program__attach_kprobe_opts(prog, func_name, &opts); } +struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts) +{ + LIBBPF_OPTS(bpf_kprobe_opts, kprobe_opts); + char func_name[128]; + + if (!OPTS_VALID(opts, bpf_ksyscall_opts)) + return libbpf_err_ptr(-EINVAL); + + if (kernel_supports(prog->obj, FEAT_SYSCALL_WRAPPER)) { + snprintf(func_name, sizeof(func_name), "__%s_sys_%s", + arch_specific_syscall_pfx(), syscall_name); + } else { + snprintf(func_name, sizeof(func_name), "__se_sys_%s", syscall_name); + } + + kprobe_opts.retprobe = OPTS_GET(opts, retprobe, false); + kprobe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + return bpf_program__attach_kprobe_opts(prog, func_name, &kprobe_opts); +} + /* Adapted from perf/util/string.c */ static bool glob_match(const char *str, const char *pat) { @@ -10200,6 +10278,27 @@ static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf return libbpf_get_error(*link); } +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_ksyscall_opts, opts); + const char *syscall_name; + + *link = NULL; + + /* no auto-attach for SEC("ksyscall") and SEC("kretsyscall") */ + if (strcmp(prog->sec_name, "ksyscall") == 0 || strcmp(prog->sec_name, "kretsyscall") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretsyscall/"); + if (opts.retprobe) + syscall_name = prog->sec_name + sizeof("kretsyscall/") - 1; + else + syscall_name = prog->sec_name + sizeof("ksyscall/") - 1; + + *link = bpf_program__attach_ksyscall(prog, syscall_name, &opts); + return *link ? 0 : -errno; +} + static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index c51d6e6b3066..61493c4cddac 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -457,6 +457,52 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, const char *pattern, const struct bpf_kprobe_multi_opts *opts); +struct bpf_ksyscall_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* attach as return probe? */ + bool retprobe; + size_t :0; +}; +#define bpf_ksyscall_opts__last_field retprobe + +/** + * @brief **bpf_program__attach_ksyscall()** attaches a BPF program + * to kernel syscall handler of a specified syscall. Optionally it's possible + * to request to install retprobe that will be triggered at syscall exit. It's + * also possible to associate BPF cookie (though options). 
+ * + * Libbpf automatically will determine correct full kernel function name, + * which depending on system architecture and kernel version/configuration + * could be of the form __<arch>_sys_<syscall> or __se_sys_<syscall>, and will + * attach specified program using kprobe/kretprobe mechanism. + * + * **bpf_program__attach_ksyscall()** is an API counterpart of declarative + * **SEC("ksyscall/<syscall>")** annotation of BPF programs. + * + * At the moment **SEC("ksyscall")** and **bpf_program__attach_ksyscall()** do + * not handle all the calling convention quirks for mmap(), clone() and compat + * syscalls. It also only attaches to "native" syscall interfaces. If host + * system supports compat syscalls or defines 32-bit syscalls in 64-bit + * kernel, such syscall interfaces won't be attached to by libbpf. + * + * These limitations may or may not change in the future. Therefore it is + * recommended to use SEC("kprobe") for these syscalls or if working with + * compat and 32-bit interfaces is required. + * + * @param prog BPF program to attach + * @param syscall_name Symbolic name of the syscall (e.g., "bpf") + * @param opts Additional options (see **struct bpf_ksyscall_opts**) + * @return Reference to the newly created BPF link; or NULL is returned on + * error, error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts); + struct bpf_uprobe_opts { /* size of this struct, for forward/backward compatiblity */ size_t sz; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4c4c40b8f935..0625adb9e888 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -356,6 +356,7 @@ LIBBPF_0.8.0 { LIBBPF_1.0.0 { global: bpf_prog_query_opts; + bpf_program__attach_ksyscall; btf__add_enum64; btf__add_enum64_value; libbpf_bpf_attach_type_str; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 9cd7829cbe41..f01dbab49da9 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -352,6 +352,8 @@ enum kern_feature_id { FEAT_BPF_COOKIE, /* BTF_KIND_ENUM64 support and BTF_KIND_ENUM kflag support */ FEAT_BTF_ENUM64, + /* Kernel uses syscall wrapper (CONFIG_ARCH_HAS_SYSCALL_WRAPPER) */ + FEAT_SYSCALL_WRAPPER, __FEAT_CNT, }; -- cgit v1.2.3 From d814ed62d3d24eb5c5f904b897e0414c1ccb5740 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 00:07:55 -0700 Subject: selftests/bpf: use BPF_KSYSCALL and SEC("ksyscall") in selftests Convert a few selftests that used plain SEC("kprobe") with an arch-specific syscall wrapper prefix to ksyscall/kretsyscall and the corresponding BPF_KSYSCALL macro. test_probe_user.c especially benefits from this simplification.
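For completeness, a hedged user-space sketch of the runtime counterpart added in the previous patch (the object handle and program name here are hypothetical):

    #include <errno.h>
    #include <bpf/libbpf.h>

    static int attach_connect_probe(struct bpf_object *obj)
    {
            LIBBPF_OPTS(bpf_ksyscall_opts, opts, .bpf_cookie = 0x1234);
            struct bpf_program *prog;
            struct bpf_link *link;

            prog = bpf_object__find_program_by_name(obj, "handle_sys_connect");
            if (!prog)
                    return -ENOENT;

            /* libbpf picks __<arch>_sys_connect or __se_sys_connect as needed */
            link = bpf_program__attach_ksyscall(prog, "connect", &opts);
            return link ? 0 : -errno;
    }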
Tested-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220714070755.3235561-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/bpf_syscall_macro.c | 6 ++--- .../selftests/bpf/progs/test_attach_probe.c | 15 ++++++------ .../testing/selftests/bpf/progs/test_probe_user.c | 27 +++++----------------- 3 files changed, 16 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c index 05838ed9b89c..e1e11897e99b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c +++ b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c @@ -64,9 +64,9 @@ int BPF_KPROBE(handle_sys_prctl) return 0; } -SEC("kprobe/" SYS_PREFIX "sys_prctl") -int BPF_KPROBE_SYSCALL(prctl_enter, int option, unsigned long arg2, - unsigned long arg3, unsigned long arg4, unsigned long arg5) +SEC("ksyscall/prctl") +int BPF_KSYSCALL(prctl_enter, int option, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5) { pid_t pid = bpf_get_current_pid_tgid() >> 32; diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index f1c88ad368ef..a1e45fec8938 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -1,11 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2017 Facebook -#include -#include +#include "vmlinux.h" #include #include -#include +#include #include "bpf_misc.h" int kprobe_res = 0; @@ -31,8 +30,8 @@ int handle_kprobe(struct pt_regs *ctx) return 0; } -SEC("kprobe/" SYS_PREFIX "sys_nanosleep") -int BPF_KPROBE(handle_kprobe_auto) +SEC("ksyscall/nanosleep") +int BPF_KSYSCALL(handle_kprobe_auto, struct __kernel_timespec *req, struct __kernel_timespec *rem) { kprobe2_res = 11; return 0; @@ -56,11 +55,11 @@ int handle_kretprobe(struct pt_regs *ctx) return 0; } -SEC("kretprobe/" SYS_PREFIX "sys_nanosleep") -int BPF_KRETPROBE(handle_kretprobe_auto) +SEC("kretsyscall/nanosleep") +int BPF_KRETPROBE(handle_kretprobe_auto, int ret) { kretprobe2_res = 22; - return 0; + return ret; } SEC("uprobe") diff --git a/tools/testing/selftests/bpf/progs/test_probe_user.c b/tools/testing/selftests/bpf/progs/test_probe_user.c index 702578a5e496..8e1495008e4d 100644 --- a/tools/testing/selftests/bpf/progs/test_probe_user.c +++ b/tools/testing/selftests/bpf/progs/test_probe_user.c @@ -1,35 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 - -#include -#include - -#include - +#include "vmlinux.h" #include #include +#include #include "bpf_misc.h" static struct sockaddr_in old; -SEC("kprobe/" SYS_PREFIX "sys_connect") -int BPF_KPROBE(handle_sys_connect) +SEC("ksyscall/connect") +int BPF_KSYSCALL(handle_sys_connect, int fd, struct sockaddr_in *uservaddr, int addrlen) { -#if SYSCALL_WRAPPER == 1 - struct pt_regs *real_regs; -#endif struct sockaddr_in new; - void *ptr; - -#if SYSCALL_WRAPPER == 0 - ptr = (void *)PT_REGS_PARM2(ctx); -#else - real_regs = (struct pt_regs *)PT_REGS_PARM1(ctx); - bpf_probe_read_kernel(&ptr, sizeof(ptr), &PT_REGS_PARM2(real_regs)); -#endif - bpf_probe_read_user(&old, sizeof(old), ptr); + bpf_probe_read_user(&old, sizeof(old), uservaddr); __builtin_memset(&new, 0xab, sizeof(new)); - bpf_probe_write_user(ptr, &new, sizeof(new)); + bpf_probe_write_user(uservaddr, &new, sizeof(new)); return 0; } -- cgit v1.2.3 From 24316461200502aa5feddaa72dcbb8059503a528 Mon Sep 17 00:00:00 
2001 From: Andrii Nakryiko Date: Thu, 14 Jul 2022 22:31:46 -0700 Subject: selftests/bpf: validate .bss section bigger than 8MB is possible now Add a simple big 16MB array and validate access to the very last byte of it to make sure that kernel supports > KMALLOC_MAX_SIZE value_size for BPF array maps (which are backing .bss in this case). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715053146.1291891-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/skeleton.c | 2 ++ tools/testing/selftests/bpf/progs/test_skeleton.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c index 180afd632f4c..99dac5292b41 100644 --- a/tools/testing/selftests/bpf/prog_tests/skeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -122,6 +122,8 @@ void test_skeleton(void) ASSERT_EQ(skel->bss->out_mostly_var, 123, "out_mostly_var"); + ASSERT_EQ(bss->huge_arr[ARRAY_SIZE(bss->huge_arr) - 1], 123, "huge_arr"); + elf_bytes = test_skeleton__elf_bytes(&elf_bytes_sz); ASSERT_OK_PTR(elf_bytes, "elf_bytes"); ASSERT_GE(elf_bytes_sz, 0, "elf_bytes_sz"); diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c index 1b1187d2967b..1a4e93f6d9df 100644 --- a/tools/testing/selftests/bpf/progs/test_skeleton.c +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -51,6 +51,8 @@ int out_dynarr[4] SEC(".data.dyn") = { 1, 2, 3, 4 }; int read_mostly_var __read_mostly; int out_mostly_var; +char huge_arr[16 * 1024 * 1024]; + SEC("raw_tp/sys_enter") int handler(const void *ctx) { @@ -71,6 +73,8 @@ int handler(const void *ctx) out_mostly_var = read_mostly_var; + huge_arr[sizeof(huge_arr) - 1] = 123; + return 0; } -- cgit v1.2.3 From a1ac9fd6c6504aa1838930bbfae6217ab6b95945 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 15 Jul 2022 11:57:36 -0700 Subject: libbpf: fallback to tracefs mount point if debugfs is not mounted Teach libbpf to fallback to tracefs mount point (/sys/kernel/tracing) if debugfs (/sys/kernel/debug/tracing) isn't mounted. Acked-by: Yonghong Song Suggested-by: Connor O'Brien Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220715185736.898848-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 61 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 431409a2336b..9b55006593c0 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9864,6 +9864,34 @@ static int append_to_file(const char *file, const char *fmt, ...) return err; } +#define DEBUGFS "/sys/kernel/debug/tracing" +#define TRACEFS "/sys/kernel/tracing" + +static bool use_debugfs(void) +{ + static int has_debugfs = -1; + + if (has_debugfs < 0) + has_debugfs = access(DEBUGFS, F_OK) == 0; + + return has_debugfs == 1; +} + +static const char *tracefs_path(void) +{ + return use_debugfs() ? DEBUGFS : TRACEFS; +} + +static const char *tracefs_kprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/kprobe_events" : TRACEFS"/kprobe_events"; +} + +static const char *tracefs_uprobe_events(void) +{ + return use_debugfs() ? 
DEBUGFS"/uprobe_events" : TRACEFS"/uprobe_events"; +} + static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, const char *kfunc_name, size_t offset) { @@ -9876,9 +9904,7 @@ static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, const char *kfunc_name, size_t offset) { - const char *file = "/sys/kernel/debug/tracing/kprobe_events"; - - return append_to_file(file, "%c:%s/%s %s+0x%zx", + return append_to_file(tracefs_kprobe_events(), "%c:%s/%s %s+0x%zx", retprobe ? 'r' : 'p', retprobe ? "kretprobes" : "kprobes", probe_name, kfunc_name, offset); @@ -9886,18 +9912,16 @@ static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe) { - const char *file = "/sys/kernel/debug/tracing/kprobe_events"; - - return append_to_file(file, "-:%s/%s", retprobe ? "kretprobes" : "kprobes", probe_name); + return append_to_file(tracefs_kprobe_events(), "-:%s/%s", + retprobe ? "kretprobes" : "kprobes", probe_name); } static int determine_kprobe_perf_type_legacy(const char *probe_name, bool retprobe) { char file[256]; - snprintf(file, sizeof(file), - "/sys/kernel/debug/tracing/events/%s/%s/id", - retprobe ? "kretprobes" : "kprobes", probe_name); + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "kretprobes" : "kprobes", probe_name); return parse_uint_from_file(file, "%d\n"); } @@ -10347,9 +10371,7 @@ static void gen_uprobe_legacy_event_name(char *buf, size_t buf_sz, static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, const char *binary_path, size_t offset) { - const char *file = "/sys/kernel/debug/tracing/uprobe_events"; - - return append_to_file(file, "%c:%s/%s %s:0x%zx", + return append_to_file(tracefs_uprobe_events(), "%c:%s/%s %s:0x%zx", retprobe ? 'r' : 'p', retprobe ? "uretprobes" : "uprobes", probe_name, binary_path, offset); @@ -10357,18 +10379,16 @@ static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, static inline int remove_uprobe_event_legacy(const char *probe_name, bool retprobe) { - const char *file = "/sys/kernel/debug/tracing/uprobe_events"; - - return append_to_file(file, "-:%s/%s", retprobe ? "uretprobes" : "uprobes", probe_name); + return append_to_file(tracefs_uprobe_events(), "-:%s/%s", + retprobe ? "uretprobes" : "uprobes", probe_name); } static int determine_uprobe_perf_type_legacy(const char *probe_name, bool retprobe) { char file[512]; - snprintf(file, sizeof(file), - "/sys/kernel/debug/tracing/events/%s/%s/id", - retprobe ? "uretprobes" : "uprobes", probe_name); + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "uretprobes" : "uprobes", probe_name); return parse_uint_from_file(file, "%d\n"); } @@ -10916,9 +10936,8 @@ static int determine_tracepoint_id(const char *tp_category, char file[PATH_MAX]; int ret; - ret = snprintf(file, sizeof(file), - "/sys/kernel/debug/tracing/events/%s/%s/id", - tp_category, tp_name); + ret = snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), tp_category, tp_name); if (ret < 0) return -errno; if (ret >= sizeof(file)) { -- cgit v1.2.3 From bdb2bc7599298ebb677e40fc92b1fa9e69e05098 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 15 Jul 2022 12:38:00 -0700 Subject: bpf: fix bpf_skb_pull_data documentation Fix documentation for bpf_skb_pull_data() helper for when len == 0. 
Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)") Signed-off-by: Joanne Koong Acked-by: Quentin Monnet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220715193800.3940070-1-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- tools/include/uapi/linux/bpf.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 379e68fb866f..ffcbf79a556b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2361,7 +2361,8 @@ union bpf_attr { * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes * from *skb* readable and writable. If a zero value is passed for - * *len*, then the whole length of the *skb* is pulled. + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. * * This helper is only needed for reading and writing with direct * packet access. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 379e68fb866f..ffcbf79a556b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2361,7 +2361,8 @@ union bpf_attr { * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes * from *skb* readable and writable. If a zero value is passed for - * *len*, then the whole length of the *skb* is pulled. + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. * * This helper is only needed for reading and writing with direct * packet access. -- cgit v1.2.3 From 597fbc4682969361dd141aaa58b8cc73a80da85d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 15 Jul 2022 16:09:51 -0700 Subject: libbpf: make RINGBUF map size adjustments more eagerly Make libbpf adjust RINGBUF map size (rounding it up to closest power-of-2 of page_size) more eagerly: during open phase when initializing the map and on explicit calls to bpf_map__set_max_entries(). Such approach allows user to check actual size of BPF ringbuf even before it's created in the kernel, but also it prevents various edge case scenarios where BPF ringbuf size can get out of sync with what it would be in kernel. One of them (reported in [0]) is during an attempt to pin/reuse BPF ringbuf. Move adjust_ringbuf_sz() helper closer to its first actual use. The implementation of the helper is unchanged. Also make detection of whether bpf_object is already loaded more robust by checking obj->loaded explicitly, given that map->fd can be < 0 even if bpf_object is already loaded due to ability to disable map creation with bpf_map__set_autocreate(map, false). 
[0] Closes: https://github.com/libbpf/libbpf/pull/530 Fixes: 0087a681fa8c ("libbpf: Automatically fix up BPF_MAP_TYPE_RINGBUF size, if necessary") Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220715230952.2219271-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 77 +++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9b55006593c0..b01fe01b0761 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2331,6 +2331,37 @@ int parse_btf_map_def(const char *map_name, struct btf *btf, return 0; } +static size_t adjust_ringbuf_sz(size_t sz) +{ + __u32 page_sz = sysconf(_SC_PAGE_SIZE); + __u32 mul; + + /* if user forgot to set any size, make sure they see error */ + if (sz == 0) + return 0; + /* Kernel expects BPF_MAP_TYPE_RINGBUF's max_entries to be + * a power-of-2 multiple of kernel's page size. If user diligently + * satisified these conditions, pass the size through. + */ + if ((sz % page_sz) == 0 && is_pow_of_2(sz / page_sz)) + return sz; + + /* Otherwise find closest (page_sz * power_of_2) product bigger than + * user-set size to satisfy both user size request and kernel + * requirements and substitute correct max_entries for map creation. + */ + for (mul = 1; mul <= UINT_MAX / page_sz; mul <<= 1) { + if (mul * page_sz > sz) + return mul * page_sz; + } + + /* if it's impossible to satisfy the conditions (i.e., user size is + * very close to UINT_MAX but is not a power-of-2 multiple of + * page_size) then just return original size and let kernel reject it + */ + return sz; +} + static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) { map->def.type = def->map_type; @@ -2344,6 +2375,10 @@ static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def map->btf_key_type_id = def->key_type_id; map->btf_value_type_id = def->value_type_id; + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map->def.type == BPF_MAP_TYPE_RINGBUF) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + if (def->parts & MAP_DEF_MAP_TYPE) pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); @@ -4317,9 +4352,15 @@ struct bpf_map *bpf_map__inner_map(struct bpf_map *map) int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { - if (map->fd >= 0) + if (map->obj->loaded) return libbpf_err(-EBUSY); + map->def.max_entries = max_entries; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map->def.type == BPF_MAP_TYPE_RINGBUF) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + return 0; } @@ -4875,37 +4916,6 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) static void bpf_map__destroy(struct bpf_map *map); -static size_t adjust_ringbuf_sz(size_t sz) -{ - __u32 page_sz = sysconf(_SC_PAGE_SIZE); - __u32 mul; - - /* if user forgot to set any size, make sure they see error */ - if (sz == 0) - return 0; - /* Kernel expects BPF_MAP_TYPE_RINGBUF's max_entries to be - * a power-of-2 multiple of kernel's page size. If user diligently - * satisified these conditions, pass the size through. 
- */ - if ((sz % page_sz) == 0 && is_pow_of_2(sz / page_sz)) - return sz; - - /* Otherwise find closest (page_sz * power_of_2) product bigger than - * user-set size to satisfy both user size request and kernel - * requirements and substitute correct max_entries for map creation. - */ - for (mul = 1; mul <= UINT_MAX / page_sz; mul <<= 1) { - if (mul * page_sz > sz) - return mul * page_sz; - } - - /* if it's impossible to satisfy the conditions (i.e., user size is - * very close to UINT_MAX but is not a power-of-2 multiple of - * page_size) then just return original size and let kernel reject it - */ - return sz; -} - static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) { LIBBPF_OPTS(bpf_map_create_opts, create_attr); @@ -4944,9 +4954,6 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b } switch (def->type) { - case BPF_MAP_TYPE_RINGBUF: - map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); - /* fallthrough */ case BPF_MAP_TYPE_PERF_EVENT_ARRAY: case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_STACK_TRACE: -- cgit v1.2.3 From e134601961fef4516df9413b270fb96ef6d034bc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 15 Jul 2022 16:09:52 -0700 Subject: selftests/bpf: test eager BPF ringbuf size adjustment logic Add test validating that libbpf adjusts (and reflects adjusted) ringbuf size early, before bpf_object is loaded. Also make sure we can't successfully resize ringbuf map after bpf_object is loaded. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220715230952.2219271-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index eb5f7f5aa81a..1455911d9fcb 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -50,6 +50,13 @@ void test_ringbuf_multi(void) if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + /* validate ringbuf size adjustment logic */ + ASSERT_EQ(bpf_map__max_entries(skel->maps.ringbuf1), page_size, "rb1_size_before"); + ASSERT_OK(bpf_map__set_max_entries(skel->maps.ringbuf1, page_size + 1), "rb1_resize"); + ASSERT_EQ(bpf_map__max_entries(skel->maps.ringbuf1), 2 * page_size, "rb1_size_after"); + ASSERT_OK(bpf_map__set_max_entries(skel->maps.ringbuf1, page_size), "rb1_reset"); + ASSERT_EQ(bpf_map__max_entries(skel->maps.ringbuf1), page_size, "rb1_size_final"); + proto_fd = bpf_map_create(BPF_MAP_TYPE_RINGBUF, NULL, 0, 0, page_size, NULL); if (CHECK(proto_fd < 0, "bpf_map_create", "bpf_map_create failed\n")) goto cleanup; @@ -65,6 +72,10 @@ void test_ringbuf_multi(void) close(proto_fd); proto_fd = -1; + /* make sure we can't resize ringbuf after object load */ + if (!ASSERT_ERR(bpf_map__set_max_entries(skel->maps.ringbuf1, 3 * page_size), "rb1_resize_after_load")) + goto cleanup; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); -- cgit v1.2.3 From c5d22f4cfe8dfb93f1db0a1e7e2e7ebc41395d98 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jul 2022 12:50:32 +0300 Subject: selftests/bpf: fix a test for snprintf() overflow The snprintf() function returns the number of bytes which *would* have been copied if there were space. In other words, it can be > sizeof(pin_path). 
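The underlying pattern, common to this fix and the libbpf fix that follows, as a standalone sketch (hypothetical helper name):

    #include <stdio.h>

    static int build_pin_path(char *buf, size_t sz,
                              const char *dir, const char *name)
    {
            int ret = snprintf(buf, sz, "%s/%s", dir, name);

            /* snprintf() reports the would-be length, so any ret >= sz
             * means the output was truncated; checking ret == sz misses
             * longer inputs entirely.
             */
            if (ret < 0 || (size_t)ret >= sz)
                    return -1;
            return 0;
    }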
Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests") Signed-off-by: Dan Carpenter Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/YtZ+aD/tZMkgOUw+@kili Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 941b0100bafa..ef6528b8084c 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -5338,7 +5338,7 @@ static void do_test_pprint(int test_num) ret = snprintf(pin_path, sizeof(pin_path), "%s/%s", "/sys/fs/bpf", test->map_name); - if (CHECK(ret == sizeof(pin_path), "pin_path %s/%s is too long", + if (CHECK(ret >= sizeof(pin_path), "pin_path %s/%s is too long", "/sys/fs/bpf", test->map_name)) { err = -1; goto done; -- cgit v1.2.3 From b77ffb30cfc5f58e957571d8541c6a7e3da19221 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jul 2022 12:51:28 +0300 Subject: libbpf: fix an snprintf() overflow check The snprintf() function returns the number of bytes it *would* have copied if there were enough space. So it can return > the sizeof(gen->attach_target). Fixes: 67234743736a ("libbpf: Generate loader program out of BPF ELF file.") Signed-off-by: Dan Carpenter Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/YtZ+oAySqIhFl6/J@kili Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 927745b08014..23f5c46708f8 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -533,7 +533,7 @@ void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, gen->attach_kind = kind; ret = snprintf(gen->attach_target, sizeof(gen->attach_target), "%s%s", prefix, attach_name); - if (ret == sizeof(gen->attach_target)) + if (ret >= sizeof(gen->attach_target)) gen->error = -ENOSPC; } -- cgit v1.2.3 From 68566a7cf56bf3148797c218ed45a9de078ef47c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:44 +0300 Subject: perf tools: Fix dso_id inode generation comparison Synthesized MMAP events have zero ino_generation, so do not compare them to DSOs with a real ino_generation otherwise we end up with a DSO without a build id. Fixes: 0e3149f86b99ddab ("perf dso: Move dso_id from 'struct map' to 'struct dso'") Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: kvm@vger.kernel.org Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220711093218.10967-2-adrian.hunter@intel.com [ Added clarification to the comment from Ian + more detailed explanation from Adrian ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dsos.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index b97366f77bbf..2bd23e4cf19e 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -23,8 +23,19 @@ static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) if (a->ino > b->ino) return -1; if (a->ino < b->ino) return 1; - if (a->ino_generation > b->ino_generation) return -1; - if (a->ino_generation < b->ino_generation) return 1; + /* + * Synthesized MMAP events have zero ino_generation, avoid comparing + * them with MMAP events with actual ino_generation. 
+ * + * I found it harmful because the mismatch resulted in a new + * dso that did not have a build ID whereas the original dso did have a + * build ID. The build ID was essential because the object was not found + * otherwise. - Adrian + */ + if (a->ino_generation && b->ino_generation) { + if (a->ino_generation > b->ino_generation) return -1; + if (a->ino_generation < b->ino_generation) return 1; + } return 0; } -- cgit v1.2.3 From 06c8580aa23ddc9b1c37b9c7c5e2cc504e460207 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Wed, 13 Jul 2022 22:17:17 -0500 Subject: memblock tests: change build options to run-time options Change verbose and movable node build options to run-time options. Movable node usage: $ ./main -m Or: $ ./main --movable-node Verbose usage: $ ./main -v Or: $ ./main --verbose Signed-off-by: Rebecca Mckeever Reviewed-by: David Hildenbrand Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/20220714031717.12258-1-remckee0@gmail.com --- tools/testing/memblock/Makefile | 5 -- tools/testing/memblock/linux/memory_hotplug.h | 8 +-- tools/testing/memblock/main.c | 2 + tools/testing/memblock/scripts/Makefile.include | 10 --- tools/testing/memblock/tests/common.c | 87 +++++++++++++++++++++---- tools/testing/memblock/tests/common.h | 10 +-- 6 files changed, 80 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/testing/memblock/Makefile b/tools/testing/memblock/Makefile index 9fde49ad73bd..246f7ac8489b 100644 --- a/tools/testing/memblock/Makefile +++ b/tools/testing/memblock/Makefile @@ -45,13 +45,8 @@ help: @echo ' clean - Remove generated files and symlinks in the directory' @echo '' @echo 'Configuration:' - @echo ' make VERBOSE=1 - enable verbose output, which includes the' - @echo ' names of functions being tested and the' - @echo ' number of test cases passing' @echo ' make MEMBLOCK_DEBUG=1 - enable memblock_dbg() messages' @echo ' make NUMA=1 - simulate enabled NUMA' - @echo ' make MOVABLE_NODE=1 - override `movable_node_is_enabled`' - @echo ' definition to simulate movable NUMA nodes' @echo ' make 32BIT_PHYS_ADDR_T=1 - Use 32 bit physical addresses' vpath %.c ../../lib diff --git a/tools/testing/memblock/linux/memory_hotplug.h b/tools/testing/memblock/linux/memory_hotplug.h index 47988765a219..dabe2c556858 100644 --- a/tools/testing/memblock/linux/memory_hotplug.h +++ b/tools/testing/memblock/linux/memory_hotplug.h @@ -7,13 +7,11 @@ #include #include +extern bool movable_node_enabled; + static inline bool movable_node_is_enabled(void) { -#ifdef MOVABLE_NODE - return true; -#else - return false; -#endif + return movable_node_enabled; } #endif diff --git a/tools/testing/memblock/main.c b/tools/testing/memblock/main.c index fb183c9e76d1..4ca1024342b1 100644 --- a/tools/testing/memblock/main.c +++ b/tools/testing/memblock/main.c @@ -3,9 +3,11 @@ #include "tests/alloc_api.h" #include "tests/alloc_helpers_api.h" #include "tests/alloc_nid_api.h" +#include "tests/common.h" int main(int argc, char **argv) { + parse_args(argc, argv); memblock_basic_checks(); memblock_alloc_checks(); memblock_alloc_helpers_checks(); diff --git a/tools/testing/memblock/scripts/Makefile.include b/tools/testing/memblock/scripts/Makefile.include index 4401f79bed4c..aa6d82d56a23 100644 --- a/tools/testing/memblock/scripts/Makefile.include +++ b/tools/testing/memblock/scripts/Makefile.include @@ -6,11 +6,6 @@ ifeq ($(NUMA), 1) CFLAGS += -D CONFIG_NUMA endif -# Simulate movable NUMA memory regions -ifeq ($(MOVABLE_NODE), 1) - CFLAGS += -D MOVABLE_NODE -endif - # Use 32 bit 
physical addresses. # Remember to install 32-bit version of dependencies. ifeq ($(32BIT_PHYS_ADDR_T), 1) @@ -18,11 +13,6 @@ ifeq ($(32BIT_PHYS_ADDR_T), 1) LDFLAGS += -m32 endif -# Enable verbose testing output -ifeq ($(VERBOSE), 1) - CFLAGS += -D VERBOSE -endif - # Enable memblock_dbg() messages ifeq ($(MEMBLOCK_DEBUG), 1) CFLAGS += -D MEMBLOCK_DEBUG diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index ebc06b4c3255..e43b2676af81 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "tests/common.h" #include +#include +#include +#include #define INIT_MEMBLOCK_REGIONS 128 #define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS @@ -11,6 +14,27 @@ static struct test_memory memory_block; static const char __maybe_unused *prefixes[PREFIXES_MAX]; static int __maybe_unused nr_prefixes; +static const char *short_opts = "mv"; +static const struct option long_opts[] = { + {"movable-node", 0, NULL, 'm'}, + {"verbose", 0, NULL, 'v'}, + {NULL, 0, NULL, 0} +}; + +static const char * const help_opts[] = { + "disallow allocations from regions marked as hotplugged\n\t\t\t" + "by simulating enabling the \"movable_node\" kernel\n\t\t\t" + "parameter", + "enable verbose output, which includes the name of the\n\t\t\t" + "memblock function being tested, the name of the test,\n\t\t\t" + "and whether the test passed or failed." +}; + +static int verbose; + +/* sets global variable returned by movable_node_is_enabled() stub */ +bool movable_node_enabled; + void reset_memblock_regions(void) { memset(memblock.memory.regions, 0, @@ -51,7 +75,39 @@ void dummy_physical_memory_cleanup(void) free(memory_block.base); } -#ifdef VERBOSE +static void usage(const char *prog) +{ + BUILD_BUG_ON(ARRAY_SIZE(help_opts) != ARRAY_SIZE(long_opts) - 1); + + printf("Usage: %s [-%s]\n", prog, short_opts); + + for (int i = 0; long_opts[i].name; i++) { + printf(" -%c, --%-12s\t%s\n", long_opts[i].val, + long_opts[i].name, help_opts[i]); + } + + exit(1); +} + +void parse_args(int argc, char **argv) +{ + int c; + + while ((c = getopt_long_only(argc, argv, short_opts, long_opts, + NULL)) != -1) { + switch (c) { + case 'm': + movable_node_enabled = true; + break; + case 'v': + verbose = 1; + break; + default: + usage(argv[0]); + } + } +} + void print_prefixes(const char *postfix) { for (int i = 0; i < nr_prefixes; i++) @@ -61,25 +117,31 @@ void print_prefixes(const char *postfix) void test_fail(void) { - ksft_test_result_fail(": "); - print_prefixes("failed\n"); + if (verbose) { + ksft_test_result_fail(": "); + print_prefixes("failed\n"); + } } void test_pass(void) { - ksft_test_result_pass(": "); - print_prefixes("passed\n"); + if (verbose) { + ksft_test_result_pass(": "); + print_prefixes("passed\n"); + } } void test_print(const char *fmt, ...) 
{ - int saved_errno = errno; - va_list args; - - va_start(args, fmt); - errno = saved_errno; - vprintf(fmt, args); - va_end(args); + if (verbose) { + int saved_errno = errno; + va_list args; + + va_start(args, fmt); + errno = saved_errno; + vprintf(fmt, args); + va_end(args); + } } void prefix_reset(void) @@ -102,4 +164,3 @@ void prefix_pop(void) nr_prefixes--; } } -#endif /* VERBOSE */ diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 46de86a755f3..3e7f23d341d7 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -70,22 +70,14 @@ void reset_memblock_attributes(void); void setup_memblock(void); void dummy_physical_memory_init(void); void dummy_physical_memory_cleanup(void); +void parse_args(int argc, char **argv); -#ifdef VERBOSE void test_fail(void); void test_pass(void); void test_print(const char *fmt, ...); void prefix_reset(void); void prefix_push(const char *prefix); void prefix_pop(void); -#else -static inline void test_fail(void) {} -static inline void test_pass(void) {} -static inline void test_print(const char *fmt, ...) {} -static inline void prefix_reset(void) {} -static inline void prefix_push(const char *prefix) {} -static inline void prefix_pop(void) {} -#endif /* VERBOSE */ static inline void test_pass_pop(void) { -- cgit v1.2.3 From 9b31e60800d8fa69027baf9ec7f03a0c5b145079 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Jul 2022 11:55:49 -0700 Subject: tools: Fixed MIPS builds due to struct flock re-definition Building perf for MIPS failed after 9f79b8b72339 ("uapi: simplify __ARCH_FLOCK{,64}_PAD a little") with the following error: CC /home/fainelli/work/buildroot/output/bmips/build/linux-custom/tools/perf/trace/beauty/fcntl.o In file included from ../../../../host/mipsel-buildroot-linux-gnu/sysroot/usr/include/asm/fcntl.h:77, from ../include/uapi/linux/fcntl.h:5, from trace/beauty/fcntl.c:10: ../include/uapi/asm-generic/fcntl.h:188:8: error: redefinition of 'struct flock' struct flock { ^~~~~ In file included from ../include/uapi/linux/fcntl.h:5, from trace/beauty/fcntl.c:10: ../../../../host/mipsel-buildroot-linux-gnu/sysroot/usr/include/asm/fcntl.h:63:8: note: originally defined here struct flock { ^~~~~ This is due to the local copy under tools/include/uapi/asm-generic/fcntl.h including the toolchain's kernel headers, which already define 'struct flock' and define HAVE_ARCH_STRUCT_FLOCK so that future inclusions can decide whether re-defining 'struct flock' is appropriate or not. Make sure we do not re-define 'struct flock' when HAVE_ARCH_STRUCT_FLOCK is already defined.
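The guard works as in the following sketch of an arch-specific uapi header (illustrative only; the real MIPS header has more members and padding):

#define HAVE_ARCH_STRUCT_FLOCK
struct flock {
	short l_type;
	short l_whence;
	/* arch-specific fields and padding follow in the real header */
};

With HAVE_ARCH_STRUCT_FLOCK defined, the #ifndef added below compiles out the generic definition.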
Fixes: 9f79b8b72339 ("uapi: simplify __ARCH_FLOCK{,64}_PAD a little") Signed-off-by: Florian Fainelli Reviewed-by: Christoph Hellwig [arnd: sync with include/uapi/asm-generic/fcntl.h as well] Signed-off-by: Arnd Bergmann --- include/uapi/asm-generic/fcntl.h | 2 ++ tools/include/uapi/asm-generic/fcntl.h | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index f13d37b60775..1ecdb911add8 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -192,6 +192,7 @@ struct f_owner_ex { #define F_LINUX_SPECIFIC_BASE 1024 +#ifndef HAVE_ARCH_STRUCT_FLOCK struct flock { short l_type; short l_whence; @@ -216,5 +217,6 @@ struct flock64 { __ARCH_FLOCK64_PAD #endif }; +#endif /* HAVE_ARCH_STRUCT_FLOCK */ #endif /* _ASM_GENERIC_FCNTL_H */ diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h index 0197042b7dfb..1ecdb911add8 100644 --- a/tools/include/uapi/asm-generic/fcntl.h +++ b/tools/include/uapi/asm-generic/fcntl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_GENERIC_FCNTL_H #define _ASM_GENERIC_FCNTL_H @@ -90,7 +91,7 @@ /* a horrid kludge trying to make sure that this will fail on old kernels */ #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) +#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) #ifndef O_NDELAY #define O_NDELAY O_NONBLOCK @@ -115,11 +116,13 @@ #define F_GETSIG 11 /* for sockets. */ #endif +#if __BITS_PER_LONG == 32 || defined(__KERNEL__) #ifndef F_GETLK64 #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 #endif +#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 @@ -178,6 +181,10 @@ struct f_owner_ex { blocking */ #define LOCK_UN 8 /* remove lock */ +/* + * LOCK_MAND support has been removed from the kernel. We leave the symbols + * here to not break legacy builds, but these should not be used in new code. + */ #define LOCK_MAND 32 /* This is a mandatory flock ... */ #define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ @@ -185,6 +192,7 @@ struct f_owner_ex { #define F_LINUX_SPECIFIC_BASE 1024 +#ifndef HAVE_ARCH_STRUCT_FLOCK struct flock { short l_type; short l_whence; @@ -209,5 +217,6 @@ struct flock64 { __ARCH_FLOCK64_PAD #endif }; +#endif /* HAVE_ARCH_STRUCT_FLOCK */ #endif /* _ASM_GENERIC_FCNTL_H */ -- cgit v1.2.3 From f63731e18e8d8350e05b0176e39a76639f6483c7 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Tue, 19 Jul 2022 19:06:01 +0800 Subject: selftests: gpio: fix include path to kernel headers for out of tree builds When building selftests out of the kernel tree, the include path for gpio.h is incorrect and the build falls back to the system includes, which may be outdated. Add KHDR_INCLUDES to the CFLAGS to include the gpio.h from the build tree.
Fixes: 4f4d0af7b2d9 ("selftests: gpio: restore CFLAGS options") Reported-by: kernel test robot Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- tools/testing/selftests/gpio/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 71b306602368..616ed4019655 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -3,6 +3,6 @@ TEST_PROGS := gpio-mockup.sh gpio-sim.sh TEST_FILES := gpio-mockup-sysfs.sh TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info gpio-line-name -CFLAGS += -O2 -g -Wall -I../../../../usr/include/ +CFLAGS += -O2 -g -Wall -I../../../../usr/include/ $(KHDR_INCLUDES) include ../lib.mk -- cgit v1.2.3 From 163dac34d7a22b4fd980e4d00459a07090f8b9af Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:45 +0300 Subject: perf tools: Export dsos__for_each_with_build_id() Export dsos__for_each_with_build_id() so it can be used elsewhere. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/build-id.c | 6 ------ tools/perf/util/dso.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 328668f38c69..4c9093b64d1f 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -300,12 +300,6 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size, return __dso__build_id_filename(dso, bf, size, is_debug, is_kallsyms); } -#define dsos__for_each_with_build_id(pos, head) \ - list_for_each_entry(pos, head, node) \ - if (!pos->has_build_id) \ - continue; \ - else - static int write_buildid(const char *name, size_t name_len, struct build_id *bid, pid_t pid, u16 misc, struct feat_fd *fd) { diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 97047a11282b..66981c7a9a18 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -227,6 +227,12 @@ struct dso { #define dso__for_each_symbol(dso, pos, n) \ symbols__for_each_entry(&(dso)->symbols, pos, n) +#define dsos__for_each_with_build_id(pos, head) \ + list_for_each_entry(pos, head, node) \ + if (!pos->has_build_id) \ + continue; \ + else + static inline void dso__set_loaded(struct dso *dso) { dso->loaded = true; -- cgit v1.2.3 From f8bcf1e223ad08a4f0d45369a749c58c9bcd5f3c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:46 +0300 Subject: perf ordered_events: Add ordered_events__last_flush_time() Allow callers to get the ordered_events last flush timestamp. This is needed in perf inject to obey finished-round ordering when injecting additional events (e.g. from a guest perf.data file) with timestamps. Any additional events that have timestamps before the last flush time must be injected before the corresponding FINISHED_ROUND event. 
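As a sketch of the intended use (the wrapper name below is hypothetical, not part of this patch; assumes perf's ordered-events.h):

/*
 * An injected event older than the last flush must be emitted before
 * the next PERF_RECORD_FINISHED_ROUND, or it would violate
 * finished-round ordering.
 */
static bool must_emit_before_round(struct ordered_events *oe, u64 timestamp)
{
	return timestamp < ordered_events__last_flush_time(oe);
}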
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ordered-events.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h index 0b05c3c0aeaa..8febbd7c98ca 100644 --- a/tools/perf/util/ordered-events.h +++ b/tools/perf/util/ordered-events.h @@ -75,4 +75,10 @@ void ordered_events__set_copy_on_queue(struct ordered_events *oe, bool copy) { oe->copy_on_queue = copy; } + +static inline u64 ordered_events__last_flush_time(struct ordered_events *oe) +{ + return oe->last_flush; +} + #endif /* __ORDERED_EVENTS_H */ -- cgit v1.2.3 From eddc6e3f6684597924f368cb564d7432b2f7d23e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:47 +0300 Subject: perf tools: Export perf_event__process_finished_round() Export perf_event__process_finished_round() so it can be used elsewhere. This is needed in perf inject to obey finished-round ordering. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 12 ++++-------- tools/perf/util/session.h | 4 ++++ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 37f833c3c81b..4c9513bc6d89 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -374,10 +374,6 @@ static int process_finished_round_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_finished_round(struct perf_tool *tool, - union perf_event *event, - struct ordered_events *oe); - static int skipn(int fd, off_t n) { char buf[4096]; @@ -534,7 +530,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->build_id = process_event_op2_stub; if (tool->finished_round == NULL) { if (tool->ordered_events) - tool->finished_round = process_finished_round; + tool->finished_round = perf_event__process_finished_round; else tool->finished_round = process_finished_round_stub; } @@ -1069,9 +1065,9 @@ static perf_event__swap_op perf_event__swap_ops[] = { * Flush every events below timestamp 7 * etc... */ -static int process_finished_round(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct ordered_events *oe) +int perf_event__process_finished_round(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct ordered_events *oe) { if (dump_trace) fprintf(stdout, "\n"); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 34500a3da735..be5871ea558f 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -155,4 +155,8 @@ int perf_session__deliver_synth_event(struct perf_session *session, int perf_event__process_id_index(struct perf_session *session, union perf_event *event); +int perf_event__process_finished_round(struct perf_tool *tool, + union perf_event *event, + struct ordered_events *oe); + #endif /* __PERF_SESSION_H */ -- cgit v1.2.3 From 0a64de04c94ad4285120bed0dfb382ea98d6d499 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:48 +0300 Subject: perf tools: Factor out evsel__id_hdr_size() Factor out evsel__id_hdr_size() so it can be reused. 
This is needed by perf inject. When injecting events from a guest perf.data file, there is a possibility that the sample ID numbers conflict. To re-write an ID sample, the old one needs to be removed first, which means determining how big it is with evsel__id_hdr_size() and then subtracting that from the event size. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 28 +--------------------------- tools/perf/util/evsel.c | 26 ++++++++++++++++++++++++++ tools/perf/util/evsel.h | 2 ++ 3 files changed, 29 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 48af7d379d82..03fbe151b0c4 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1244,34 +1244,8 @@ bool evlist__valid_read_format(struct evlist *evlist) u16 evlist__id_hdr_size(struct evlist *evlist) { struct evsel *first = evlist__first(evlist); - struct perf_sample *data; - u64 sample_type; - u16 size = 0; - if (!first->core.attr.sample_id_all) - goto out; - - sample_type = first->core.attr.sample_type; - - if (sample_type & PERF_SAMPLE_TID) - size += sizeof(data->tid) * 2; - - if (sample_type & PERF_SAMPLE_TIME) - size += sizeof(data->time); - - if (sample_type & PERF_SAMPLE_ID) - size += sizeof(data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - size += sizeof(data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - size += sizeof(data->cpu) * 2; - - if (sample_type & PERF_SAMPLE_IDENTIFIER) - size += sizeof(data->id); -out: - return size; + return first->core.attr.sample_id_all ? evsel__id_hdr_size(first) : 0; } bool evlist__valid_sample_id_all(struct evlist *evlist) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a67cc3f2fa74..9a30ccb7b104 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2724,6 +2724,32 @@ int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event, return 0; } +u16 evsel__id_hdr_size(struct evsel *evsel) +{ + u64 sample_type = evsel->core.attr.sample_type; + u16 size = 0; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(u64); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(u64); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(u64); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(u64); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(u64); + + if (sample_type & PERF_SAMPLE_IDENTIFIER) + size += sizeof(u64); + + return size; +} + struct tep_format_field *evsel__field(struct evsel *evsel, const char *name) { return tep_find_field(evsel->tp_format, name); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 92bed8e2f7d8..699448f2bc2b 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -381,6 +381,8 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event, u64 *timestamp); +u16 evsel__id_hdr_size(struct evsel *evsel); + static inline struct evsel *evsel__next(struct evsel *evsel) { return list_entry(evsel->core.node.next, struct evsel, core.node); -- cgit v1.2.3 From 1ee94463e9ac4721cfef27ffcd6f97ab026e3aac Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:49 +0300 Subject: perf tools: Add perf_event__synthesize_id_sample() Add perf_event__synthesize_id_sample() 
to enable the synthesis of ID samples. This is needed by perf inject. When injecting events from a guest perf.data file, there is a possibility that the sample ID numbers conflict. In that case, perf_event__synthesize_id_sample() can be used to re-write the ID sample. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 47 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/synthetic-events.h | 1 + 2 files changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index fe5db4bf0042..ed9623702f34 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1712,6 +1712,53 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo return 0; } +int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_sample *sample) +{ + __u64 *start = array; + + /* + * used for cross-endian analysis. See git commit 65014ab3 + * for why this goofiness is needed. + */ + union u64_swap u; + + if (type & PERF_SAMPLE_TID) { + u.val32[0] = sample->pid; + u.val32[1] = sample->tid; + *array = u.val64; + array++; + } + + if (type & PERF_SAMPLE_TIME) { + *array = sample->time; + array++; + } + + if (type & PERF_SAMPLE_ID) { + *array = sample->id; + array++; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + *array = sample->stream_id; + array++; + } + + if (type & PERF_SAMPLE_CPU) { + u.val32[0] = sample->cpu; + u.val32[1] = 0; + *array = u.val64; + array++; + } + + if (type & PERF_SAMPLE_IDENTIFIER) { + *array = sample->id; + array++; + } + + return (void *)array - (void *)start; +} + int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine) { diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h index 78a0450db164..b136ec3ec95d 100644 --- a/tools/perf/util/synthetic-events.h +++ b/tools/perf/util/synthetic-events.h @@ -55,6 +55,7 @@ int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evs int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process); int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine); +int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_sample *sample); int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data); int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -- cgit v1.2.3 From 57190e38b00d25abba9631b8db53740fce4baced Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:50 +0300 Subject: perf script: Add --dump-unsorted-raw-trace option When reviewing the results of perf inject, it is useful to be able to see the events in the order they appear in the file. 
So add --dump-unsorted-raw-trace option to do an unsorted dump. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-8-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 3 +++ tools/perf/builtin-script.c | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 1a557ff8f210..e250ff5566cf 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -79,6 +79,9 @@ OPTIONS --dump-raw-trace=:: Display verbose dump of the trace data. +--dump-unsorted-raw-trace=:: + Same as --dump-raw-trace but not sorted in time order. + -L:: --Latency=:: Show latency attributes (irqs/preemption disabled, etc). diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 7cf21ab16f4f..4b00a50faf00 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3746,6 +3746,7 @@ int cmd_script(int argc, const char **argv) bool header = false; bool header_only = false; bool script_started = false; + bool unsorted_dump = false; char *rec_script_path = NULL; char *rep_script_path = NULL; struct perf_session *session; @@ -3794,6 +3795,8 @@ int cmd_script(int argc, const char **argv) const struct option options[] = { OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), + OPT_BOOLEAN(0, "dump-unsorted-raw-trace", &unsorted_dump, + "dump unsorted raw trace in ASCII"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('L', "Latency", &latency_format, @@ -3956,6 +3959,11 @@ int cmd_script(int argc, const char **argv) data.path = input_name; data.force = symbol_conf.force; + if (unsorted_dump) { + dump_trace = true; + script.tool.ordered_events = false; + } + if (symbol__validate_sym_arguments()) return -1; -- cgit v1.2.3 From 15fe03621d9df90c23de7a2099b692e2da344cde Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:51 +0300 Subject: perf buildid-cache: Add guestmount'd files to the build ID cache When the guestmount option is used, a guest machine's file system mount point is recorded in machine->root_dir. perf already iterates guest machines when adding files to the build ID cache, but does not take machine->root_dir into account. Use machine->root_dir to find files for guest build IDs, and add them to the build ID cache using the "proper" name i.e. relative to the guest root directory not the host root directory. 
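As a sketch of the path handling (variable names illustrative; assumes the usual perf headers):

char host_path[PATH_MAX];

/*
 * With guestmount, the guest's "/" appears at machine->root_dir, so the
 * file to hash lives under root_dir, while the cache entry keeps the
 * guest-relative ("proper") name.
 */
snprintf(host_path, sizeof(host_path), "%s/%s",
	 machine->root_dir, dso->long_name);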
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-9-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/build-id.c | 67 +++++++++++++++++++++++++++++++++++----------- tools/perf/util/build-id.h | 16 ++++++++--- 2 files changed, 63 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 4c9093b64d1f..7c9f441936ee 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -625,9 +625,12 @@ static int build_id_cache__add_sdt_cache(const char *sbuild_id, #endif static char *build_id_cache__find_debug(const char *sbuild_id, - struct nsinfo *nsi) + struct nsinfo *nsi, + const char *root_dir) { + const char *dirname = "/usr/lib/debug/.build-id/"; char *realname = NULL; + char dirbuf[PATH_MAX]; char *debugfile; struct nscookie nsc; size_t len = 0; @@ -636,8 +639,12 @@ static char *build_id_cache__find_debug(const char *sbuild_id, if (!debugfile) goto out; - len = __symbol__join_symfs(debugfile, PATH_MAX, - "/usr/lib/debug/.build-id/"); + if (root_dir) { + path__join(dirbuf, PATH_MAX, root_dir, dirname); + dirname = dirbuf; + } + + len = __symbol__join_symfs(debugfile, PATH_MAX, dirname); snprintf(debugfile + len, PATH_MAX - len, "%.2s/%s.debug", sbuild_id, sbuild_id + 2); @@ -668,14 +675,18 @@ out: int build_id_cache__add(const char *sbuild_id, const char *name, const char *realname, - struct nsinfo *nsi, bool is_kallsyms, bool is_vdso) + struct nsinfo *nsi, bool is_kallsyms, bool is_vdso, + const char *proper_name, const char *root_dir) { const size_t size = PATH_MAX; char *filename = NULL, *dir_name = NULL, *linkname = zalloc(size), *tmp; char *debugfile = NULL; int err = -1; - dir_name = build_id_cache__cachedir(sbuild_id, name, nsi, is_kallsyms, + if (!proper_name) + proper_name = name; + + dir_name = build_id_cache__cachedir(sbuild_id, proper_name, nsi, is_kallsyms, is_vdso); if (!dir_name) goto out_free; @@ -715,7 +726,7 @@ build_id_cache__add(const char *sbuild_id, const char *name, const char *realnam */ if (!is_kallsyms && !is_vdso && strncmp(".ko", name + strlen(name) - 3, 3)) { - debugfile = build_id_cache__find_debug(sbuild_id, nsi); + debugfile = build_id_cache__find_debug(sbuild_id, nsi, root_dir); if (debugfile) { zfree(&filename); if (asprintf(&filename, "%s/%s", dir_name, @@ -781,8 +792,9 @@ out_free: return err; } -int build_id_cache__add_s(const char *sbuild_id, const char *name, - struct nsinfo *nsi, bool is_kallsyms, bool is_vdso) +int __build_id_cache__add_s(const char *sbuild_id, const char *name, + struct nsinfo *nsi, bool is_kallsyms, bool is_vdso, + const char *proper_name, const char *root_dir) { char *realname = NULL; int err = -1; @@ -796,8 +808,8 @@ int build_id_cache__add_s(const char *sbuild_id, const char *name, goto out_free; } - err = build_id_cache__add(sbuild_id, name, realname, nsi, is_kallsyms, is_vdso); - + err = build_id_cache__add(sbuild_id, name, realname, nsi, + is_kallsyms, is_vdso, proper_name, root_dir); out_free: if (!is_kallsyms) free(realname); @@ -806,14 +818,16 @@ out_free: static int build_id_cache__add_b(const struct build_id *bid, const char *name, struct nsinfo *nsi, - bool is_kallsyms, bool is_vdso) + bool is_kallsyms, bool is_vdso, + const char *proper_name, + const char *root_dir) { char sbuild_id[SBUILD_ID_SIZE]; build_id__sprintf(bid, sbuild_id); - return build_id_cache__add_s(sbuild_id, name, nsi, 
is_kallsyms, - is_vdso); + return __build_id_cache__add_s(sbuild_id, name, nsi, is_kallsyms, + is_vdso, proper_name, root_dir); } bool build_id_cache__cached(const char *sbuild_id) @@ -896,6 +910,10 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine, bool is_kallsyms = dso__is_kallsyms(dso); bool is_vdso = dso__is_vdso(dso); const char *name = dso->long_name; + const char *proper_name = NULL; + const char *root_dir = NULL; + char *allocated_name = NULL; + int ret = 0; if (!dso->has_build_id) return 0; @@ -905,11 +923,28 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine, name = machine->mmap_name; } + if (!machine__is_host(machine)) { + if (*machine->root_dir) { + root_dir = machine->root_dir; + ret = asprintf(&allocated_name, "%s/%s", root_dir, name); + if (ret < 0) + return ret; + proper_name = name; + name = allocated_name; + } else if (is_kallsyms) { + /* Cannot get guest kallsyms */ + return 0; + } + } + if (!is_kallsyms && dso__build_id_mismatch(dso, name)) - return 0; + goto out_free; - return build_id_cache__add_b(&dso->bid, name, dso->nsinfo, - is_kallsyms, is_vdso); + ret = build_id_cache__add_b(&dso->bid, name, dso->nsinfo, + is_kallsyms, is_vdso, proper_name, root_dir); +out_free: + free(allocated_name); + return ret; } static int diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index c19617151670..4e3a1169379b 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -66,10 +66,18 @@ int build_id_cache__list_build_ids(const char *pathname, struct nsinfo *nsi, struct strlist **result); bool build_id_cache__cached(const char *sbuild_id); int build_id_cache__add(const char *sbuild_id, const char *name, const char *realname, - struct nsinfo *nsi, bool is_kallsyms, bool is_vdso); -int build_id_cache__add_s(const char *sbuild_id, - const char *name, struct nsinfo *nsi, - bool is_kallsyms, bool is_vdso); + struct nsinfo *nsi, bool is_kallsyms, bool is_vdso, + const char *proper_name, const char *root_dir); +int __build_id_cache__add_s(const char *sbuild_id, + const char *name, struct nsinfo *nsi, + bool is_kallsyms, bool is_vdso, + const char *proper_name, const char *root_dir); +static inline int build_id_cache__add_s(const char *sbuild_id, + const char *name, struct nsinfo *nsi, + bool is_kallsyms, bool is_vdso) +{ + return __build_id_cache__add_s(sbuild_id, name, nsi, is_kallsyms, is_vdso, NULL, NULL); +} int build_id_cache__remove_s(const char *sbuild_id); extern char buildid_dir[]; -- cgit v1.2.3 From c1fd5b7d8aed8104d5189c3d55545d67f9149bb6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:52 +0300 Subject: perf buildid-cache: Do not require purge files to also be in the file system realname() returns NULL if the file is not in the file system, but we can still remove it from the build ID cache in that case, so continue and attempt the purge with the name provided. 
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-10-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/build-id.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 7c9f441936ee..9e176146eb10 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -561,14 +561,11 @@ char *build_id_cache__cachedir(const char *sbuild_id, const char *name, char *realname = (char *)name, *filename; bool slash = is_kallsyms || is_vdso; - if (!slash) { + if (!slash) realname = nsinfo__realpath(name, nsi); - if (!realname) - return NULL; - } if (asprintf(&filename, "%s%s%s%s%s", buildid_dir, slash ? "/" : "", - is_vdso ? DSO__NAME_VDSO : realname, + is_vdso ? DSO__NAME_VDSO : (realname ? realname : name), sbuild_id ? "/" : "", sbuild_id ?: "") < 0) filename = NULL; -- cgit v1.2.3 From b47bb18661eaed30790d70b7563a5220b3c59594 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:53 +0300 Subject: perf tools: Add machine_pid and vcpu to id_index When injecting events from a guest perf.data file, the events will have separate sample ID numbers. These ID numbers can then be used to determine which machine an event belongs to. To facilitate that, add machine_pid and vcpu to id_index records. For backward compatibility, these are added at the end of the record, and the length of the record is used to determine if they are present or not. Note, this is needed because the events from a guest perf.data file contain the pid/tid of the process running at that time inside the VM not the pid/tid of the (QEMU) hypervisor thread. So a way is needed to relate guest events back to the guest machine and VCPU, and using sample ID numbers for that is relatively simple and convenient. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-11-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/internal/evsel.h | 4 +++ tools/lib/perf/include/perf/event.h | 5 ++++ tools/perf/util/session.c | 40 ++++++++++++++++++++++---- tools/perf/util/synthetic-events.c | 51 +++++++++++++++++++++++++-------- tools/perf/util/synthetic-events.h | 1 + 5 files changed, 84 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h index 2a912a1f1989..a99a75d9e78f 100644 --- a/tools/lib/perf/include/internal/evsel.h +++ b/tools/lib/perf/include/internal/evsel.h @@ -30,6 +30,10 @@ struct perf_sample_id { struct perf_cpu cpu; pid_t tid; + /* Guest machine pid and VCPU, valid only if machine_pid is non-zero */ + pid_t machine_pid; + struct perf_cpu vcpu; + /* Holds total ID period value for PERF_SAMPLE_READ processing. 
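A reader can detect the extension purely from the record length, as in this sketch (mirroring the parsing added below):

size_t sz = ie->header.size - sizeof(*ie);
size_t e1_sz = sizeof(struct id_index_entry);
size_t e2_sz = sizeof(struct id_index_entry_2);

/*
 * Old producers wrote nr * e1_sz bytes of entries; new producers
 * append nr * e2_sz bytes of id_index_entry_2 after them.
 */
bool has_e2 = sz >= ie->nr * (e1_sz + e2_sz);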
*/ u64 period; }; diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 9f7ca070da87..c2dbd3e88885 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -237,6 +237,11 @@ struct id_index_entry { __u64 tid; }; +struct id_index_entry_2 { + __u64 machine_pid; + __u64 vcpu; +}; + struct perf_record_id_index { struct perf_event_header header; __u64 nr; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4c9513bc6d89..5141fe164e97 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2756,18 +2756,35 @@ int perf_event__process_id_index(struct perf_session *session, { struct evlist *evlist = session->evlist; struct perf_record_id_index *ie = &event->id_index; + size_t sz = ie->header.size - sizeof(*ie); size_t i, nr, max_nr; + size_t e1_sz = sizeof(struct id_index_entry); + size_t e2_sz = sizeof(struct id_index_entry_2); + size_t etot_sz = e1_sz + e2_sz; + struct id_index_entry_2 *e2; - max_nr = (ie->header.size - sizeof(struct perf_record_id_index)) / - sizeof(struct id_index_entry); + max_nr = sz / e1_sz; nr = ie->nr; - if (nr > max_nr) + if (nr > max_nr) { + printf("Too big: nr %zu max_nr %zu\n", nr, max_nr); return -EINVAL; + } + + if (sz >= nr * etot_sz) { + max_nr = sz / etot_sz; + if (nr > max_nr) { + printf("Too big2: nr %zu max_nr %zu\n", nr, max_nr); + return -EINVAL; + } + e2 = (void *)ie + sizeof(*ie) + nr * e1_sz; + } else { + e2 = NULL; + } if (dump_trace) fprintf(stdout, " nr: %zu\n", nr); - for (i = 0; i < nr; i++) { + for (i = 0; i < nr; i++, (e2 ? e2++ : 0)) { struct id_index_entry *e = &ie->entries[i]; struct perf_sample_id *sid; @@ -2775,15 +2792,28 @@ int perf_event__process_id_index(struct perf_session *session, fprintf(stdout, " ... 
id: %"PRI_lu64, e->id); fprintf(stdout, " idx: %"PRI_lu64, e->idx); fprintf(stdout, " cpu: %"PRI_ld64, e->cpu); - fprintf(stdout, " tid: %"PRI_ld64"\n", e->tid); + fprintf(stdout, " tid: %"PRI_ld64, e->tid); + if (e2) { + fprintf(stdout, " machine_pid: %"PRI_ld64, e2->machine_pid); + fprintf(stdout, " vcpu: %"PRI_lu64"\n", e2->vcpu); + } else { + fprintf(stdout, "\n"); + } } sid = evlist__id2sid(evlist, e->id); if (!sid) return -ENOENT; + sid->idx = e->idx; sid->cpu.cpu = e->cpu; sid->tid = e->tid; + + if (!e2) + continue; + + sid->machine_pid = e2->machine_pid; + sid->vcpu.cpu = e2->vcpu; } return 0; } diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index ed9623702f34..2ae59c03ae77 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1759,19 +1759,26 @@ int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_s return (void *)array - (void *)start; } -int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, - struct evlist *evlist, struct machine *machine) +int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, + struct evlist *evlist, struct machine *machine, size_t from) { union perf_event *ev; struct evsel *evsel; - size_t nr = 0, i = 0, sz, max_nr, n; + size_t nr = 0, i = 0, sz, max_nr, n, pos; + size_t e1_sz = sizeof(struct id_index_entry); + size_t e2_sz = sizeof(struct id_index_entry_2); + size_t etot_sz = e1_sz + e2_sz; + bool e2_needed = false; int err; - max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) / - sizeof(struct id_index_entry); + max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) / etot_sz; - evlist__for_each_entry(evlist, evsel) + pos = 0; + evlist__for_each_entry(evlist, evsel) { + if (pos++ < from) + continue; nr += evsel->core.ids; + } if (!nr) return 0; @@ -1779,31 +1786,38 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_ pr_debug2("Synthesizing id index\n"); n = nr > max_nr ? max_nr : nr; - sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry); + sz = sizeof(struct perf_record_id_index) + n * etot_sz; ev = zalloc(sz); if (!ev) return -ENOMEM; + sz = sizeof(struct perf_record_id_index) + n * e1_sz; + ev->id_index.header.type = PERF_RECORD_ID_INDEX; - ev->id_index.header.size = sz; ev->id_index.nr = n; + pos = 0; evlist__for_each_entry(evlist, evsel) { u32 j; - for (j = 0; j < evsel->core.ids; j++) { + if (pos++ < from) + continue; + for (j = 0; j < evsel->core.ids; j++, i++) { struct id_index_entry *e; + struct id_index_entry_2 *e2; struct perf_sample_id *sid; if (i >= n) { + ev->id_index.header.size = sz + (e2_needed ? n * e2_sz : 0); err = process(tool, ev, NULL, machine); if (err) goto out_err; nr -= n; i = 0; + e2_needed = false; } - e = &ev->id_index.entries[i++]; + e = &ev->id_index.entries[i]; e->id = evsel->core.id[j]; @@ -1816,11 +1830,18 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_ e->idx = sid->idx; e->cpu = sid->cpu.cpu; e->tid = sid->tid; + + if (sid->machine_pid) + e2_needed = true; + + e2 = (void *)ev + sz; + e2[i].machine_pid = sid->machine_pid; + e2[i].vcpu = sid->vcpu.cpu; } } - sz = sizeof(struct perf_record_id_index) + nr * sizeof(struct id_index_entry); - ev->id_index.header.size = sz; + sz = sizeof(struct perf_record_id_index) + nr * e1_sz; + ev->id_index.header.size = sz + (e2_needed ? 
nr * e2_sz : 0); ev->id_index.nr = nr; err = process(tool, ev, NULL, machine); @@ -1830,6 +1851,12 @@ out_err: return err; } +int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, + struct evlist *evlist, struct machine *machine) +{ + return __perf_event__synthesize_id_index(tool, process, evlist, machine, 0); +} + int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, struct target *target, struct perf_thread_map *threads, perf_event__handler_t process, bool needs_mmap, diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h index b136ec3ec95d..81cb3d6af0b9 100644 --- a/tools/perf/util/synthetic-events.h +++ b/tools/perf/util/synthetic-events.h @@ -55,6 +55,7 @@ int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evs int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process); int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine); +int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from); int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_sample *sample); int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data); -- cgit v1.2.3 From ff7a78c210ed90bed915df55cc56876d4151c5ac Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:54 +0300 Subject: perf session: Create guest machines from id_index Now that id_index has machine_pid, use it to create guest machines. Create the guest machines with an idle thread because guest events for "swapper" will be possible. 
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-12-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5141fe164e97..1af981d5ad3c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2751,6 +2751,24 @@ void perf_session__fprintf_info(struct perf_session *session, FILE *fp, fprintf(fp, "# ========\n#\n"); } +static int perf_session__register_guest(struct perf_session *session, pid_t machine_pid) +{ + struct machine *machine = machines__findnew(&session->machines, machine_pid); + struct thread *thread; + + if (!machine) + return -ENOMEM; + + machine->single_address_space = session->machines.host.single_address_space; + + thread = machine__idle_thread(machine); + if (!thread) + return -ENOMEM; + thread__put(thread); + + return 0; +} + int perf_event__process_id_index(struct perf_session *session, union perf_event *event) { @@ -2762,6 +2780,7 @@ int perf_event__process_id_index(struct perf_session *session, size_t e2_sz = sizeof(struct id_index_entry_2); size_t etot_sz = e1_sz + e2_sz; struct id_index_entry_2 *e2; + pid_t last_pid = 0; max_nr = sz / e1_sz; nr = ie->nr; @@ -2787,6 +2806,7 @@ int perf_event__process_id_index(struct perf_session *session, for (i = 0; i < nr; i++, (e2 ? e2++ : 0)) { struct id_index_entry *e = &ie->entries[i]; struct perf_sample_id *sid; + int ret; if (dump_trace) { fprintf(stdout, " ... id: %"PRI_lu64, e->id); @@ -2814,6 +2834,17 @@ int perf_event__process_id_index(struct perf_session *session, sid->machine_pid = e2->machine_pid; sid->vcpu.cpu = e2->vcpu; + + if (!sid->machine_pid) + continue; + + if (sid->machine_pid != last_pid) { + ret = perf_session__register_guest(session, sid->machine_pid); + if (ret) + return ret; + last_pid = sid->machine_pid; + perf_guest = true; + } } return 0; } -- cgit v1.2.3 From 797efbc523b37de29dc533a8561d34b97deb42e4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:55 +0300 Subject: perf tools: Add guest_cpu to hypervisor threads It is possible to know which guest machine was running at a point in time based on the PID of the currently running host thread. That is, perf identifies guest machines by the PID of the hypervisor. To determine the guest CPU, put it on the hypervisor (QEMU) thread for that VCPU. This is done when processing the id_index which provides the necessary information. 
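Once registered, the guest machine can later be resolved from a sample's machine_pid, e.g. (sketch):

/* machines__find() is the existing lookup helper. */
struct machine *gm = machines__find(&session->machines, machine_pid);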
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-13-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 18 ++++++++++++++++++ tools/perf/util/thread.c | 1 + tools/perf/util/thread.h | 1 + 3 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 1af981d5ad3c..91a091c35945 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2769,6 +2769,20 @@ static int perf_session__register_guest(struct perf_session *session, pid_t mach return 0; } +static int perf_session__set_guest_cpu(struct perf_session *session, pid_t pid, + pid_t tid, int guest_cpu) +{ + struct machine *machine = &session->machines.host; + struct thread *thread = machine__findnew_thread(machine, pid, tid); + + if (!thread) + return -ENOMEM; + thread->guest_cpu = guest_cpu; + thread__put(thread); + + return 0; +} + int perf_event__process_id_index(struct perf_session *session, union perf_event *event) { @@ -2845,6 +2859,10 @@ int perf_event__process_id_index(struct perf_session *session, last_pid = sid->machine_pid; perf_guest = true; } + + ret = perf_session__set_guest_cpu(session, sid->machine_pid, e->tid, e2->vcpu); + if (ret) + return ret; } return 0; } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 665e5c0618ed..e3e5427e1c3c 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -47,6 +47,7 @@ struct thread *thread__new(pid_t pid, pid_t tid) thread->tid = tid; thread->ppid = -1; thread->cpu = -1; + thread->guest_cpu = -1; thread->lbr_stitch_enable = false; INIT_LIST_HEAD(&thread->namespaces_list); INIT_LIST_HEAD(&thread->comm_list); diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index b066fb30d203..241f300d7d6e 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -39,6 +39,7 @@ struct thread { pid_t tid; pid_t ppid; int cpu; + int guest_cpu; /* For QEMU thread */ refcount_t refcnt; bool comm_set; int comm_len; -- cgit v1.2.3 From 3461b65da7d48c57080d40e6b545f1360f0b195b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:56 +0300 Subject: perf tools: Add machine_pid and vcpu to perf_sample When parsing a sample with a sample ID, copy machine_pid and vcpu from perf_sample_id to perf_sample. Note, machine_pid will be zero when unused, so only a non-zero value represents a guest machine. vcpu should be ignored if machine_pid is zero. Note also, machine_pid is used with events that have come from injecting a guest perf.data file, however guest events recorded on the host (i.e. using perf kvm) have the (QEMU) hypervisor process pid to identify them - refer machines__find_for_cpumode(). 
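In effect, a consumer can recover the guest CPU from the host-side hypervisor thread, roughly (sketch; error handling omitted):

struct thread *t = machine__findnew_thread(&session->machines.host, pid, tid);
int guest_cpu = t ? t->guest_cpu : -1;	/* -1 means not a VCPU thread */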
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-14-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.h | 2 ++ tools/perf/util/evlist.c | 14 +++++++++++++- tools/perf/util/evsel.c | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index cdd72e05fd28..a660f304f83c 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -148,6 +148,8 @@ struct perf_sample { u64 code_page_size; u64 cgroup; u32 flags; + u32 machine_pid; + u32 vcpu; u16 insn_len; u8 cpumode; u16 misc; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 03fbe151b0c4..64f5a8074c0c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1507,10 +1507,22 @@ int evlist__start_workload(struct evlist *evlist) int evlist__parse_sample(struct evlist *evlist, union perf_event *event, struct perf_sample *sample) { struct evsel *evsel = evlist__event2evsel(evlist, event); + int ret; if (!evsel) return -EFAULT; - return evsel__parse_sample(evsel, event, sample); + ret = evsel__parse_sample(evsel, event, sample); + if (ret) + return ret; + if (perf_guest && sample->id) { + struct perf_sample_id *sid = evlist__id2sid(evlist, sample->id); + + if (sid) { + sample->machine_pid = sid->machine_pid; + sample->vcpu = sid->vcpu.cpu; + } + } + return 0; } int evlist__parse_sample_timestamp(struct evlist *evlist, union perf_event *event, u64 *timestamp) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9a30ccb7b104..14396ea5a968 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2365,6 +2365,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, data->misc = event->header.misc; data->id = -1ULL; data->data_src = PERF_MEM_DATA_SRC_NONE; + data->vcpu = -1; if (event->header.type != PERF_RECORD_SAMPLE) { if (!evsel->core.attr.sample_id_all) -- cgit v1.2.3 From 635049099582e45be4720722919912ca113c823c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:57 +0300 Subject: perf session: Use sample->machine_pid to find guest machine If machine_pid is set, use it to find the guest machine. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-15-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 91a091c35945..f3e9fa557bc9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1418,7 +1418,9 @@ static struct machine *machines__find_for_cpumode(struct machines *machines, (sample->cpumode == PERF_RECORD_MISC_GUEST_USER))) { u32 pid; - if (event->header.type == PERF_RECORD_MMAP + if (sample->machine_pid) + pid = sample->machine_pid; + else if (event->header.type == PERF_RECORD_MMAP || event->header.type == PERF_RECORD_MMAP2) pid = event->mmap.pid; else -- cgit v1.2.3 From e28fb159f1163e76811b9b9024564c33027d9a44 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:58 +0300 Subject: perf script: Add machine_pid and vcpu Add fields machine_pid and vcpu. These are displayed only if machine_pid is non-zero. 
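A consumer of struct perf_sample should therefore test machine_pid before using vcpu, e.g. (sketch):

if (sample->machine_pid)	/* non-zero only for injected guest events */
	printf("guest VM %d VCPU %d\n", sample->machine_pid, sample->vcpu);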
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-16-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 7 ++++++- tools/perf/builtin-script.c | 11 +++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index e250ff5566cf..c09cc44e50ee 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -133,7 +133,8 @@ OPTIONS comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackinsnlen, brstackoff, callindent, insn, insnlen, synth, - phys_addr, metric, misc, srccode, ipc, data_page_size, code_page_size, ins_lat. + phys_addr, metric, misc, srccode, ipc, data_page_size, code_page_size, ins_lat, + machine_pid, vcpu. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace @@ -226,6 +227,10 @@ OPTIONS The ipc (instructions per cycle) field is synthesized and may have a value when Instruction Trace decoding. + The machine_pid and vcpu fields are derived from data resulting from using + perf inject to insert a perf.data file recorded inside a virtual machine into + a perf.data file recorded on the host at the same time. + Finally, a user may not set fields to none for all event types. i.e., -F "" is not allowed. diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 4b00a50faf00..ac19fee62d8e 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -125,6 +125,8 @@ enum perf_output_field { PERF_OUTPUT_CODE_PAGE_SIZE = 1ULL << 34, PERF_OUTPUT_INS_LAT = 1ULL << 35, PERF_OUTPUT_BRSTACKINSNLEN = 1ULL << 36, + PERF_OUTPUT_MACHINE_PID = 1ULL << 37, + PERF_OUTPUT_VCPU = 1ULL << 38, }; struct perf_script { @@ -193,6 +195,8 @@ struct output_option { {.str = "code_page_size", .field = PERF_OUTPUT_CODE_PAGE_SIZE}, {.str = "ins_lat", .field = PERF_OUTPUT_INS_LAT}, {.str = "brstackinsnlen", .field = PERF_OUTPUT_BRSTACKINSNLEN}, + {.str = "machine_pid", .field = PERF_OUTPUT_MACHINE_PID}, + {.str = "vcpu", .field = PERF_OUTPUT_VCPU}, }; enum { @@ -746,6 +750,13 @@ static int perf_sample__fprintf_start(struct perf_script *script, int printed = 0; char tstr[128]; + if (PRINT_FIELD(MACHINE_PID) && sample->machine_pid) + printed += fprintf(fp, "VM:%5d ", sample->machine_pid); + + /* Print VCPU only for guest events i.e. with machine_pid */ + if (PRINT_FIELD(VCPU) && sample->machine_pid) + printed += fprintf(fp, "VCPU:%03d ", sample->vcpu); + if (PRINT_FIELD(COMM)) { const char *comm = thread ? thread__comm_str(thread) : ":-1"; -- cgit v1.2.3 From 2273e46b98377f563628d694a1ad49b5d11afb43 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:31:59 +0300 Subject: perf dlfilter: Add machine_pid and vcpu Add machine_pid and vcpu to struct perf_dlfilter_sample. The 'size' member can be used to determine if the values are present; in any case, machine_pid is zero if unused. vcpu should be ignored if machine_pid is zero.
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-17-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-dlfilter.txt | 22 ++++++++++++++++++++++ tools/perf/include/perf/perf_dlfilter.h | 8 ++++++++ tools/perf/util/dlfilter.c | 2 ++ 3 files changed, 32 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-dlfilter.txt b/tools/perf/Documentation/perf-dlfilter.txt index 594f5a5a0c9e..fb22e3b31dc5 100644 --- a/tools/perf/Documentation/perf-dlfilter.txt +++ b/tools/perf/Documentation/perf-dlfilter.txt @@ -107,9 +107,31 @@ struct perf_dlfilter_sample { __u64 raw_callchain_nr; /* Number of raw_callchain entries */ const __u64 *raw_callchain; /* Refer */ const char *event; + __s32 machine_pid; + __s32 vcpu; }; ---- +Note: 'machine_pid' and 'vcpu' are not original members, but were added together later. +'size' can be used to determine their presence at run time. +PERF_DLFILTER_HAS_MACHINE_PID will be defined if they are present at compile time. +For example: +[source,c] +---- +#include +#include +#include + +static inline bool have_machine_pid(const struct perf_dlfilter_sample *sample) +{ +#ifdef PERF_DLFILTER_HAS_MACHINE_PID + return sample->size >= offsetof(struct perf_dlfilter_sample, vcpu) + sizeof(sample->vcpu); +#else + return false; +#endif +} +---- + The perf_dlfilter_fns structure ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/tools/perf/include/perf/perf_dlfilter.h b/tools/perf/include/perf/perf_dlfilter.h index 3eef03d661b4..a26e2f129f83 100644 --- a/tools/perf/include/perf/perf_dlfilter.h +++ b/tools/perf/include/perf/perf_dlfilter.h @@ -9,6 +9,12 @@ #include #include +/* + * The following macro can be used to determine if this header defines + * perf_dlfilter_sample machine_pid and vcpu. + */ +#define PERF_DLFILTER_HAS_MACHINE_PID + /* Definitions for perf_dlfilter_sample flags */ enum { PERF_DLFILTER_FLAG_BRANCH = 1ULL << 0, @@ -62,6 +68,8 @@ struct perf_dlfilter_sample { __u64 raw_callchain_nr; /* Number of raw_callchain entries */ const __u64 *raw_callchain; /* Refer */ const char *event; + __s32 machine_pid; + __s32 vcpu; }; /* diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c index db964d5a52af..54e4d4495e00 100644 --- a/tools/perf/util/dlfilter.c +++ b/tools/perf/util/dlfilter.c @@ -495,6 +495,8 @@ int dlfilter__do_filter_event(struct dlfilter *d, ASSIGN(misc); ASSIGN(raw_size); ASSIGN(raw_data); + ASSIGN(machine_pid); + ASSIGN(vcpu); if (sample->branch_stack) { d_sample.brstack_nr = sample->branch_stack->nr; -- cgit v1.2.3 From 7151c1d17820c0cfcda0c890f55a868cb4336afc Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:00 +0300 Subject: perf auxtrace: Add machine_pid and vcpu to auxtrace_error Add machine_pid and vcpu to struct perf_record_auxtrace_error. The existing fmt member is used to identify the new format. The new members make it possible to easily differentiate errors from guest machines. 
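Readers must gate on the format version, since older records are shorter, e.g. (sketch; handle_guest_error() is hypothetical):

if (e->fmt >= 2 && e->machine_pid)
	/* Extended record produced from an injected guest trace */
	handle_guest_error(e->machine_pid, e->vcpu);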
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-18-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/event.h | 2 ++ tools/perf/util/auxtrace.c | 30 +++++++++++++++++----- tools/perf/util/auxtrace.h | 4 +++ .../util/scripting-engines/trace-event-python.c | 4 ++- tools/perf/util/session.c | 4 +++ 5 files changed, 37 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index c2dbd3e88885..556bb06798f2 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -279,6 +279,8 @@ struct perf_record_auxtrace_error { __u64 ip; __u64 time; char msg[MAX_AUXTRACE_ERROR_MSG]; + __u32 machine_pid; + __u32 vcpu; }; struct perf_record_aux { diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 511dd3caa1bc..6edab8a16de6 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1189,9 +1189,10 @@ void auxtrace_buffer__free(struct auxtrace_buffer *buffer) free(buffer); } -void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int type, - int code, int cpu, pid_t pid, pid_t tid, u64 ip, - const char *msg, u64 timestamp) +void auxtrace_synth_guest_error(struct perf_record_auxtrace_error *auxtrace_error, int type, + int code, int cpu, pid_t pid, pid_t tid, u64 ip, + const char *msg, u64 timestamp, + pid_t machine_pid, int vcpu) { size_t size; @@ -1207,12 +1208,26 @@ void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int auxtrace_error->ip = ip; auxtrace_error->time = timestamp; strlcpy(auxtrace_error->msg, msg, MAX_AUXTRACE_ERROR_MSG); - - size = (void *)auxtrace_error->msg - (void *)auxtrace_error + - strlen(auxtrace_error->msg) + 1; + if (machine_pid) { + auxtrace_error->fmt = 2; + auxtrace_error->machine_pid = machine_pid; + auxtrace_error->vcpu = vcpu; + size = sizeof(*auxtrace_error); + } else { + size = (void *)auxtrace_error->msg - (void *)auxtrace_error + + strlen(auxtrace_error->msg) + 1; + } auxtrace_error->header.size = PERF_ALIGN(size, sizeof(u64)); } +void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int type, + int code, int cpu, pid_t pid, pid_t tid, u64 ip, + const char *msg, u64 timestamp) +{ + auxtrace_synth_guest_error(auxtrace_error, type, code, cpu, pid, tid, + ip, msg, timestamp, 0, -1); +} + int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool, struct perf_session *session, @@ -1662,6 +1677,9 @@ size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp) if (!e->fmt) msg = (const char *)&e->time; + if (e->fmt >= 2 && e->machine_pid) + ret += fprintf(fp, " machine_pid %d vcpu %d", e->machine_pid, e->vcpu); + ret += fprintf(fp, " cpu %d pid %d tid %d ip %#"PRI_lx64" code %u: %s\n", e->cpu, e->pid, e->tid, e->ip, e->code, msg); return ret; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index cd0d25c2751c..6a4fbfd34c6b 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -595,6 +595,10 @@ int auxtrace_index__process(int fd, u64 size, struct perf_session *session, bool needs_swap); void auxtrace_index__free(struct list_head *head); +void auxtrace_synth_guest_error(struct perf_record_auxtrace_error *auxtrace_error, int type, + int code, int cpu, pid_t pid, pid_t tid, u64 ip, + const char *msg, 
u64 timestamp, + pid_t machine_pid, int vcpu); void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int type, int code, int cpu, pid_t pid, pid_t tid, u64 ip, const char *msg, u64 timestamp); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index adba01b7d9dd..3367c5479199 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1559,7 +1559,7 @@ static void python_process_auxtrace_error(struct perf_session *session __maybe_u msg = (const char *)&e->time; } - t = tuple_new(9); + t = tuple_new(11); tuple_set_u32(t, 0, e->type); tuple_set_u32(t, 1, e->code); @@ -1570,6 +1570,8 @@ static void python_process_auxtrace_error(struct perf_session *session __maybe_u tuple_set_u64(t, 6, tm); tuple_set_string(t, 7, msg); tuple_set_u32(t, 8, cpumode); + tuple_set_s32(t, 9, e->machine_pid); + tuple_set_s32(t, 10, e->vcpu); call_object(handler, t, handler_name); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f3e9fa557bc9..7ea0b91013ea 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -895,6 +895,10 @@ static void perf_event__auxtrace_error_swap(union perf_event *event, event->auxtrace_error.ip = bswap_64(event->auxtrace_error.ip); if (event->auxtrace_error.fmt) event->auxtrace_error.time = bswap_64(event->auxtrace_error.time); + if (event->auxtrace_error.fmt >= 2) { + event->auxtrace_error.machine_pid = bswap_32(event->auxtrace_error.machine_pid); + event->auxtrace_error.vcpu = bswap_32(event->auxtrace_error.vcpu); + } } static void perf_event__thread_map_swap(union perf_event *event, -- cgit v1.2.3 From 6de306b7a5304e873fd6fc014121bd8f19ec418f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:01 +0300 Subject: perf script python: Add machine_pid and vcpu Add machine_pid and vcpu to python sample events and context switch events. 
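For example, a minimal process_event() sketch (illustrative only; the printed text is hypothetical) can pick the new values out of the sample dict. The keys are only set for guest machines, so membership must be tested first:

def process_event(param_dict):
    sample = param_dict["sample"]
    # "machine_pid" and "vcpu" are only present for guest machine samples.
    if "machine_pid" in sample:
        print("guest VM %d VCPU %d" % (sample["machine_pid"], sample["vcpu"]))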
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-19-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-python.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 3367c5479199..5bbc1b16f368 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -861,6 +861,13 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, brstacksym = python_process_brstacksym(sample, al->thread); pydict_set_item_string_decref(dict, "brstacksym", brstacksym); + if (sample->machine_pid) { + pydict_set_item_string_decref(dict_sample, "machine_pid", + _PyLong_FromLong(sample->machine_pid)); + pydict_set_item_string_decref(dict_sample, "vcpu", + _PyLong_FromLong(sample->vcpu)); + } + pydict_set_item_string_decref(dict_sample, "cpumode", _PyLong_FromLong((unsigned long)sample->cpumode)); @@ -1509,7 +1516,7 @@ static void python_do_process_switch(union perf_event *event, np_tid = event->context_switch.next_prev_tid; } - t = tuple_new(9); + t = tuple_new(11); if (!t) return; @@ -1522,6 +1529,8 @@ static void python_do_process_switch(union perf_event *event, tuple_set_s32(t, 6, machine->pid); tuple_set_bool(t, 7, out); tuple_set_bool(t, 8, out_preempt); + tuple_set_s32(t, 9, sample->machine_pid); + tuple_set_s32(t, 10, sample->vcpu); call_object(handler, t, handler_name); -- cgit v1.2.3 From 13a133b2550e062bb2710e9dd3d9a6b794e91053 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:02 +0300 Subject: perf script python: intel-pt-events: Add machine_pid and vcpu Add machine_pid and vcpu to the intel-pt-events.py script. 
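With hypothetical values (machine pid 1234, vcpu 0, comm "perf-exec", pid/tid 4567, cpu 2), the new common_start_str() prefix renders roughly as sketched below. This illustrates the format string only and is not output captured from the script:

print("VM:%5d VCPU:%03d %16s %5u/%-5u [%03u] %9u.%09u " %
      (1234, 0, "perf-exec", 4567, 4567, 2, 12345, 678901234))
# -> "VM: 1234 VCPU:000        perf-exec  4567/4567  [002]     12345.678901234 "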
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-20-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/intel-pt-events.py | 32 +++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/intel-pt-events.py b/tools/perf/scripts/python/intel-pt-events.py index 9b7746b89381..6be7fd8fd615 100644 --- a/tools/perf/scripts/python/intel-pt-events.py +++ b/tools/perf/scripts/python/intel-pt-events.py @@ -197,7 +197,12 @@ def common_start_str(comm, sample): cpu = sample["cpu"] pid = sample["pid"] tid = sample["tid"] - return "%16s %5u/%-5u [%03u] %9u.%09u " % (comm, pid, tid, cpu, ts / 1000000000, ts %1000000000) + if "machine_pid" in sample: + machine_pid = sample["machine_pid"] + vcpu = sample["vcpu"] + return "VM:%5d VCPU:%03d %16s %5u/%-5u [%03u] %9u.%09u " % (machine_pid, vcpu, comm, pid, tid, cpu, ts / 1000000000, ts %1000000000) + else: + return "%16s %5u/%-5u [%03u] %9u.%09u " % (comm, pid, tid, cpu, ts / 1000000000, ts %1000000000) def print_common_start(comm, sample, name): flags_disp = get_optional_null(sample, "flags_disp") @@ -379,9 +384,19 @@ def process_event(param_dict): sys.exit(1) def auxtrace_error(typ, code, cpu, pid, tid, ip, ts, msg, cpumode, *x): + if len(x) >= 2 and x[0]: + machine_pid = x[0] + vcpu = x[1] + else: + machine_pid = 0 + vcpu = -1 try: - print("%16s %5u/%-5u [%03u] %9u.%09u error type %u code %u: %s ip 0x%16x" % - ("Trace error", pid, tid, cpu, ts / 1000000000, ts %1000000000, typ, code, msg, ip)) + if machine_pid: + print("VM:%5d VCPU:%03d %16s %5u/%-5u [%03u] %9u.%09u error type %u code %u: %s ip 0x%16x" % + (machine_pid, vcpu, "Trace error", pid, tid, cpu, ts / 1000000000, ts %1000000000, typ, code, msg, ip)) + else: + print("%16s %5u/%-5u [%03u] %9u.%09u error type %u code %u: %s ip 0x%16x" % + ("Trace error", pid, tid, cpu, ts / 1000000000, ts %1000000000, typ, code, msg, ip)) except broken_pipe_exception: # Stop python printing broken pipe errors and traceback sys.stdout = open(os.devnull, 'w') @@ -396,14 +411,21 @@ def context_switch(ts, cpu, pid, tid, np_pid, np_tid, machine_pid, out, out_pree preempt_str = "preempt" else: preempt_str = "" + if len(x) >= 2 and x[0]: + machine_pid = x[0] + vcpu = x[1] + else: + vcpu = None; if machine_pid == -1: machine_str = "" - else: + elif vcpu is None: machine_str = "machine PID %d" % machine_pid + else: + machine_str = "machine PID %d VCPU %d" % (machine_pid, vcpu) switch_str = "%16s %5d/%-5d [%03u] %9u.%09u %5d/%-5d %s %s" % \ (out_str, pid, tid, cpu, ts / 1000000000, ts %1000000000, np_pid, np_tid, machine_str, preempt_str) if glb_args.all_switch_events: - print(switch_str); + print(switch_str) else: global glb_switch_str glb_switch_str[cpu] = switch_str -- cgit v1.2.3 From 386e0d83d351a4461525b06b845ffcd58235381e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:03 +0300 Subject: perf tools: Remove also guest kcore_dir with host kcore_dir Copies of /proc/kallsyms, /proc/modules and an extract of /proc/kcore can be stored in the perf.data output directory under the subdirectory named kcore_dir. Guest machines will have their files also under subdirectories beginning kcore_dir__ followed by the machine pid. Remove these also when removing kcore_dir. 
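For illustration, a rough Python equivalent of the directory-name matching (with fnmatch standing in for the C pattern matcher, and a hypothetical machine pid of 12345) is:

import fnmatch

def is_kcore_dir_name(name):
    # Host copies live in "kcore_dir", guest copies in
    # "kcore_dir__<machine pid>".
    return any(fnmatch.fnmatch(name, pat)
               for pat in ("kcore_dir", "kcore_dir__[1-9]*"))

print(is_kcore_dir_name("kcore_dir"))         # True  (host)
print(is_kcore_dir_name("kcore_dir__12345"))  # True  (guest)
print(is_kcore_dir_name("data.0"))            # False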
Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-21-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/util.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index eeb83c80f458..9b02edf9311d 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -200,7 +200,7 @@ static int rm_rf_depth_pat(const char *path, int depth, const char **pat) return rmdir(path); } -static int rm_rf_kcore_dir(const char *path) +static int rm_rf_a_kcore_dir(const char *path, const char *name) { char kcore_dir_path[PATH_MAX]; const char *pat[] = { @@ -210,11 +210,44 @@ static int rm_rf_kcore_dir(const char *path) NULL, }; - snprintf(kcore_dir_path, sizeof(kcore_dir_path), "%s/kcore_dir", path); + snprintf(kcore_dir_path, sizeof(kcore_dir_path), "%s/%s", path, name); return rm_rf_depth_pat(kcore_dir_path, 0, pat); } +static bool kcore_dir_filter(const char *name __maybe_unused, struct dirent *d) +{ + const char *pat[] = { + "kcore_dir", + "kcore_dir__[1-9]*", + NULL, + }; + + return match_pat(d->d_name, pat); +} + +static int rm_rf_kcore_dir(const char *path) +{ + struct strlist *kcore_dirs; + struct str_node *nd; + int ret; + + kcore_dirs = lsdir(path, kcore_dir_filter); + + if (!kcore_dirs) + return 0; + + strlist__for_each_entry(nd, kcore_dirs) { + ret = rm_rf_a_kcore_dir(path, nd->s); + if (ret) + return ret; + } + + strlist__delete(kcore_dirs); + + return 0; +} + int rm_rf_perf_data(const char *path) { const char *pat[] = { -- cgit v1.2.3 From 65691e9ff0c90109dd7d9d495d9528ad4c1b82d4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:04 +0300 Subject: perf tools: Make has_kcore_dir() work also for guest kcore_dir Copies of /proc/kallsyms, /proc/modules and an extract of /proc/kcore can be stored in the perf.data output directory under the subdirectory named kcore_dir. Guest machines will have their files also under subdirectories beginning kcore_dir__ followed by the machine pid. Make has_kcore_dir() return true also if there is a guest machine kcore_dir. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-22-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index caabeac24c69..9782ccbe595d 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -481,16 +482,21 @@ int perf_data__make_kcore_dir(struct perf_data *data, char *buf, size_t buf_sz) bool has_kcore_dir(const char *path) { - char *kcore_dir; - int ret; - - if (asprintf(&kcore_dir, "%s/kcore_dir", path) < 0) - return false; - - ret = access(kcore_dir, F_OK); + struct dirent *d = ERR_PTR(-EINVAL); + const char *name = "kcore_dir"; + DIR *dir = opendir(path); + size_t n = strlen(name); + bool result = false; + + if (dir) { + while (d && !result) { + d = readdir(dir); + result = d ? 
!strncmp(d->d_name, name, n) : false; + } + closedir(dir); + } - free(kcore_dir); - return !ret; + return result; } char *perf_data__kallsyms_name(struct perf_data *data) -- cgit v1.2.3 From a5367ecb5353fbf28bfd3979fc4f61ddebec80b1 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:05 +0300 Subject: perf tools: Automatically use guest kcore_dir if present When registering a guest machine using machine_pid from the id index, check perf.data for a matching kcore_dir subdirectory and set the kallsyms file name accordingly. If set, use it to find the machine's kernel symbols and object code (from kcore). Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-23-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 19 +++++++++++++++++++ tools/perf/util/data.h | 1 + tools/perf/util/machine.h | 1 + tools/perf/util/session.c | 2 ++ tools/perf/util/symbol.c | 6 ++++-- 5 files changed, 27 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 9782ccbe595d..a7f68c309545 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -518,6 +518,25 @@ char *perf_data__kallsyms_name(struct perf_data *data) return kallsyms_name; } +char *perf_data__guest_kallsyms_name(struct perf_data *data, pid_t machine_pid) +{ + char *kallsyms_name; + struct stat st; + + if (!data->is_dir) + return NULL; + + if (asprintf(&kallsyms_name, "%s/kcore_dir__%d/kallsyms", data->path, machine_pid) < 0) + return NULL; + + if (stat(kallsyms_name, &st)) { + free(kallsyms_name); + return NULL; + } + + return kallsyms_name; +} + bool is_perf_data(const char *path) { bool ret = false; diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 7de53d6e2d7f..173132d502f5 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -101,5 +101,6 @@ unsigned long perf_data__size(struct perf_data *data); int perf_data__make_kcore_dir(struct perf_data *data, char *buf, size_t buf_sz); bool has_kcore_dir(const char *path); char *perf_data__kallsyms_name(struct perf_data *data); +char *perf_data__guest_kallsyms_name(struct perf_data *data, pid_t machine_pid); bool is_perf_data(const char *path); #endif /* __PERF_DATA_H */ diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index e1476343cbb2..199807ac3c54 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -48,6 +48,7 @@ struct machine { bool single_address_space; char *root_dir; char *mmap_name; + char *kallsyms_filename; struct threads threads[THREADS__TABLE_SIZE]; struct vdso_info *vdso_info; struct perf_env *env; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 7ea0b91013ea..98e16659a149 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2772,6 +2772,8 @@ static int perf_session__register_guest(struct perf_session *session, pid_t mach return -ENOMEM; thread__put(thread); + machine->kallsyms_filename = perf_data__guest_kallsyms_name(session->data, machine_pid); + return 0; } diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index f72baf636724..a4b22caa7c24 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -2300,11 +2300,13 @@ do_kallsyms: static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map) { int err; - const char *kallsyms_filename = NULL; + const char *kallsyms_filename; struct machine *machine 
= map__kmaps(map)->machine; char path[PATH_MAX]; - if (machine__is_default_guest(machine)) { + if (machine->kallsyms_filename) { + kallsyms_filename = machine->kallsyms_filename; + } else if (machine__is_default_guest(machine)) { /* * if the user specified a vmlinux filename, use it and only * it, reporting errors to the user if it cannot be used. -- cgit v1.2.3 From 10d34700223b14ed94a6399cf026142956f87965 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:06 +0300 Subject: perf tools: Add reallocarray_as_needed() Add helper reallocarray_as_needed() to reallocate an array to a larger size and initialize the extra entries to an arbitrary value. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-24-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/util.c | 33 +++++++++++++++++++++++++++++++++ tools/perf/util/util.h | 15 +++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 9b02edf9311d..391c1e928bd7 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "cap.h" #include "strlist.h" @@ -500,3 +501,35 @@ char *filename_with_chroot(int pid, const char *filename) return new_name; } + +/* + * Reallocate an array *arr of size *arr_sz so that it is big enough to contain + * x elements of size msz, initializing new entries to *init_val or zero if + * init_val is NULL + */ +int do_realloc_array_as_needed(void **arr, size_t *arr_sz, size_t x, size_t msz, const void *init_val) +{ + size_t new_sz = *arr_sz; + void *new_arr; + size_t i; + + if (!new_sz) + new_sz = msz >= 64 ? 1 : roundup(64, msz); /* Start with at least 64 bytes */ + while (x >= new_sz) { + if (check_mul_overflow(new_sz, (size_t)2, &new_sz)) + return -ENOMEM; + } + if (new_sz == *arr_sz) + return 0; + new_arr = calloc(new_sz, msz); + if (!new_arr) + return -ENOMEM; + memcpy(new_arr, *arr, *arr_sz * msz); + if (init_val) { + for (i = *arr_sz; i < new_sz; i++) + memcpy(new_arr + (i * msz), init_val, msz); + } + *arr = new_arr; + *arr_sz = new_sz; + return 0; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 0f78f1e7782d..c1f2d423a9ec 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -79,4 +79,19 @@ struct perf_debuginfod { void perf_debuginfod_setup(struct perf_debuginfod *di); char *filename_with_chroot(int pid, const char *filename); + +int do_realloc_array_as_needed(void **arr, size_t *arr_sz, size_t x, + size_t msz, const void *init_val); + +#define realloc_array_as_needed(a, n, x, v) ({ \ + typeof(x) __x = (x); \ + __x >= (n) ? \ + do_realloc_array_as_needed((void **)&(a), \ + &(n), \ + __x, \ + sizeof(*(a)), \ + (const void *)(v)) : \ + 0; \ + }) + #endif /* GIT_COMPAT_UTIL_H */ -- cgit v1.2.3 From 97406a7e4fa6e5cad3be80cd2188d8fb50a6ec75 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Jul 2022 12:32:07 +0300 Subject: perf inject: Add support for injecting guest sideband events Inject events from a perf.data file recorded in a virtual machine into a perf.data file recorded on the host at the same time. Only side band events (e.g. mmap, comm, fork, exit etc) and build IDs are injected. Additionally, the guest kcore_dir is copied as kcore_dir__ appended to the machine PID. 
This is non-trivial because: o It is not possible to process 2 sessions simultaneously so instead events are first written to a temporary file. o To avoid conflict, guest sample IDs are replaced with new unused sample IDs. o Guest event's CPU is changed to be the host CPU because it is more useful for reporting and analysis. o Sample ID is mapped to machine PID which is recorded with VCPU in the id index. This is important to allow guest events to be related to the guest machine and VCPU. o Timestamps must be converted. o Events are inserted to obey finished-round ordering. The anticipated use-case is: - start recording sideband events in a guest machine - start recording an AUX area trace on the host which can trace also the guest (e.g. Intel PT) - run test case on the guest - stop recording on the host - stop recording on the guest - copy the guest perf.data file to the host - inject the guest perf.data file sideband events into the host perf.data file using perf inject - the resulting perf.data file can now be used Subsequent patches provide Intel PT support for this. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/r/20220711093218.10967-25-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-inject.txt | 17 + tools/perf/builtin-inject.c | 1043 +++++++++++++++++++++++++++++- 2 files changed, 1059 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index 0570a1ccd344..646aa31586ed 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -85,6 +85,23 @@ include::itrace.txt[] without updating it. Currently this option is supported only by Intel PT, refer linkperf:perf-intel-pt[1] +--guest-data=,[,