Diffstat (limited to 'tools')
107 files changed, 11413 insertions, 1989 deletions
diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index ff4d327a582e..88b28aa7431f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-B** | **--base-btf** } } *COMMANDS* := { **dump** | **help** } @@ -73,6 +74,20 @@ OPTIONS ======= .. include:: common_options.rst + -B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used + with BTF objects for kernel modules. To avoid duplicating + all kernel symbols required by modules, BTF objects for + modules are "split": they are built incrementally on top of + the kernel (vmlinux) BTF object. So the base BTF reference + should usually point to the kernel BTF. + + When the main BTF object to process (for example, the + module BTF to dump) is passed as a *FILE*, bpftool attempts + to autodetect the path for the base object, and passing + this option is optional. When the main BTF object is passed + through other handles, this option becomes necessary. + EXAMPLES ======== **# bpftool btf dump id 1226** @@ -217,3 +232,34 @@ All the standard ways to specify map or program are supported: **# bpftool btf dump prog tag b88e0a09b1d9759d** **# bpftool btf dump prog pinned /sys/fs/bpf/prog_name** + +| +| **# bpftool btf dump file /sys/kernel/btf/i2c_smbus** +| (or) +| **# I2C_SMBUS_ID=$(bpftool btf show -p | jq '.[] | select(.name=="i2c_smbus").id')** +| **# bpftool btf dump id ${I2C_SMBUS_ID} -B /sys/kernel/btf/vmlinux** + +:: + + [104848] STRUCT 'i2c_smbus_alert' size=40 vlen=2 'alert' type_id=393 bits_offset=0 'ara' type_id=56050 bits_offset=256 [104849] STRUCT 'alert_data' size=12 vlen=3 'addr' type_id=16 bits_offset=0 'type' type_id=56053 bits_offset=32 'data' type_id=7 bits_offset=64 [104850] PTR '(anon)' type_id=104848 [104851] PTR '(anon)' type_id=104849 [104852] FUNC 'i2c_register_spd' type_id=84745 linkage=static [104853] FUNC 'smbalert_driver_init' type_id=1213 linkage=static [104854] FUNC_PROTO '(anon)' ret_type_id=18 vlen=1 'ara' type_id=56050 [104855] FUNC 'i2c_handle_smbus_alert' type_id=104854 linkage=static [104856] FUNC 'smbalert_remove' type_id=104854 linkage=static [104857] FUNC_PROTO '(anon)' ret_type_id=18 vlen=2 'ara' type_id=56050 'id' type_id=56056 [104858] FUNC 'smbalert_probe' type_id=104857 linkage=static [104859] FUNC 'smbalert_work' type_id=9695 linkage=static [104860] FUNC 'smbus_alert' type_id=71367 linkage=static [104861] FUNC 'smbus_do_alert' type_id=84827 linkage=static
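The split-BTF scheme that **-B** relies on can also be exercised directly from libbpf: parse the vmlinux BTF first, then parse the module BTF on top of it. A minimal sketch (error handling abbreviated; the i2c_smbus module and sysfs paths are just the example used above)::

  /* build: cc -o split_btf split_btf.c -lbpf */
  #include <bpf/btf.h>
  #include <bpf/libbpf.h>
  #include <stdio.h>

  int main(void)
  {
          struct btf *base, *split;

          /* base BTF: the vmlinux types that module BTF builds upon */
          base = btf__parse("/sys/kernel/btf/vmlinux", NULL);
          if (libbpf_get_error(base))
                  return 1;

          /* module BTF only carries types not already in the base */
          split = btf__parse_split("/sys/kernel/btf/i2c_smbus", base);
          if (libbpf_get_error(split)) {
                  btf__free(base);
                  return 1;
          }

          printf("vmlinux + module: %u types\n", btf__get_nr_types(split));

          btf__free(split);
          btf__free(base);
          return 0;
  }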
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index baee8591ac76..3e4395eede4f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } } *COMMANDS* := { **show** | **list** | **tree** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index dd3771bdbc57..ab9f57ee4c3a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **probe** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 7cd6681137f3..2ef2f2df0279 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-L** | **--use-loader** } } *COMMAND* := { **object** | **skeleton** | **help** } @@ -152,6 +153,12 @@ OPTIONS ======= .. include:: common_options.rst + -L, --use-loader + For skeletons, generate a "light" skeleton (also known as "loader" + skeleton). A light skeleton contains a loader eBPF program. It does + not use the majority of the libbpf infrastructure, and does not need + libelf. + EXAMPLES ======== **$ cat example1.bpf.c** diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 51f49bead619..471f363a725a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -12,6 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **iter** *COMMAND* + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *COMMANDS* := { **pin** | **help** } ITER COMMANDS diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 5f7db2a837cc..0de90f086238 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } | { **-n** | **--nomount** } } *COMMANDS* := { **show** | **list** | **pin** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 3d52256ba75f..d0c4abe08aba 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } | { **-n** | **--nomount** } } *COMMANDS* := { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index d8165d530937..1ae0375e8fea 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*]
**net** *COMMAND* - *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index e958ce91de72..ce52798a917d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **show** | **list** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index fe1b38e7e887..91608cb7e44a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -12,7 +12,9 @@ SYNOPSIS **bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | + { **-L** | **--use-loader** } } *COMMANDS* := { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** @@ -48,10 +50,11 @@ PROG COMMANDS | **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** | } | *ATTACH_TYPE* := { -| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** +| **msg_verdict** | **skb_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } | *METRICs* := { -| **cycles** | **instructions** | **l1d_loads** | **llc_misses** +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** | } @@ -223,6 +226,20 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. + -L, --use-loader + Load program as a "loader" program. This is useful to debug + the generation of such programs. When this option is in + use, bpftool attempts to load the programs from the object + file into the kernel, but does not pin them (therefore, the + *PATH* must not be provided). + + When combined with the **-d**\ \|\ **--debug** option, + additional debug messages are generated, and the execution + of the loader program will use the **bpf_trace_printk**\ () + helper to log each step of loading BTF, creating the maps, + and loading the programs (see **bpftool prog tracelog** as + a way to dump those messages). + EXAMPLES ======== **# bpftool prog show** @@ -326,3 +343,16 @@ EXAMPLES 40176203 cycles (83.05%) 42518139 instructions # 1.06 insns per cycle (83.39%) 123 llc_misses # 2.89 LLC misses per million insns (83.15%) + +| +| Output below is for the trace logs. +| Run in separate terminals: +| **# bpftool prog tracelog** +| **# bpftool prog load -L -d file.o** + +:: + + bpftool-620059 [004] d... 2634685.517903: bpf_trace_printk: btf_load size 665 r=5 bpftool-620059 [004] d... 2634685.517912: bpf_trace_printk: map_create sample_map idx 0 type 2 value_size 4 value_btf_id 0 r=6 bpftool-620059 [004] d... 2634685.517997: bpf_trace_printk: prog_load sample insn_cnt 13 r=7 bpftool-620059 [004] d... 2634685.517999: bpf_trace_printk: close(5) = 0
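For reference, **bpftool prog tracelog** essentially streams the tracefs pipe that the loader program's **bpf_trace_printk**\ () messages land in, which is where the log shown above comes from. A rough standalone equivalent, assuming tracefs is mounted at its usual debugfs location::

  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/sys/kernel/debug/tracing/trace_pipe", "r");
          int c;

          if (!f) {
                  perror("trace_pipe");
                  return 1;
          }
          while ((c = fgetc(f)) != EOF)   /* blocks waiting for new messages */
                  putchar(c);
          fclose(f);
          return 0;
  }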
diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 506e70ee78e9..02afc0fc14cb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index e7d949334961..bb23f55bb05a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -18,15 +18,15 @@ SYNOPSIS *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** } - *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } - | { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-V** | **--version** } | + { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *MAP-COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** - | **delete** | **pin** | **event_pipe** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | + **delete** | **pin** | **event_pipe** | **help** } - *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** - | **load** | **attach** | **detach** | **help** } + *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | + **load** | **attach** | **detach** | **help** } *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index cc33c5824a2f..88e2bcf16cca 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -260,7 +260,8 @@ _bpftool() # Deal with options if [[ ${words[cword]} == -* ]]; then - local c='--version --json --pretty --bpffs --mapcompat --debug' + local c='--version --json --pretty --bpffs --mapcompat --debug \ + --use-loader --base-btf' COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) return 0 fi @@ -278,7 +279,7 @@ _bpftool() _sysfs_get_netdevs return 0 ;; - file|pinned) + file|pinned|-B|--base-btf) _filedir return 0 ;; @@ -291,7 +292,8 @@ # Remove all options so completions don't have to deal with them.
local i for (( i=1; i < ${#words[@]}; )); do - if [[ ${words[i]::1} == - ]]; then + if [[ ${words[i]::1} == - ]] && + [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then words=( "${words[@]:0:i}" "${words[@]:i+1}" ) [[ $i -le $cword ]] && cword=$(( cword - 1 )) else @@ -343,7 +345,8 @@ _bpftool() local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' - local METRIC_TYPE='cycles instructions l1d_loads llc_misses' + local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ + itlb_misses dtlb_misses' case $command in show|list) [[ $prev != "$command" ]] && return 0 @@ -404,8 +407,10 @@ _bpftool() return 0 ;; 5) - COMPREPLY=( $( compgen -W 'msg_verdict stream_verdict \ - stream_parser flow_dissector' -- "$cur" ) ) + local BPFTOOL_PROG_ATTACH_TYPES='msg_verdict \ + skb_verdict stream_verdict stream_parser \ + flow_dissector' + COMPREPLY=( $( compgen -W "$BPFTOOL_PROG_ATTACH_TYPES" -- "$cur" ) ) return 0 ;; 6) @@ -464,7 +469,7 @@ _bpftool() case $prev in type) - COMPREPLY=( $( compgen -W "socket kprobe \ + local BPFTOOL_PROG_LOAD_TYPES='socket kprobe \ kretprobe classifier flow_dissector \ action tracepoint raw_tracepoint \ xdp perf_event cgroup/skb cgroup/sock \ @@ -479,8 +484,8 @@ _bpftool() cgroup/post_bind4 cgroup/post_bind6 \ cgroup/sysctl cgroup/getsockopt \ cgroup/setsockopt cgroup/sock_release struct_ops \ - fentry fexit freplace sk_lookup" -- \ - "$cur" ) ) + fentry fexit freplace sk_lookup' + COMPREPLY=( $( compgen -W "$BPFTOOL_PROG_LOAD_TYPES" -- "$cur" ) ) return 0 ;; id) @@ -698,15 +703,15 @@ _bpftool() return 0 ;; type) - COMPREPLY=( $( compgen -W 'hash array prog_array \ - perf_event_array percpu_hash percpu_array \ - stack_trace cgroup_array lru_hash \ + local BPFTOOL_MAP_CREATE_TYPES='hash array \ + prog_array perf_event_array percpu_hash \ + percpu_array stack_trace cgroup_array lru_hash \ lru_percpu_hash lpm_trie array_of_maps \ hash_of_maps devmap devmap_hash sockmap cpumap \ xskmap sockhash cgroup_storage reuseport_sockarray \ percpu_cgroup_storage queue stack sk_storage \ - struct_ops inode_storage task_storage' -- \ - "$cur" ) ) + struct_ops inode_storage task_storage ringbuf' + COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) return 0 ;; key|value|flags|entries) @@ -1017,34 +1022,37 @@ _bpftool() return 0 ;; attach|detach) - local ATTACH_TYPES='ingress egress sock_create sock_ops \ - device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ + local BPFTOOL_CGROUP_ATTACH_TYPES='ingress egress \ + sock_create sock_ops device \ + bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ getpeername4 getpeername6 getsockname4 getsockname6 \ sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \ setsockopt sock_release' local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' - case $prev in - $command) - _filedir - return 0 - ;; - ingress|egress|sock_create|sock_ops|device|bind4|bind6|\ - post_bind4|post_bind6|connect4|connect6|getpeername4|\ - getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\ - recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt|sock_release) + # Check for $prev = $command first + if [ $prev = $command ]; then + _filedir + return 0 + # Then check for attach type. This is done outside of the + # "case $prev in" to avoid writing the whole list of attach + # types again as pattern to match (where we cannot reuse + # our variable). 
+ elif [[ $BPFTOOL_CGROUP_ATTACH_TYPES =~ $prev ]]; then COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ "$cur" ) ) return 0 - ;; + fi + # case/esac for the other cases + case $prev in id) _bpftool_get_prog_ids return 0 ;; *) - if ! _bpftool_search_list "$ATTACH_TYPES"; then - COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- \ - "$cur" ) ) + if ! _bpftool_search_list "$BPFTOOL_CGROUP_ATTACH_TYPES"; then + COMPREPLY=( $( compgen -W \ + "$BPFTOOL_CGROUP_ATTACH_TYPES" -- "$cur" ) ) elif [[ "$command" == "attach" ]]; then # We have an attach type on the command line, # but it is not the previous word, or diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 385d5c955cf3..f7e5ff3586c9 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -580,16 +580,12 @@ static int do_dump(int argc, char **argv) } if (!btf) { - err = btf__get_from_id(btf_id, &btf); + btf = btf__load_from_kernel_by_id_split(btf_id, base_btf); + err = libbpf_get_error(btf); if (err) { p_err("get btf by id (%u): %s", btf_id, strerror(err)); goto done; } - if (!btf) { - err = -ENOENT; - p_err("can't find btf with ID (%u)", btf_id); - goto done; - } } if (dump_c) { @@ -985,7 +981,8 @@ static int do_help(int argc, char **argv) " FORMAT := { raw | c }\n" " " HELP_SPEC_MAP "\n" " " HELP_SPEC_PROGRAM "\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-B|--base-btf} }\n" "", bin_name, "btf"); diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 7ca54d046362..9c25286a5c73 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -64,8 +64,10 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, } info = &prog_info->info; - if (!info->btf_id || !info->nr_func_info || - btf__get_from_id(info->btf_id, &prog_btf)) + if (!info->btf_id || !info->nr_func_info) + goto print; + prog_btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(prog_btf)) goto print; finfo = u64_to_ptr(info->func_info); func_type = btf__type_by_id(prog_btf, finfo->type_id); diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 6e53b1d393f4..3571a281c43f 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -501,7 +501,8 @@ static int do_help(int argc, char **argv) HELP_SPEC_ATTACH_TYPES "\n" " " HELP_SPEC_ATTACH_FLAGS "\n" " " HELP_SPEC_PROGRAM "\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index dc6daa193557..d42d930a3ec4 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -67,6 +67,12 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_MODIFY_RETURN] = "mod_ret", [BPF_LSM_MAC] = "lsm_mac", [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_TRACE_ITER] = "trace_iter", + [BPF_XDP_DEVMAP] = "xdp_devmap", + [BPF_XDP_CPUMAP] = "xdp_cpumap", + [BPF_XDP] = "xdp", + [BPF_SK_REUSEPORT_SELECT] = "sk_skb_reuseport_select", + [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_skb_reuseport_select_or_migrate", }; void p_err(const char *fmt, ...) 
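The do_dump() and btf_dumper changes above switch to libbpf's newer convention: btf__load_from_kernel_by_id_split() returns the BTF object itself and the error is recovered with libbpf_get_error(), instead of the out-parameter style of the older btf__get_from_id(). A standalone sketch of the same pattern (ID and optional base path taken from argv; error handling abbreviated)::

  /* build: cc -o btf_by_id btf_by_id.c -lbpf */
  #include <bpf/btf.h>
  #include <bpf/libbpf.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  int main(int argc, char **argv)
  {
          struct btf *base = NULL, *btf;
          int err;

          if (argc < 2)
                  return 1;
          /* optional second argument: base BTF file, e.g. /sys/kernel/btf/vmlinux */
          if (argc > 2) {
                  base = btf__parse(argv[2], NULL);
                  if (libbpf_get_error(base))
                          return 1;
          }

          /* new style: the returned pointer itself carries the error */
          btf = btf__load_from_kernel_by_id_split(atoi(argv[1]), base);
          err = libbpf_get_error(btf);
          if (err) {
                  fprintf(stderr, "get btf by id: %s\n", strerror(-err));
                  btf__free(base);
                  return 1;
          }

          printf("loaded BTF object with %u types\n", btf__get_nr_types(btf));
          btf__free(btf);
          btf__free(base);
          return 0;
  }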
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 40a88df275f9..7f36385aa9e2 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1005,6 +1005,7 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 1d71ff8c52fa..d40d92bbf0e4 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1026,7 +1026,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" " %1$s %2$s help\n" "\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-L|--use-loader} }\n" "", bin_name, "gen"); diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index 3b1aad7535dd..84a9b01d956d 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -97,7 +97,9 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s pin OBJ PATH [map MAP]\n" " %1$s %2$s help\n" + "\n" " " HELP_SPEC_MAP "\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, "iter"); diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index e77e1525d20a..8cc3e36f8cc6 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -401,7 +401,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_LINK "\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-n|--nomount} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 3ddfd4843738..02eaaf065f65 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -64,7 +64,8 @@ static int do_help(int argc, char **argv) " %s version\n" "\n" " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-V|--version} }\n" "", bin_name, bin_name, bin_name); diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index c1cf29798b99..90caa42aac4c 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -57,8 +57,7 @@ static inline void *u64_to_ptr(__u64 ptr) #define HELP_SPEC_PROGRAM \ "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG | name PROG_NAME }" #define HELP_SPEC_OPTIONS \ - "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} |\n" \ - "\t {-m|--mapcompat} | {-n|--nomount} }" + "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug}" #define HELP_SPEC_MAP \ "MAP := { id MAP_ID | pinned FILE | name MAP_NAME }" #define HELP_SPEC_LINK \ diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 09ae0381205b..407071d54ab1 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -807,10 +807,11 @@ static struct btf *get_map_kv_btf(const struct bpf_map_info *info) } else if (info->btf_value_type_id) { int err; - err = btf__get_from_id(info->btf_id, &btf); - if (err || !btf) { + btf = btf__load_from_kernel_by_id(info->btf_id); + err = libbpf_get_error(btf); + if (err) { p_err("failed to get btf"); - btf = err ? 
ERR_PTR(err) : ERR_PTR(-ESRCH); + btf = ERR_PTR(err); } } @@ -1039,11 +1040,10 @@ static void print_key_value(struct bpf_map_info *info, void *key, void *value) { json_writer_t *btf_wtr; - struct btf *btf = NULL; - int err; + struct btf *btf; - err = btf__get_from_id(info->btf_id, &btf); - if (err) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { p_err("failed to get btf"); return; } @@ -1466,8 +1466,9 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage }\n" - " " HELP_SPEC_OPTIONS "\n" + " task_storage }\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-n|--nomount} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index f836d115d7d6..649053704bd7 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -729,6 +729,7 @@ static int do_help(int argc, char **argv) "\n" " " HELP_SPEC_PROGRAM "\n" " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" + " " HELP_SPEC_OPTIONS " }\n" "\n" "Note: Only xdp and tc attachments are supported now.\n" " For progs attached to cgroups, use \"bpftool cgroup\"\n" diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c index ad23934819c7..50de087b0db7 100644 --- a/tools/bpf/bpftool/perf.c +++ b/tools/bpf/bpftool/perf.c @@ -231,7 +231,10 @@ static int do_show(int argc, char **argv) static int do_help(int argc, char **argv) { fprintf(stderr, - "Usage: %1$s %2$s { show | list | help }\n" + "Usage: %1$s %2$s { show | list }\n" + " %1$s %2$s help\n" + "\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index cc48726740ad..9c3e343b7d87 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -249,10 +249,10 @@ static void show_prog_metadata(int fd, __u32 num_maps) struct bpf_map_info map_info; struct btf_var_secinfo *vsi; bool printed_header = false; - struct btf *btf = NULL; unsigned int i, vlen; void *value = NULL; const char *name; + struct btf *btf; int err; if (!num_maps) @@ -263,8 +263,8 @@ static void show_prog_metadata(int fd, __u32 num_maps) if (!value) return; - err = btf__get_from_id(map_info.btf_id, &btf); - if (err || !btf) + btf = btf__load_from_kernel_by_id(map_info.btf_id); + if (libbpf_get_error(btf)) goto out_free; t_datasec = btf__type_by_id(btf, map_info.btf_value_type_id); @@ -646,9 +646,12 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, member_len = info->xlated_prog_len; } - if (info->btf_id && btf__get_from_id(info->btf_id, &btf)) { - p_err("failed to get btf"); - return -1; + if (info->btf_id) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { + p_err("failed to get btf"); + return -1; + } } func_info = u64_to_ptr(info->func_info); @@ -781,6 +784,8 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, kernel_syms_destroy(&dd); } + btf__free(btf); + return 0; } @@ -2002,8 +2007,8 @@ static char *profile_target_name(int tgt_fd) struct bpf_prog_info_linear *info_linear; struct bpf_func_info *func_info; const struct btf_type *t; + struct btf *btf = NULL; char *name = NULL; - struct btf *btf; info_linear = bpf_program__get_prog_info_linear( tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO); @@ -2012,12 +2017,17 @@ static char *profile_target_name(int tgt_fd) return NULL; } - if
(info_linear->info.btf_id == 0 || - btf__get_from_id(info_linear->info.btf_id, &btf)) { + if (info_linear->info.btf_id == 0) { p_err("prog FD %d doesn't have valid btf", tgt_fd); goto out; } + btf = btf__load_from_kernel_by_id(info_linear->info.btf_id); + if (libbpf_get_error(btf)) { + p_err("failed to load btf for prog FD %d", tgt_fd); + goto out; + } + func_info = u64_to_ptr(info_linear->info.func_info); t = btf__type_by_id(btf, func_info[0].type_id); if (!t) { @@ -2027,6 +2037,7 @@ static char *profile_target_name(int tgt_fd) } name = strdup(btf__name_by_offset(btf, t->name_off)); out: + btf__free(btf); free(info_linear); return name; } @@ -2245,10 +2256,12 @@ static int do_help(int argc, char **argv) " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" " cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n" " struct_ops | fentry | fexit | freplace | sk_lookup }\n" - " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" - " flow_dissector }\n" + " ATTACH_TYPE := { msg_verdict | skb_verdict | stream_verdict |\n" + " stream_parser | flow_dissector }\n" " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} |\n" + " {-L|--use-loader} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index b58b91f62ffb..ab2d2290569a 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -572,8 +572,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s unregister STRUCT_OPS_MAP\n" " %1$s %2$s help\n" "\n" - " OPTIONS := { {-j|--json} [{-p|--pretty}] }\n" " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 3ad9301b0f00..de6365b53c9c 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -291,7 +291,7 @@ static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh) sh->sh_addralign = expected; if (gelf_update_shdr(scn, sh) == 0) { - printf("FAILED cannot update section header: %s\n", + pr_err("FAILED cannot update section header: %s\n", elf_errmsg(-1)); return -1; } @@ -317,6 +317,7 @@ static int elf_collect(struct object *obj) elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); if (!elf) { + close(fd); pr_err("FAILED cannot create ELF descriptor: %s\n", elf_errmsg(-1)); return -1; @@ -484,7 +485,7 @@ static int symbols_resolve(struct object *obj) err = libbpf_get_error(btf); if (err) { pr_err("FAILED: load BTF from %s: %s\n", - obj->path, strerror(-err)); + obj->btf ?: obj->path, strerror(-err)); return -1; } @@ -555,8 +556,7 @@ static int id_patch(struct object *obj, struct btf_id *id) int i; if (!id->id) { - pr_err("FAILED unresolved symbol %s\n", id->name); - return -EINVAL; + pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name); } for (i = 0; i < id->addr_cnt; i++) { @@ -734,8 +734,9 @@ int main(int argc, const char **argv) err = 0; out: - if (obj.efile.elf) + if (obj.efile.elf) { elf_end(obj.efile.elf); - close(obj.efile.fd); + close(obj.efile.fd); + } return err; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf9252c7381e..2db6925e04f4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -324,9 +324,6 @@ union bpf_iter_link_info { * **BPF_PROG_TYPE_SK_LOOKUP** 
* *data_in* and *data_out* must be NULL. * - * **BPF_PROG_TYPE_XDP** - * *ctx_in* and *ctx_out* must be NULL. - * * **BPF_PROG_TYPE_RAW_TRACEPOINT**, * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** * @@ -3249,7 +3246,7 @@ union bpf_attr { * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return @@ -4780,6 +4777,76 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. 
+ * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4951,6 +5018,11 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6077,6 +6149,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write.
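The struct bpf_timer and the four timer helpers documented above are used from BPF program code roughly as follows. This is a minimal sketch in the style of the kernel selftests (map layout, section name and the 1 ms period are illustrative; it assumes a vmlinux.h and helper definitions generated from this UAPI header)::

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  #define CLOCK_MONOTONIC 1

  struct elem {
          struct bpf_timer t;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct elem);
  } timer_map SEC(".maps");

  /* runs later, in soft irq context, possibly on another CPU */
  static int timer_cb(void *map, int *key, struct bpf_timer *timer)
  {
          bpf_printk("timer fired");
          return 0;       /* do not re-arm from the callback */
  }

  SEC("fentry/bpf_fentry_test1")
  int BPF_PROG(arm_timer)
  {
          int key = 0;
          struct elem *val = bpf_map_lookup_elem(&timer_map, &key);

          if (!val)
                  return 0;
          /* low 4 bits of flags select the clock */
          bpf_timer_init(&val->t, &timer_map, CLOCK_MONOTONIC);
          bpf_timer_set_callback(&val->t, timer_cb);
          bpf_timer_start(&val->t, 1000000 /* 1 ms */, 0);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";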
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index d208b2af697f..eb15f319aa57 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -653,6 +653,7 @@ enum { IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, __IFLA_BOND_MAX, }; diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 430f6874fa41..94f0a146bb7b 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o strset.o linker.o gen_loader.o + btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 7ff3d5ce44f9..77dc24d58302 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1179,7 +1179,7 @@ int btf__finalize_data(struct bpf_object *obj, struct btf *btf) static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); -int btf__load(struct btf *btf) +int btf__load_into_kernel(struct btf *btf) { __u32 log_buf_size = 0, raw_size; char *log_buf = NULL; @@ -1227,6 +1227,7 @@ done: free(log_buf); return libbpf_err(err); } +int btf__load(struct btf *) __attribute__((alias("btf__load_into_kernel"))); int btf__fd(const struct btf *btf) { @@ -1381,21 +1382,35 @@ exit_free: return btf; } -int btf__get_from_id(__u32 id, struct btf **btf) +struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf) { - struct btf *res; - int err, btf_fd; + struct btf *btf; + int btf_fd; - *btf = NULL; btf_fd = bpf_btf_get_fd_by_id(id); if (btf_fd < 0) - return libbpf_err(-errno); - - res = btf_get_from_fd(btf_fd, NULL); - err = libbpf_get_error(res); + return libbpf_err_ptr(-errno); + btf = btf_get_from_fd(btf_fd, base_btf); close(btf_fd); + return libbpf_ptr(btf); +} + +struct btf *btf__load_from_kernel_by_id(__u32 id) +{ + return btf__load_from_kernel_by_id_split(id, NULL); +} + +int btf__get_from_id(__u32 id, struct btf **btf) +{ + struct btf *res; + int err; + + *btf = NULL; + res = btf__load_from_kernel_by_id(id); + err = libbpf_get_error(res); + if (err) return libbpf_err(err); @@ -4020,7 +4035,7 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) */ if (d->hypot_adjust_canon) continue; - + if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) d->map[t_id] = c_id; @@ -4393,7 +4408,7 @@ static int btf_dedup_remap_types(struct btf_dedup *d) * Probe few well-known locations for vmlinux kernel image and try to load BTF * data out of it to use for target BTF. */ -struct btf *libbpf_find_kernel_btf(void) +struct btf *btf__load_vmlinux_btf(void) { struct { const char *path_fmt; @@ -4439,6 +4454,16 @@ struct btf *libbpf_find_kernel_btf(void) return libbpf_err_ptr(-ESRCH); } +struct btf *libbpf_find_kernel_btf(void) __attribute__((alias("btf__load_vmlinux_btf"))); + +struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf) +{ + char path[80]; + + snprintf(path, sizeof(path), "/sys/kernel/btf/%s", module_name); + return btf__parse_split(path, vmlinux_btf); +} + int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) { int i, n, err; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b54f1c3ebd57..4a711f990904 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -44,8 +44,17 @@ LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_b LIBBPF_API struct btf *btf__parse_raw(const char *path); LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__load_vmlinux_btf(void); +LIBBPF_API struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf); +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + +LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); +LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); +LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); + LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); LIBBPF_API int btf__load(struct btf *btf); +LIBBPF_API int btf__load_into_kernel(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, const char *type_name); LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, @@ -66,7 +75,6 @@ LIBBPF_API void btf__set_fd(struct btf *btf, int fd); LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); -LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, __u32 expected_key_size, __u32 expected_value_size, @@ -89,8 +97,6 @@ int btf_ext__reloc_line_info(const struct btf *btf, LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); -LIBBPF_API struct btf *libbpf_find_kernel_btf(void); - LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, @@ -184,6 +190,25 @@ LIBBPF_API int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, const struct btf_dump_emit_type_decl_opts *opts); + +struct btf_dump_type_data_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + const char *indent_str; + int indent_level; + /* below match "show" flags for bpf_show_snprintf() */ + bool compact; /* no newlines/indentation */ + bool skip_names; /* skip member/type names */ + bool emit_zeroes; /* show 0-valued fields */ + size_t :0; +}; +#define btf_dump_type_data_opts__last_field emit_zeroes + +LIBBPF_API int +btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts); + /* * A set of helpers for easier BTF types handling */
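btf_dump__dump_type_data(), declared above, renders a raw data blob according to a BTF type. A minimal usage sketch against vmlinux BTF (error handling abbreviated; note the pre-1.0 four-argument btf_dump__new() signature with a printf callback)::

  /* build: cc -o dump_data dump_data.c -lbpf */
  #include <bpf/btf.h>
  #include <bpf/libbpf.h>
  #include <stdarg.h>
  #include <stdio.h>

  static void print_fn(void *ctx, const char *fmt, va_list args)
  {
          vprintf(fmt, args);
  }

  int main(void)
  {
          DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts, .compact = true);
          int value = 42;         /* raw data blob to pretty-print */
          struct btf_dump *d;
          struct btf *btf;
          __s32 id;
          int err = 1;

          btf = btf__load_vmlinux_btf();
          if (libbpf_get_error(btf))
                  return 1;

          /* pre-1.0 signature: (btf, btf_ext, opts, printf_fn) */
          d = btf_dump__new(btf, NULL, NULL, print_fn);
          if (libbpf_get_error(d))
                  goto out;

          id = btf__find_by_name_kind(btf, "int", BTF_KIND_INT);
          if (id > 0) {
                  /* prints "(int)42" with .compact set */
                  err = btf_dump__dump_type_data(d, id, &value, sizeof(value), &opts) < 0;
                  printf("\n");
          }

          btf_dump__free(d);
  out:
          btf__free(btf);
          return err;
  }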
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 5dc6b5172bb3..e4b483f15fb9 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -10,6 +10,8 @@ #include <stddef.h> #include <stdlib.h> #include <string.h> +#include <ctype.h> +#include <endian.h> #include <errno.h> #include <linux/err.h> #include <linux/btf.h> @@ -53,6 +55,26 @@ struct btf_dump_type_aux_state { __u8 referenced: 1; }; +/* indent string length; one indent string is added for each indent level */ +#define BTF_DATA_INDENT_STR_LEN 32 + +/* + * Common internal data for BTF type data dump operations. + */ +struct btf_dump_data { + const void *data_end; /* end of valid data to show */ + bool compact; + bool skip_names; + bool emit_zeroes; + __u8 indent_lvl; /* base indent level */ + char indent_str[BTF_DATA_INDENT_STR_LEN]; + /* below are used during iteration */ + int depth; + bool is_array_member; + bool is_array_terminated; + bool is_array_char; +}; + struct btf_dump { const struct btf *btf; const struct btf_ext *btf_ext; @@ -60,6 +82,7 @@ struct btf_dump { struct btf_dump_opts opts; int ptr_sz; bool strip_mods; + bool skip_anon_defs; int last_id; /* per-type auxiliary state */ @@ -89,6 +112,10 @@ struct btf_dump { * name occurrences */ struct hashmap *ident_names; + /* + * data for typed display; allocated if needed. + */ + struct btf_dump_data *typed_dump; }; static size_t str_hash_fn(const void *key, void *ctx) @@ -765,11 +792,11 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) break; case BTF_KIND_FUNC_PROTO: { const struct btf_param *p = btf_params(t); - __u16 vlen = btf_vlen(t); + __u16 n = btf_vlen(t); int i; btf_dump_emit_type(d, t->type, cont_id); - for (i = 0; i < vlen; i++, p++) + for (i = 0; i < n; i++, p++) btf_dump_emit_type(d, p->type, cont_id); break; @@ -852,8 +879,9 @@ static void btf_dump_emit_bit_padding(const struct btf_dump *d, static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, const struct btf_type *t) { - btf_dump_printf(d, "%s %s", + btf_dump_printf(d, "%s%s%s", btf_is_struct(t) ? "struct" : "union", + t->name_off ?
" " : "", btf_dump_type_name(d, id)); } @@ -1259,7 +1287,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, case BTF_KIND_UNION: btf_dump_emit_mods(d, decls); /* inline anonymous struct/union */ - if (t->name_off == 0) + if (t->name_off == 0 && !d->skip_anon_defs) btf_dump_emit_struct_def(d, id, t, lvl); else btf_dump_emit_struct_fwd(d, id, t); @@ -1267,7 +1295,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, case BTF_KIND_ENUM: btf_dump_emit_mods(d, decls); /* inline anonymous enum */ - if (t->name_off == 0) + if (t->name_off == 0 && !d->skip_anon_defs) btf_dump_emit_enum_def(d, id, t, lvl); else btf_dump_emit_enum_fwd(d, id, t); @@ -1392,6 +1420,39 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, btf_dump_emit_name(d, fname, last_was_ptr); } +/* show type name as (type_name) */ +static void btf_dump_emit_type_cast(struct btf_dump *d, __u32 id, + bool top_level) +{ + const struct btf_type *t; + + /* for array members, we don't bother emitting type name for each + * member to avoid the redundancy of + * .name = (char[4])[(char)'f',(char)'o',(char)'o',] + */ + if (d->typed_dump->is_array_member) + return; + + /* avoid type name specification for variable/section; it will be done + * for the associated variable value(s). + */ + t = btf__type_by_id(d->btf, id); + if (btf_is_var(t) || btf_is_datasec(t)) + return; + + if (top_level) + btf_dump_printf(d, "("); + + d->skip_anon_defs = true; + d->strip_mods = true; + btf_dump_emit_type_decl(d, id, "", 0); + d->strip_mods = false; + d->skip_anon_defs = false; + + if (top_level) + btf_dump_printf(d, ")"); +} + /* return number of duplicates (occurrences) of a given name */ static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, const char *orig_name) @@ -1442,3 +1503,803 @@ static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id) { return btf_dump_resolve_name(d, id, d->ident_names); } + +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz); + +static const char *btf_dump_data_newline(struct btf_dump *d) +{ + return d->typed_dump->compact || d->typed_dump->depth == 0 ? "" : "\n"; +} + +static const char *btf_dump_data_delim(struct btf_dump *d) +{ + return d->typed_dump->depth == 0 ? "" : ","; +} + +static void btf_dump_data_pfx(struct btf_dump *d) +{ + int i, lvl = d->typed_dump->indent_lvl + d->typed_dump->depth; + + if (d->typed_dump->compact) + return; + + for (i = 0; i < lvl; i++) + btf_dump_printf(d, "%s", d->typed_dump->indent_str); +} + +/* A macro is used here as btf_type_value[s]() appends format specifiers + * to the format specifier passed in; these do the work of appending + * delimiters etc while the caller simply has to specify the type values + * in the format specifier + value(s). + */ +#define btf_dump_type_values(d, fmt, ...) 
\ + btf_dump_printf(d, fmt "%s%s", \ + ##__VA_ARGS__, \ + btf_dump_data_delim(d), \ + btf_dump_data_newline(d)) + +static int btf_dump_unsupported_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id) +{ + btf_dump_printf(d, "<unsupported kind:%u>", btf_kind(t)); + return -ENOTSUP; +} + +static int btf_dump_get_bitfield_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz, + __u64 *value) +{ + __u16 left_shift_bits, right_shift_bits; + __u8 nr_copy_bits, nr_copy_bytes; + const __u8 *bytes = data; + int sz = t->size; + __u64 num = 0; + int i; + + /* Maximum supported bitfield size is 64 bits */ + if (sz > 8) { + pr_warn("unexpected bitfield size %d\n", sz); + return -EINVAL; + } + + /* Bitfield value retrieval is done in two steps; first relevant bytes are + * stored in num, then we left/right shift num to eliminate irrelevant bits. + */ + nr_copy_bits = bit_sz + bits_offset; + nr_copy_bytes = t->size; +#if __BYTE_ORDER == __LITTLE_ENDIAN + for (i = nr_copy_bytes - 1; i >= 0; i--) + num = num * 256 + bytes[i]; +#elif __BYTE_ORDER == __BIG_ENDIAN + for (i = 0; i < nr_copy_bytes; i++) + num = num * 256 + bytes[i]; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + left_shift_bits = 64 - nr_copy_bits; + right_shift_bits = 64 - bit_sz; + + *value = (num << left_shift_bits) >> right_shift_bits; + + return 0; +} + +static int btf_dump_bitfield_check_zero(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 check_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &check_num); + if (err) + return err; + if (check_num == 0) + return -ENODATA; + return 0; +} + +static int btf_dump_bitfield_data(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 print_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &print_num); + if (err) + return err; + + btf_dump_type_values(d, "0x%llx", (unsigned long long)print_num); + + return 0; +} + +/* ints, floats and ptrs */ +static int btf_dump_base_type_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + static __u8 bytecmp[16] = {}; + int nr_bytes; + + /* For pointer types, pointer size is not defined on a per-type basis. + * On dump creation however, we store the pointer size. + */ + if (btf_kind(t) == BTF_KIND_PTR) + nr_bytes = d->ptr_sz; + else + nr_bytes = t->size; + + if (nr_bytes < 1 || nr_bytes > 16) { + pr_warn("unexpected size %d for id [%u]\n", nr_bytes, id); + return -EINVAL; + } + + if (memcmp(data, bytecmp, nr_bytes) == 0) + return -ENODATA; + return 0; +} + +static bool ptr_is_aligned(const void *data, int data_sz) +{ + return ((uintptr_t)data) % data_sz == 0; +} + +static int btf_dump_int_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data, + __u8 bits_offset) +{ + __u8 encoding = btf_int_encoding(t); + bool sign = encoding & BTF_INT_SIGNED; + int sz = t->size; + + if (sz == 0) { + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + + /* handle packed int data - accesses of integers not aligned on + * int boundaries can cause problems on some platforms. 
+ */ + if (!ptr_is_aligned(data, sz)) + return btf_dump_bitfield_data(d, t, data, 0, 0); + + switch (sz) { + case 16: { + const __u64 *ints = data; + __u64 lsi, msi; + + /* avoid use of __int128 as some 32-bit platforms do not + * support it. + */ +#if __BYTE_ORDER == __LITTLE_ENDIAN + lsi = ints[0]; + msi = ints[1]; +#elif __BYTE_ORDER == __BIG_ENDIAN + lsi = ints[1]; + msi = ints[0]; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (msi == 0) + btf_dump_type_values(d, "0x%llx", (unsigned long long)lsi); + else + btf_dump_type_values(d, "0x%llx%016llx", (unsigned long long)msi, + (unsigned long long)lsi); + break; + } + case 8: + if (sign) + btf_dump_type_values(d, "%lld", *(long long *)data); + else + btf_dump_type_values(d, "%llu", *(unsigned long long *)data); + break; + case 4: + if (sign) + btf_dump_type_values(d, "%d", *(__s32 *)data); + else + btf_dump_type_values(d, "%u", *(__u32 *)data); + break; + case 2: + if (sign) + btf_dump_type_values(d, "%d", *(__s16 *)data); + else + btf_dump_type_values(d, "%u", *(__u16 *)data); + break; + case 1: + if (d->typed_dump->is_array_char) { + /* check for null terminator */ + if (d->typed_dump->is_array_terminated) + break; + if (*(char *)data == '\0') { + d->typed_dump->is_array_terminated = true; + break; + } + if (isprint(*(char *)data)) { + btf_dump_type_values(d, "'%c'", *(char *)data); + break; + } + } + if (sign) + btf_dump_type_values(d, "%d", *(__s8 *)data); + else + btf_dump_type_values(d, "%u", *(__u8 *)data); + break; + default: + pr_warn("unexpected sz %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +union float_data { + long double ld; + double d; + float f; +}; + +static int btf_dump_float_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data) +{ + const union float_data *flp = data; + union float_data fl; + int sz = t->size; + + /* handle unaligned data; copy to local union */ + if (!ptr_is_aligned(data, sz)) { + memcpy(&fl, data, sz); + flp = &fl; + } + + switch (sz) { + case 16: + btf_dump_type_values(d, "%Lf", flp->ld); + break; + case 8: + btf_dump_type_values(d, "%lf", flp->d); + break; + case 4: + btf_dump_type_values(d, "%f", flp->f); + break; + default: + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +static int btf_dump_var_data(struct btf_dump *d, + const struct btf_type *v, + __u32 id, + const void *data) +{ + enum btf_func_linkage linkage = btf_var(v)->linkage; + const struct btf_type *t; + const char *l; + __u32 type_id; + + switch (linkage) { + case BTF_FUNC_STATIC: + l = "static "; + break; + case BTF_FUNC_EXTERN: + l = "extern "; + break; + case BTF_FUNC_GLOBAL: + default: + l = ""; + break; + } + + /* format of output here is [linkage] [type] [varname] = (type)value, + * for example "static int cpu_profile_flip = (int)1" + */ + btf_dump_printf(d, "%s", l); + type_id = v->type; + t = btf__type_by_id(d->btf, type_id); + btf_dump_emit_type_cast(d, type_id, false); + btf_dump_printf(d, " %s = ", btf_name_of(d, v->name_off)); + return btf_dump_dump_type_data(d, NULL, t, type_id, data, 0, 0); +} + +static int btf_dump_array_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 i, elem_size = 0, elem_type_id; + bool is_array_member; + + elem_type_id = array->type; + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + elem_size = btf__resolve_size(d->btf, 
elem_type_id); + if (elem_size <= 0) { + pr_warn("unexpected elem size %d for array type [%u]\n", elem_size, id); + return -EINVAL; + } + + if (btf_is_int(elem_type)) { + /* + * BTF_INT_CHAR encoding never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + d->typed_dump->is_array_char = true; + } + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "[%s", btf_dump_data_newline(d)); + + /* may be a multidimensional array, so store current "is array member" + * status so we can restore it correctly later. + */ + is_array_member = d->typed_dump->is_array_member; + d->typed_dump->is_array_member = true; + for (i = 0; i < array->nelems; i++, data += elem_size) { + if (d->typed_dump->is_array_terminated) + break; + btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); + } + d->typed_dump->is_array_member = is_array_member; + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "]"); + + return 0; +} + +static int btf_dump_struct_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + int i, err; + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. 
+ */ + d->typed_dump->depth++; + btf_dump_printf(d, "{%s", btf_dump_data_newline(d)); + + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + const char *mname; + __u32 moffset; + __u8 bit_sz; + + mtype = btf__type_by_id(d->btf, m->type); + mname = btf_name_of(d, m->name_off); + moffset = btf_member_bit_offset(t, i); + + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_dump_type_data(d, mname, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err < 0) + return err; + } + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "}"); + return err; +} + +union ptr_data { + unsigned int p; + unsigned long long lp; +}; + +static int btf_dump_ptr_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + if (ptr_is_aligned(data, d->ptr_sz) && d->ptr_sz == sizeof(void *)) { + btf_dump_type_values(d, "%p", *(void **)data); + } else { + union ptr_data pt; + + memcpy(&pt, data, d->ptr_sz); + if (d->ptr_sz == 4) + btf_dump_type_values(d, "0x%x", pt.p); + else + btf_dump_type_values(d, "0x%llx", pt.lp); + } + return 0; +} + +static int btf_dump_get_enum_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u32 id, + __s64 *value) +{ + int sz = t->size; + + /* handle unaligned enum value */ + if (!ptr_is_aligned(data, sz)) { + __u64 val; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, 0, 0, &val); + if (err) + return err; + *value = (__s64)val; + return 0; + } + + switch (t->size) { + case 8: + *value = *(__s64 *)data; + return 0; + case 4: + *value = *(__s32 *)data; + return 0; + case 2: + *value = *(__s16 *)data; + return 0; + case 1: + *value = *(__s8 *)data; + return 0; + default: + pr_warn("unexpected size %d for enum, id:[%u]\n", t->size, id); + return -EINVAL; + } +} + +static int btf_dump_enum_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_enum *e; + __s64 value; + int i, err; + + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + + for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { + if (value != e->val) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, "%d", value); + return 0; +} + +static int btf_dump_datasec_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + __u32 i; + int err; + + btf_dump_type_values(d, "SEC(\"%s\") ", btf_name_of(d, t->name_off)); + + for (i = 0, vsi = btf_var_secinfos(t); i < btf_vlen(t); i++, vsi++) { + var = btf__type_by_id(d->btf, vsi->type); + err = btf_dump_dump_type_data(d, NULL, var, vsi->type, data + vsi->offset, 0, 0); + if (err < 0) + return err; + btf_dump_printf(d, ";"); + } + return 0; +} + +/* return size of type, or if base type overflows, return -E2BIG. */ +static int btf_dump_type_data_check_overflow(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset) +{ + __s64 size = btf__resolve_size(d->btf, id); + + if (size < 0 || size >= INT_MAX) { + pr_warn("unexpected size [%zu] for id [%u]\n", + (size_t)size, id); + return -EINVAL; + } + + /* Only do overflow checking for base types; we do not want to + * avoid showing part of a struct, union or array, even if we + * do not have enough data to show the full object. 
By + * restricting overflow checking to base types we can ensure + * that partial display succeeds, while avoiding overflowing + * and using bogus data for display. + */ + t = skip_mods_and_typedefs(d->btf, id, NULL); + if (!t) { + pr_warn("unexpected error skipping mods/typedefs for id [%u]\n", + id); + return -EINVAL; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + case BTF_KIND_ENUM: + if (data + bits_offset / 8 + size > d->typed_dump->data_end) + return -E2BIG; + break; + default: + break; + } + return (int)size; +} + +static int btf_dump_type_data_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __s64 value; + int i, err; + + /* toplevel exceptions; we show zero values if + * - we ask for them (emit_zeros) + * - if we are at top-level so we see "struct empty { }" + * - or if we are an array member and the array is non-empty and + * not a char array; we don't want to be in a situation where we + * have an integer array 0, 1, 0, 1 and only show non-zero values. + * If the array contains zeroes only, or is a char array starting + * with a '\0', the array-level check_zero() will prevent showing it; + * we are concerned with determining zero value at the array member + * level here. + */ + if (d->typed_dump->emit_zeroes || d->typed_dump->depth == 0 || + (d->typed_dump->is_array_member && + !d->typed_dump->is_array_char)) + return 0; + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + if (bit_sz) + return btf_dump_bitfield_check_zero(d, t, data, bits_offset, bit_sz); + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_ARRAY: { + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 elem_type_id, elem_size; + bool ischar; + + elem_type_id = array->type; + elem_size = btf__resolve_size(d->btf, elem_type_id); + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + + ischar = btf_is_int(elem_type) && elem_size == 1; + + /* check all elements; if _any_ element is nonzero, all + * of array is displayed. We make an exception however + * for char arrays where the first element is 0; these + * are considered zeroed also, even if later elements are + * non-zero because the string is terminated. + */ + for (i = 0; i < array->nelems; i++) { + if (i == 0 && ischar && *(char *)data == 0) + return -ENODATA; + err = btf_dump_type_data_check_zero(d, elem_type, + elem_type_id, + data + + (i * elem_size), + bits_offset, 0); + if (err != -ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + + /* if any struct/union member is non-zero, the struct/union + * is considered non-zero and dumped. + */ + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + __u32 moffset; + + mtype = btf__type_by_id(d->btf, m->type); + moffset = btf_member_bit_offset(t, i); + + /* btf_int_bits() does not store member bitfield size; + * bitfield size needs to be stored here so int display + * of member can retrieve it. 
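A small illustrative case for these rules (hypothetical struct, emit_zeroes unset, not part of the patch):

	struct example {
		int  a;       /* 0: a zero member, suppressed             */
		int  b[4];    /* {0, 1, 0, 1}: one non-zero element       */
		              /* means all four elements are displayed    */
		char name[8]; /* "": leading '\0' counts as zero, so      */
		              /* the whole char array is suppressed       */
	};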
+	 */
+			bit_sz = btf_member_bitfield_size(t, i);
+			err = btf_dump_type_data_check_zero(d, mtype, m->type, data + moffset / 8,
+							    moffset % 8, bit_sz);
+			if (err != -ENODATA)
+				return err;
+		}
+		return -ENODATA;
+	}
+	case BTF_KIND_ENUM:
+		err = btf_dump_get_enum_value(d, t, data, id, &value);
+		if (err)
+			return err;
+		if (value == 0)
+			return -ENODATA;
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+/* returns size of data dumped, or error. */
+static int btf_dump_dump_type_data(struct btf_dump *d,
+				   const char *fname,
+				   const struct btf_type *t,
+				   __u32 id,
+				   const void *data,
+				   __u8 bits_offset,
+				   __u8 bit_sz)
+{
+	int size, err;
+
+	size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset);
+	if (size < 0)
+		return size;
+	err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz);
+	if (err) {
+		/* zeroed data is expected and not an error, so simply skip
+		 * dumping such data. Record other errors however.
+		 */
+		if (err == -ENODATA)
+			return size;
+		return err;
+	}
+	btf_dump_data_pfx(d);
+
+	if (!d->typed_dump->skip_names) {
+		if (fname && strlen(fname) > 0)
+			btf_dump_printf(d, ".%s = ", fname);
+		btf_dump_emit_type_cast(d, id, true);
+	}
+
+	t = skip_mods_and_typedefs(d->btf, id, NULL);
+
+	switch (btf_kind(t)) {
+	case BTF_KIND_UNKN:
+	case BTF_KIND_FWD:
+	case BTF_KIND_FUNC:
+	case BTF_KIND_FUNC_PROTO:
+		err = btf_dump_unsupported_data(d, t, id);
+		break;
+	case BTF_KIND_INT:
+		if (bit_sz)
+			err = btf_dump_bitfield_data(d, t, data, bits_offset, bit_sz);
+		else
+			err = btf_dump_int_data(d, t, id, data, bits_offset);
+		break;
+	case BTF_KIND_FLOAT:
+		err = btf_dump_float_data(d, t, id, data);
+		break;
+	case BTF_KIND_PTR:
+		err = btf_dump_ptr_data(d, t, id, data);
+		break;
+	case BTF_KIND_ARRAY:
+		err = btf_dump_array_data(d, t, id, data);
+		break;
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+		err = btf_dump_struct_data(d, t, id, data);
+		break;
+	case BTF_KIND_ENUM:
+		/* handle bitfield and int enum values */
+		if (bit_sz) {
+			__u64 print_num;
+			__s64 enum_val;
+
+			err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz,
+							  &print_num);
+			if (err)
+				break;
+			enum_val = (__s64)print_num;
+			err = btf_dump_enum_data(d, t, id, &enum_val);
+		} else
+			err = btf_dump_enum_data(d, t, id, data);
+		break;
+	case BTF_KIND_VAR:
+		err = btf_dump_var_data(d, t, id, data);
+		break;
+	case BTF_KIND_DATASEC:
+		err = btf_dump_datasec_data(d, t, id, data);
+		break;
+	default:
+		pr_warn("unexpected kind [%u] for id [%u]\n",
+			BTF_INFO_KIND(t->info), id);
+		return -EINVAL;
+	}
+	if (err < 0)
+		return err;
+	return size;
+}
+
+int btf_dump__dump_type_data(struct btf_dump *d, __u32 id,
+			     const void *data, size_t data_sz,
+			     const struct btf_dump_type_data_opts *opts)
+{
+	struct btf_dump_data typed_dump = {};
+	const struct btf_type *t;
+	int ret;
+
+	if (!OPTS_VALID(opts, btf_dump_type_data_opts))
+		return libbpf_err(-EINVAL);
+
+	t = btf__type_by_id(d->btf, id);
+	if (!t)
+		return libbpf_err(-ENOENT);
+
+	d->typed_dump = &typed_dump;
+	d->typed_dump->data_end = data + data_sz;
+	d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0);
+
+	/* default indent string is a tab */
+	if (!opts->indent_str)
+		d->typed_dump->indent_str[0] = '\t';
+	else
+		strncat(d->typed_dump->indent_str, opts->indent_str,
+			sizeof(d->typed_dump->indent_str) - 1);
+
+	d->typed_dump->compact = OPTS_GET(opts, compact, false);
+	d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false);
+	d->typed_dump->emit_zeroes = OPTS_GET(opts, emit_zeroes, false);
+
+	ret = btf_dump_dump_type_data(d,
NULL, t, id, data, 0, 0); + + d->typed_dump = NULL; + + return libbpf_err(ret); +} diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6f5e2757bb3c..cb106e8c42cb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -498,6 +498,10 @@ struct bpf_object { * it at load time. */ struct btf *btf_vmlinux; + /* Path to the custom BTF to be used for BPF CO-RE relocations as an + * override for vmlinux BTF. + */ + char *btf_custom_path; /* vmlinux BTF override for CO-RE relocations */ struct btf *btf_vmlinux_override; /* Lazily initialized kernel module BTFs */ @@ -591,11 +595,6 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } -static bool is_ldimm64_insn(struct bpf_insn *insn) -{ - return insn->code == (BPF_LD | BPF_IMM | BPF_DW); -} - static bool is_call_insn(const struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL); @@ -2645,8 +2644,10 @@ static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) struct bpf_program *prog; int i; - /* CO-RE relocations need kernel BTF */ - if (obj->btf_ext && obj->btf_ext->core_relo_info.len) + /* CO-RE relocations need kernel BTF, only when btf_custom_path + * is not specified + */ + if (obj->btf_ext && obj->btf_ext->core_relo_info.len && !obj->btf_custom_path) return true; /* Support for typed ksyms needs kernel BTF */ @@ -2679,7 +2680,7 @@ static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force) if (!force && !obj_needs_vmlinux_btf(obj)) return 0; - obj->btf_vmlinux = libbpf_find_kernel_btf(); + obj->btf_vmlinux = btf__load_vmlinux_btf(); err = libbpf_get_error(obj->btf_vmlinux); if (err) { pr_warn("Error loading vmlinux BTF: %d\n", err); @@ -2768,7 +2769,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) */ btf__set_fd(kern_btf, 0); } else { - err = btf__load(kern_btf); + err = btf__load_into_kernel(kern_btf); } if (sanitize) { if (!err) { @@ -3894,6 +3895,42 @@ static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) return 0; } +static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) +{ + char file[PATH_MAX], buff[4096]; + FILE *fp; + __u32 val; + int err; + + snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); + memset(info, 0, sizeof(*info)); + + fp = fopen(file, "r"); + if (!fp) { + err = -errno; + pr_warn("failed to open %s: %d. 
No procfs support?\n", file, + err); + return err; + } + + while (fgets(buff, sizeof(buff), fp)) { + if (sscanf(buff, "map_type:\t%u", &val) == 1) + info->type = val; + else if (sscanf(buff, "key_size:\t%u", &val) == 1) + info->key_size = val; + else if (sscanf(buff, "value_size:\t%u", &val) == 1) + info->value_size = val; + else if (sscanf(buff, "max_entries:\t%u", &val) == 1) + info->max_entries = val; + else if (sscanf(buff, "map_flags:\t%i", &val) == 1) + info->map_flags = val; + } + + fclose(fp); + + return 0; +} + int bpf_map__reuse_fd(struct bpf_map *map, int fd) { struct bpf_map_info info = {}; @@ -3902,6 +3939,8 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) char *new_name; err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(fd, &info); if (err) return libbpf_err(err); @@ -4381,12 +4420,16 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) struct bpf_map_info map_info = {}; char msg[STRERR_BUFSIZE]; __u32 map_info_len; + int err; map_info_len = sizeof(map_info); - if (bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len)) { - pr_warn("failed to get map info for map FD %d: %s\n", - map_fd, libbpf_strerror_r(errno, msg, sizeof(msg))); + err = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(map_fd, &map_info); + if (err) { + pr_warn("failed to get map info for map FD %d: %s\n", map_fd, + libbpf_strerror_r(errno, msg, sizeof(msg))); return false; } @@ -4479,6 +4522,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b { struct bpf_create_map_attr create_attr; struct bpf_map_def *def = &map->def; + int err = 0; memset(&create_attr, 0, sizeof(create_attr)); @@ -4521,8 +4565,6 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b if (bpf_map_type__is_map_in_map(def->type)) { if (map->inner_map) { - int err; - err = bpf_object__create_map(obj, map->inner_map, true); if (err) { pr_warn("map '%s': failed to create inner map: %d\n", @@ -4547,8 +4589,8 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b if (map->fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { char *cp, errmsg[STRERR_BUFSIZE]; - int err = -errno; + err = -errno; cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", map->name, cp, err); @@ -4560,8 +4602,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b map->fd = bpf_create_map_xattr(&create_attr); } - if (map->fd < 0) - return -errno; + err = map->fd < 0 ? 
-errno : 0; if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { if (obj->gen_loader) @@ -4570,7 +4611,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b zfree(&map->inner_map); } - return 0; + return err; } static int init_map_slots(struct bpf_object *obj, struct bpf_map *map) @@ -4616,10 +4657,13 @@ bpf_object__create_maps(struct bpf_object *obj) char *cp, errmsg[STRERR_BUFSIZE]; unsigned int i, j; int err; + bool retried; for (i = 0; i < obj->nr_maps; i++) { map = &obj->maps[i]; + retried = false; +retry: if (map->pin_path) { err = bpf_object__reuse_map(map); if (err) { @@ -4627,6 +4671,12 @@ bpf_object__create_maps(struct bpf_object *obj) map->name); goto err_out; } + if (retried && map->fd < 0) { + pr_warn("map '%s': cannot find pinned map\n", + map->name); + err = -ENOENT; + goto err_out; + } } if (map->fd >= 0) { @@ -4660,9 +4710,13 @@ bpf_object__create_maps(struct bpf_object *obj) if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { + zclose(map->fd); + if (!retried && err == -EEXIST) { + retried = true; + goto retry; + } pr_warn("map '%s': failed to auto-pin at '%s': %d\n", map->name, map->pin_path, err); - zclose(map->fd); goto err_out; } } @@ -4679,279 +4733,6 @@ err_out: return err; } -#define BPF_CORE_SPEC_MAX_LEN 64 - -/* represents BPF CO-RE field or array element accessor */ -struct bpf_core_accessor { - __u32 type_id; /* struct/union type or array element type */ - __u32 idx; /* field index or array index */ - const char *name; /* field name or NULL for array accessor */ -}; - -struct bpf_core_spec { - const struct btf *btf; - /* high-level spec: named fields and array indices only */ - struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; - /* original unresolved (no skip_mods_or_typedefs) root type ID */ - __u32 root_type_id; - /* CO-RE relocation kind */ - enum bpf_core_relo_kind relo_kind; - /* high-level spec length */ - int len; - /* raw, low-level spec: 1-to-1 with accessor spec string */ - int raw_spec[BPF_CORE_SPEC_MAX_LEN]; - /* raw spec length */ - int raw_len; - /* field bit offset represented by spec */ - __u32 bit_offset; -}; - -static bool str_is_empty(const char *s) -{ - return !s || !s[0]; -} - -static bool is_flex_arr(const struct btf *btf, - const struct bpf_core_accessor *acc, - const struct btf_array *arr) -{ - const struct btf_type *t; - - /* not a flexible array, if not inside a struct or has non-zero size */ - if (!acc->name || arr->nelems > 0) - return false; - - /* has to be the last member of enclosing struct */ - t = btf__type_by_id(btf, acc->type_id); - return acc->idx == btf_vlen(t) - 1; -} - -static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) -{ - switch (kind) { - case BPF_FIELD_BYTE_OFFSET: return "byte_off"; - case BPF_FIELD_BYTE_SIZE: return "byte_sz"; - case BPF_FIELD_EXISTS: return "field_exists"; - case BPF_FIELD_SIGNED: return "signed"; - case BPF_FIELD_LSHIFT_U64: return "lshift_u64"; - case BPF_FIELD_RSHIFT_U64: return "rshift_u64"; - case BPF_TYPE_ID_LOCAL: return "local_type_id"; - case BPF_TYPE_ID_TARGET: return "target_type_id"; - case BPF_TYPE_EXISTS: return "type_exists"; - case BPF_TYPE_SIZE: return "type_size"; - case BPF_ENUMVAL_EXISTS: return "enumval_exists"; - case BPF_ENUMVAL_VALUE: return "enumval_value"; - default: return "unknown"; - } -} - -static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) -{ - switch (kind) { - case BPF_FIELD_BYTE_OFFSET: - case BPF_FIELD_BYTE_SIZE: - case BPF_FIELD_EXISTS: - case 
BPF_FIELD_SIGNED:
-	case BPF_FIELD_LSHIFT_U64:
-	case BPF_FIELD_RSHIFT_U64:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static bool core_relo_is_type_based(enum bpf_core_relo_kind kind)
-{
-	switch (kind) {
-	case BPF_TYPE_ID_LOCAL:
-	case BPF_TYPE_ID_TARGET:
-	case BPF_TYPE_EXISTS:
-	case BPF_TYPE_SIZE:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind)
-{
-	switch (kind) {
-	case BPF_ENUMVAL_EXISTS:
-	case BPF_ENUMVAL_VALUE:
-		return true;
-	default:
-		return false;
-	}
-}
-
-/*
- * Turn bpf_core_relo into a low- and high-level spec representation,
- * validating correctness along the way, as well as calculating resulting
- * field bit offset, specified by accessor string. Low-level spec captures
- * every single level of nestedness, including traversing anonymous
- * struct/union members. High-level one only captures semantically meaningful
- * "turning points": named fields and array indices.
- * E.g., for this case:
- *
- *   struct sample {
- *       int __unimportant;
- *       struct {
- *           int __1;
- *           int __2;
- *           int a[7];
- *       };
- *   };
- *
- *   struct sample *s = ...;
- *
- *   int x = &s->a[3]; // access string = '0:1:2:3'
- *
- * Low-level spec has 1:1 mapping with each element of access string (it's
- * just a parsed access string representation): [0, 1, 2, 3].
- *
- * High-level spec will capture only 3 points:
- *   - initial zero-index access by pointer (&s->... is the same as &s[0]...);
- *   - field 'a' access (corresponds to '2' in low-level spec);
- *   - array element #3 access (corresponds to '3' in low-level spec).
- *
- * Type-based relocations (TYPE_EXISTS/TYPE_SIZE,
- * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their
- * spec and raw_spec are kept empty.
- *
- * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access
- * string to specify enumerator's value index that needs to be relocated.
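For the enum value-based case, the access string is simply an index into the enumerator list; a hypothetical illustration:

	enum pipe_state { OPEN, DRAINING, CLOSED };
	/* ENUMVAL_VALUE with access string "2" selects enumerator index 2
	 * (CLOSED); the patched value is whatever the target BTF records
	 * for that enumerator.
	 */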
- */ -static int bpf_core_parse_spec(const struct btf *btf, - __u32 type_id, - const char *spec_str, - enum bpf_core_relo_kind relo_kind, - struct bpf_core_spec *spec) -{ - int access_idx, parsed_len, i; - struct bpf_core_accessor *acc; - const struct btf_type *t; - const char *name; - __u32 id; - __s64 sz; - - if (str_is_empty(spec_str) || *spec_str == ':') - return -EINVAL; - - memset(spec, 0, sizeof(*spec)); - spec->btf = btf; - spec->root_type_id = type_id; - spec->relo_kind = relo_kind; - - /* type-based relocations don't have a field access string */ - if (core_relo_is_type_based(relo_kind)) { - if (strcmp(spec_str, "0")) - return -EINVAL; - return 0; - } - - /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ - while (*spec_str) { - if (*spec_str == ':') - ++spec_str; - if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) - return -EINVAL; - if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) - return -E2BIG; - spec_str += parsed_len; - spec->raw_spec[spec->raw_len++] = access_idx; - } - - if (spec->raw_len == 0) - return -EINVAL; - - t = skip_mods_and_typedefs(btf, type_id, &id); - if (!t) - return -EINVAL; - - access_idx = spec->raw_spec[0]; - acc = &spec->spec[0]; - acc->type_id = id; - acc->idx = access_idx; - spec->len++; - - if (core_relo_is_enumval_based(relo_kind)) { - if (!btf_is_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) - return -EINVAL; - - /* record enumerator name in a first accessor */ - acc->name = btf__name_by_offset(btf, btf_enum(t)[access_idx].name_off); - return 0; - } - - if (!core_relo_is_field_based(relo_kind)) - return -EINVAL; - - sz = btf__resolve_size(btf, id); - if (sz < 0) - return sz; - spec->bit_offset = access_idx * sz * 8; - - for (i = 1; i < spec->raw_len; i++) { - t = skip_mods_and_typedefs(btf, id, &id); - if (!t) - return -EINVAL; - - access_idx = spec->raw_spec[i]; - acc = &spec->spec[spec->len]; - - if (btf_is_composite(t)) { - const struct btf_member *m; - __u32 bit_offset; - - if (access_idx >= btf_vlen(t)) - return -EINVAL; - - bit_offset = btf_member_bit_offset(t, access_idx); - spec->bit_offset += bit_offset; - - m = btf_members(t) + access_idx; - if (m->name_off) { - name = btf__name_by_offset(btf, m->name_off); - if (str_is_empty(name)) - return -EINVAL; - - acc->type_id = id; - acc->idx = access_idx; - acc->name = name; - spec->len++; - } - - id = m->type; - } else if (btf_is_array(t)) { - const struct btf_array *a = btf_array(t); - bool flex; - - t = skip_mods_and_typedefs(btf, a->type, &id); - if (!t) - return -EINVAL; - - flex = is_flex_arr(btf, acc - 1, a); - if (!flex && access_idx >= a->nelems) - return -EINVAL; - - spec->spec[spec->len].type_id = id; - spec->spec[spec->len].idx = access_idx; - spec->len++; - - sz = btf__resolve_size(btf, id); - if (sz < 0) - return sz; - spec->bit_offset += access_idx * sz * 8; - } else { - pr_warn("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", - type_id, spec_str, i, id, btf_kind_str(t)); - return -EINVAL; - } - } - - return 0; -} - static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ @@ -4964,7 +4745,7 @@ static bool bpf_core_is_flavor_sep(const char *s) * before last triple underscore. Struct name part after last triple * underscore is ignored by BPF CO-RE relocation during relocation matching. 
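As a concrete illustration (hypothetical flavor names): `task_struct`, `task_struct___v1` and `task_struct___renamed` all share the essential name `task_struct`, so bpf_core_essential_name_len() returns 11 for each, and all three are matched against the same kernel type:

	struct task_struct___v1      { long state; };
	struct task_struct___renamed { unsigned int __state; };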
*/ -static size_t bpf_core_essential_name_len(const char *name) +size_t bpf_core_essential_name_len(const char *name) { size_t n = strlen(name); int i; @@ -4976,34 +4757,20 @@ static size_t bpf_core_essential_name_len(const char *name) return n; } -struct core_cand -{ - const struct btf *btf; - const struct btf_type *t; - const char *name; - __u32 id; -}; - -/* dynamically sized list of type IDs and its associated struct btf */ -struct core_cand_list { - struct core_cand *cands; - int len; -}; - -static void bpf_core_free_cands(struct core_cand_list *cands) +static void bpf_core_free_cands(struct bpf_core_cand_list *cands) { free(cands->cands); free(cands); } -static int bpf_core_add_cands(struct core_cand *local_cand, +static int bpf_core_add_cands(struct bpf_core_cand *local_cand, size_t local_essent_len, const struct btf *targ_btf, const char *targ_btf_name, int targ_start_id, - struct core_cand_list *cands) + struct bpf_core_cand_list *cands) { - struct core_cand *new_cands, *cand; + struct bpf_core_cand *new_cands, *cand; const struct btf_type *t; const char *targ_name; size_t targ_essent_len; @@ -5139,11 +4906,11 @@ err_out: return 0; } -static struct core_cand_list * +static struct bpf_core_cand_list * bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) { - struct core_cand local_cand = {}; - struct core_cand_list *cands; + struct bpf_core_cand local_cand = {}; + struct bpf_core_cand_list *cands; const struct btf *main_btf; size_t local_essent_len; int err, i; @@ -5197,165 +4964,6 @@ err_out: return ERR_PTR(err); } -/* Check two types for compatibility for the purpose of field access - * relocation. const/volatile/restrict and typedefs are skipped to ensure we - * are relocating semantically compatible entities: - * - any two STRUCTs/UNIONs are compatible and can be mixed; - * - any two FWDs are compatible, if their names match (modulo flavor suffix); - * - any two PTRs are always compatible; - * - for ENUMs, names should be the same (ignoring flavor suffix) or at - * least one of enums should be anonymous; - * - for ENUMs, check sizes, names are ignored; - * - for INT, size and signedness are ignored; - * - any two FLOATs are always compatible; - * - for ARRAY, dimensionality is ignored, element types are checked for - * compatibility recursively; - * - everything else shouldn't be ever a target of relocation. - * These rules are not set in stone and probably will be adjusted as we get - * more experience with using BPF CO-RE relocations. 
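A few illustrative consequences of these rules, as a comment-style sketch (not part of the patch):

	/* Illustrative local-vs-target pairs under these rules:
	 *   __u32 cnt     vs  __u64 cnt     -> compatible (INT size ignored)
	 *   struct stats  vs  union stats   -> compatible (both composite)
	 *   int tbl[4]    vs  long tbl[16]  -> compatible iff int ~ long
	 *                                      (dimensionality is ignored)
	 */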
- */ -static int bpf_core_fields_are_compat(const struct btf *local_btf, - __u32 local_id, - const struct btf *targ_btf, - __u32 targ_id) -{ - const struct btf_type *local_type, *targ_type; - -recur: - local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); - targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (btf_is_composite(local_type) && btf_is_composite(targ_type)) - return 1; - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_PTR: - case BTF_KIND_FLOAT: - return 1; - case BTF_KIND_FWD: - case BTF_KIND_ENUM: { - const char *local_name, *targ_name; - size_t local_len, targ_len; - - local_name = btf__name_by_offset(local_btf, - local_type->name_off); - targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); - local_len = bpf_core_essential_name_len(local_name); - targ_len = bpf_core_essential_name_len(targ_name); - /* one of them is anonymous or both w/ same flavor-less names */ - return local_len == 0 || targ_len == 0 || - (local_len == targ_len && - strncmp(local_name, targ_name, local_len) == 0); - } - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && - btf_int_offset(targ_type) == 0; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - default: - pr_warn("unexpected kind %d relocated, local [%d], target [%d]\n", - btf_kind(local_type), local_id, targ_id); - return 0; - } -} - -/* - * Given single high-level named field accessor in local type, find - * corresponding high-level accessor for a target type. Along the way, - * maintain low-level spec for target as well. Also keep updating target - * bit offset. - * - * Searching is performed through recursive exhaustive enumeration of all - * fields of a struct/union. If there are any anonymous (embedded) - * structs/unions, they are recursively searched as well. If field with - * desired name is found, check compatibility between local and target types, - * before returning result. - * - * 1 is returned, if field is found. - * 0 is returned if no compatible field is found. - * <0 is returned on error. 
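Sketch of the recursion through an embedded anonymous aggregate (hypothetical types):

	struct flow_keys {
		struct {
			__be32 saddr;	/* matching 'saddr' first descends */
			__be32 daddr;	/* into the anonymous member       */
		};
	};

The anonymous member's index and bit offset are pushed onto the target spec speculatively, kept only if the named field is found and bpf_core_fields_are_compat() agrees, and rolled back otherwise.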
- */ -static int bpf_core_match_member(const struct btf *local_btf, - const struct bpf_core_accessor *local_acc, - const struct btf *targ_btf, - __u32 targ_id, - struct bpf_core_spec *spec, - __u32 *next_targ_id) -{ - const struct btf_type *local_type, *targ_type; - const struct btf_member *local_member, *m; - const char *local_name, *targ_name; - __u32 local_id; - int i, n, found; - - targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); - if (!targ_type) - return -EINVAL; - if (!btf_is_composite(targ_type)) - return 0; - - local_id = local_acc->type_id; - local_type = btf__type_by_id(local_btf, local_id); - local_member = btf_members(local_type) + local_acc->idx; - local_name = btf__name_by_offset(local_btf, local_member->name_off); - - n = btf_vlen(targ_type); - m = btf_members(targ_type); - for (i = 0; i < n; i++, m++) { - __u32 bit_offset; - - bit_offset = btf_member_bit_offset(targ_type, i); - - /* too deep struct/union/array nesting */ - if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) - return -E2BIG; - - /* speculate this member will be the good one */ - spec->bit_offset += bit_offset; - spec->raw_spec[spec->raw_len++] = i; - - targ_name = btf__name_by_offset(targ_btf, m->name_off); - if (str_is_empty(targ_name)) { - /* embedded struct/union, we need to go deeper */ - found = bpf_core_match_member(local_btf, local_acc, - targ_btf, m->type, - spec, next_targ_id); - if (found) /* either found or error */ - return found; - } else if (strcmp(local_name, targ_name) == 0) { - /* matching named field */ - struct bpf_core_accessor *targ_acc; - - targ_acc = &spec->spec[spec->len++]; - targ_acc->type_id = targ_id; - targ_acc->idx = i; - targ_acc->name = targ_name; - - *next_targ_id = m->type; - found = bpf_core_fields_are_compat(local_btf, - local_member->type, - targ_btf, m->type); - if (!found) - spec->len--; /* pop accessor */ - return found; - } - /* member turned out not to be what we looked for */ - spec->bit_offset -= bit_offset; - spec->raw_len--; - } - - return 0; -} - /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already @@ -5375,8 +4983,8 @@ static int bpf_core_match_member(const struct btf *local_btf, * These rules are not set in stone and probably will be adjusted as we get * more experience with using BPF CO-RE relocations. */ -static int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id) +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) { const struct btf_type *local_type, *targ_type; int depth = 32; /* max recursion depth */ @@ -5450,671 +5058,6 @@ recur: } } -/* - * Try to match local spec to a target type and, if successful, produce full - * target spec (high-level, low-level + bit offset). 
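An illustrative outcome (hypothetical member layout): if the local raw spec for &s->a[3] is [0, 1, 2, 3], as in the parsing example earlier, and the target's anonymous struct holds 'a' at member index 4 instead of 2, the matched target raw spec becomes [0, 1, 4, 3] and the bit offset is recomputed from target BTF sizes, while the high-level spec still resolves to field 'a', element 3.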
- */ -static int bpf_core_spec_match(struct bpf_core_spec *local_spec, - const struct btf *targ_btf, __u32 targ_id, - struct bpf_core_spec *targ_spec) -{ - const struct btf_type *targ_type; - const struct bpf_core_accessor *local_acc; - struct bpf_core_accessor *targ_acc; - int i, sz, matched; - - memset(targ_spec, 0, sizeof(*targ_spec)); - targ_spec->btf = targ_btf; - targ_spec->root_type_id = targ_id; - targ_spec->relo_kind = local_spec->relo_kind; - - if (core_relo_is_type_based(local_spec->relo_kind)) { - return bpf_core_types_are_compat(local_spec->btf, - local_spec->root_type_id, - targ_btf, targ_id); - } - - local_acc = &local_spec->spec[0]; - targ_acc = &targ_spec->spec[0]; - - if (core_relo_is_enumval_based(local_spec->relo_kind)) { - size_t local_essent_len, targ_essent_len; - const struct btf_enum *e; - const char *targ_name; - - /* has to resolve to an enum */ - targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); - if (!btf_is_enum(targ_type)) - return 0; - - local_essent_len = bpf_core_essential_name_len(local_acc->name); - - for (i = 0, e = btf_enum(targ_type); i < btf_vlen(targ_type); i++, e++) { - targ_name = btf__name_by_offset(targ_spec->btf, e->name_off); - targ_essent_len = bpf_core_essential_name_len(targ_name); - if (targ_essent_len != local_essent_len) - continue; - if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { - targ_acc->type_id = targ_id; - targ_acc->idx = i; - targ_acc->name = targ_name; - targ_spec->len++; - targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; - targ_spec->raw_len++; - return 1; - } - } - return 0; - } - - if (!core_relo_is_field_based(local_spec->relo_kind)) - return -EINVAL; - - for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { - targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, - &targ_id); - if (!targ_type) - return -EINVAL; - - if (local_acc->name) { - matched = bpf_core_match_member(local_spec->btf, - local_acc, - targ_btf, targ_id, - targ_spec, &targ_id); - if (matched <= 0) - return matched; - } else { - /* for i=0, targ_id is already treated as array element - * type (because it's the original struct), for others - * we should find array element type first - */ - if (i > 0) { - const struct btf_array *a; - bool flex; - - if (!btf_is_array(targ_type)) - return 0; - - a = btf_array(targ_type); - flex = is_flex_arr(targ_btf, targ_acc - 1, a); - if (!flex && local_acc->idx >= a->nelems) - return 0; - if (!skip_mods_and_typedefs(targ_btf, a->type, - &targ_id)) - return -EINVAL; - } - - /* too deep struct/union/array nesting */ - if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) - return -E2BIG; - - targ_acc->type_id = targ_id; - targ_acc->idx = local_acc->idx; - targ_acc->name = NULL; - targ_spec->len++; - targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; - targ_spec->raw_len++; - - sz = btf__resolve_size(targ_btf, targ_id); - if (sz < 0) - return sz; - targ_spec->bit_offset += local_acc->idx * sz * 8; - } - } - - return 1; -} - -static int bpf_core_calc_field_relo(const struct bpf_program *prog, - const struct bpf_core_relo *relo, - const struct bpf_core_spec *spec, - __u32 *val, __u32 *field_sz, __u32 *type_id, - bool *validate) -{ - const struct bpf_core_accessor *acc; - const struct btf_type *t; - __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; - const struct btf_member *m; - const struct btf_type *mt; - bool bitfield; - __s64 sz; - - *field_sz = 0; - - if (relo->kind == BPF_FIELD_EXISTS) { - *val = spec ? 
1 : 0; - return 0; - } - - if (!spec) - return -EUCLEAN; /* request instruction poisoning */ - - acc = &spec->spec[spec->len - 1]; - t = btf__type_by_id(spec->btf, acc->type_id); - - /* a[n] accessor needs special handling */ - if (!acc->name) { - if (relo->kind == BPF_FIELD_BYTE_OFFSET) { - *val = spec->bit_offset / 8; - /* remember field size for load/store mem size */ - sz = btf__resolve_size(spec->btf, acc->type_id); - if (sz < 0) - return -EINVAL; - *field_sz = sz; - *type_id = acc->type_id; - } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { - sz = btf__resolve_size(spec->btf, acc->type_id); - if (sz < 0) - return -EINVAL; - *val = sz; - } else { - pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", - prog->name, relo->kind, relo->insn_off / 8); - return -EINVAL; - } - if (validate) - *validate = true; - return 0; - } - - m = btf_members(t) + acc->idx; - mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); - bit_off = spec->bit_offset; - bit_sz = btf_member_bitfield_size(t, acc->idx); - - bitfield = bit_sz > 0; - if (bitfield) { - byte_sz = mt->size; - byte_off = bit_off / 8 / byte_sz * byte_sz; - /* figure out smallest int size necessary for bitfield load */ - while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { - if (byte_sz >= 8) { - /* bitfield can't be read with 64-bit read */ - pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", - prog->name, relo->kind, relo->insn_off / 8); - return -E2BIG; - } - byte_sz *= 2; - byte_off = bit_off / 8 / byte_sz * byte_sz; - } - } else { - sz = btf__resolve_size(spec->btf, field_type_id); - if (sz < 0) - return -EINVAL; - byte_sz = sz; - byte_off = spec->bit_offset / 8; - bit_sz = byte_sz * 8; - } - - /* for bitfields, all the relocatable aspects are ambiguous and we - * might disagree with compiler, so turn off validation of expected - * value, except for signedness - */ - if (validate) - *validate = !bitfield; - - switch (relo->kind) { - case BPF_FIELD_BYTE_OFFSET: - *val = byte_off; - if (!bitfield) { - *field_sz = byte_sz; - *type_id = field_type_id; - } - break; - case BPF_FIELD_BYTE_SIZE: - *val = byte_sz; - break; - case BPF_FIELD_SIGNED: - /* enums will be assumed unsigned */ - *val = btf_is_enum(mt) || - (btf_int_encoding(mt) & BTF_INT_SIGNED); - if (validate) - *validate = true; /* signedness is never ambiguous */ - break; - case BPF_FIELD_LSHIFT_U64: -#if __BYTE_ORDER == __LITTLE_ENDIAN - *val = 64 - (bit_off + bit_sz - byte_off * 8); -#else - *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); -#endif - break; - case BPF_FIELD_RSHIFT_U64: - *val = 64 - bit_sz; - if (validate) - *validate = true; /* right shift is never ambiguous */ - break; - case BPF_FIELD_EXISTS: - default: - return -EOPNOTSUPP; - } - - return 0; -} - -static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, - const struct bpf_core_spec *spec, - __u32 *val) -{ - __s64 sz; - - /* type-based relos return zero when target type is not found */ - if (!spec) { - *val = 0; - return 0; - } - - switch (relo->kind) { - case BPF_TYPE_ID_TARGET: - *val = spec->root_type_id; - break; - case BPF_TYPE_EXISTS: - *val = 1; - break; - case BPF_TYPE_SIZE: - sz = btf__resolve_size(spec->btf, spec->root_type_id); - if (sz < 0) - return -EINVAL; - *val = sz; - break; - case BPF_TYPE_ID_LOCAL: - /* BPF_TYPE_ID_LOCAL is handled specially and shouldn't get here */ - default: - return -EOPNOTSUPP; - } - - return 0; -} - -static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, - const struct 
bpf_core_spec *spec, - __u32 *val) -{ - const struct btf_type *t; - const struct btf_enum *e; - - switch (relo->kind) { - case BPF_ENUMVAL_EXISTS: - *val = spec ? 1 : 0; - break; - case BPF_ENUMVAL_VALUE: - if (!spec) - return -EUCLEAN; /* request instruction poisoning */ - t = btf__type_by_id(spec->btf, spec->spec[0].type_id); - e = btf_enum(t) + spec->spec[0].idx; - *val = e->val; - break; - default: - return -EOPNOTSUPP; - } - - return 0; -} - -struct bpf_core_relo_res -{ - /* expected value in the instruction, unless validate == false */ - __u32 orig_val; - /* new value that needs to be patched up to */ - __u32 new_val; - /* relocation unsuccessful, poison instruction, but don't fail load */ - bool poison; - /* some relocations can't be validated against orig_val */ - bool validate; - /* for field byte offset relocations or the forms: - * *(T *)(rX + <off>) = rY - * rX = *(T *)(rY + <off>), - * we remember original and resolved field size to adjust direct - * memory loads of pointers and integers; this is necessary for 32-bit - * host kernel architectures, but also allows to automatically - * relocate fields that were resized from, e.g., u32 to u64, etc. - */ - bool fail_memsz_adjust; - __u32 orig_sz; - __u32 orig_type_id; - __u32 new_sz; - __u32 new_type_id; -}; - -/* Calculate original and target relocation values, given local and target - * specs and relocation kind. These values are calculated for each candidate. - * If there are multiple candidates, resulting values should all be consistent - * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. - * If instruction has to be poisoned, *poison will be set to true. - */ -static int bpf_core_calc_relo(const struct bpf_program *prog, - const struct bpf_core_relo *relo, - int relo_idx, - const struct bpf_core_spec *local_spec, - const struct bpf_core_spec *targ_spec, - struct bpf_core_relo_res *res) -{ - int err = -EOPNOTSUPP; - - res->orig_val = 0; - res->new_val = 0; - res->poison = false; - res->validate = true; - res->fail_memsz_adjust = false; - res->orig_sz = res->new_sz = 0; - res->orig_type_id = res->new_type_id = 0; - - if (core_relo_is_field_based(relo->kind)) { - err = bpf_core_calc_field_relo(prog, relo, local_spec, - &res->orig_val, &res->orig_sz, - &res->orig_type_id, &res->validate); - err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, - &res->new_val, &res->new_sz, - &res->new_type_id, NULL); - if (err) - goto done; - /* Validate if it's safe to adjust load/store memory size. - * Adjustments are performed only if original and new memory - * sizes differ. - */ - res->fail_memsz_adjust = false; - if (res->orig_sz != res->new_sz) { - const struct btf_type *orig_t, *new_t; - - orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); - new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); - - /* There are two use cases in which it's safe to - * adjust load/store's mem size: - * - reading a 32-bit kernel pointer, while on BPF - * size pointers are always 64-bit; in this case - * it's safe to "downsize" instruction size due to - * pointer being treated as unsigned integer with - * zero-extended upper 32-bits; - * - reading unsigned integers, again due to - * zero-extension is preserving the value correctly. - * - * In all other cases it's incorrect to attempt to - * load/store field because read value will be - * incorrect, so we poison relocated instruction. 
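In BPF program terms, the two safe cases look roughly like this (field names are hypothetical; BPF_CORE_READ() is libbpf's CO-RE read helper):

	/* 64-bit load of a pointer that is 32-bit on the target kernel:
	 * BPF_DW is downsized to BPF_W and the upper bits read as zero.
	 */
	__u64 f = (__u64)BPF_CORE_READ(s, filp);
	/* unsigned integer resized between kernel versions: zero-extension
	 * preserves the value, so the load is re-sized as well.
	 */
	__u64 n = BPF_CORE_READ(s, nr_events);

A signed integer in the same position sets fail_memsz_adjust, since a re-sized load cannot provide the required sign-extension.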
- */
-			if (btf_is_ptr(orig_t) && btf_is_ptr(new_t))
-				goto done;
-			if (btf_is_int(orig_t) && btf_is_int(new_t) &&
-			    btf_int_encoding(orig_t) != BTF_INT_SIGNED &&
-			    btf_int_encoding(new_t) != BTF_INT_SIGNED)
-				goto done;
-
-			/* mark as invalid mem size adjustment, but this will
-			 * only be checked for LDX/STX/ST insns
-			 */
-			res->fail_memsz_adjust = true;
-		}
-	} else if (core_relo_is_type_based(relo->kind)) {
-		err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val);
-		err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val);
-	} else if (core_relo_is_enumval_based(relo->kind)) {
-		err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val);
-		err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val);
-	}
-
-done:
-	if (err == -EUCLEAN) {
-		/* EUCLEAN is used to signal instruction poisoning request */
-		res->poison = true;
-		err = 0;
-	} else if (err == -EOPNOTSUPP) {
-		/* EOPNOTSUPP means unknown/unsupported relocation */
-		pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n",
-			prog->name, relo_idx, core_relo_kind_str(relo->kind),
-			relo->kind, relo->insn_off / 8);
-	}
-
-	return err;
-}
-
-/*
- * Turn instruction for which CO-RE relocation failed into invalid one with
- * distinct signature.
- */
-static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx,
-				 int insn_idx, struct bpf_insn *insn)
-{
-	pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n",
-		 prog->name, relo_idx, insn_idx);
-	insn->code = BPF_JMP | BPF_CALL;
-	insn->dst_reg = 0;
-	insn->src_reg = 0;
-	insn->off = 0;
-	/* if this instruction is reachable (not dead code),
-	 * verifier will complain with the following message:
-	 *   invalid func unknown#195896080
-	 */
-	insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */
-}
-
-static int insn_bpf_size_to_bytes(struct bpf_insn *insn)
-{
-	switch (BPF_SIZE(insn->code)) {
-	case BPF_DW: return 8;
-	case BPF_W: return 4;
-	case BPF_H: return 2;
-	case BPF_B: return 1;
-	default: return -1;
-	}
-}
-
-static int insn_bytes_to_bpf_size(__u32 sz)
-{
-	switch (sz) {
-	case 8: return BPF_DW;
-	case 4: return BPF_W;
-	case 2: return BPF_H;
-	case 1: return BPF_B;
-	default: return -1;
-	}
-}
-
-/*
- * Patch relocatable BPF instruction.
- *
- * Patched value is determined by relocation kind and target specification.
- * For existence relocations target spec will be NULL if field/type is not found.
- * Expected insn->imm value is determined using relocation kind and local
- * spec, and is checked before patching instruction. If actual insn->imm value
- * is wrong, bail out with error.
- *
- * Currently supported classes of BPF instruction are:
- * 1. rX = <imm> (assignment with immediate operand);
- * 2. rX += <imm> (arithmetic operations with immediate operand);
- * 3. rX = <imm64> (load with 64-bit immediate value);
- * 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64};
- * 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64};
- * 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}.
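For instance, a field byte-offset relocation against class 4 rewrites only the load's offset (illustrative values):

	/* before: rX = *(u32 *)(rY + 8), offset computed from local BTF */
	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 8),
	/* after:  rX = *(u32 *)(rY + 16), offset per target kernel BTF  */
	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 16),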
- */ -static int bpf_core_patch_insn(struct bpf_program *prog, - const struct bpf_core_relo *relo, - int relo_idx, - const struct bpf_core_relo_res *res) -{ - __u32 orig_val, new_val; - struct bpf_insn *insn; - int insn_idx; - __u8 class; - - if (relo->insn_off % BPF_INSN_SZ) - return -EINVAL; - insn_idx = relo->insn_off / BPF_INSN_SZ; - /* adjust insn_idx from section frame of reference to the local - * program's frame of reference; (sub-)program code is not yet - * relocated, so it's enough to just subtract in-section offset - */ - insn_idx = insn_idx - prog->sec_insn_off; - insn = &prog->insns[insn_idx]; - class = BPF_CLASS(insn->code); - - if (res->poison) { -poison: - /* poison second part of ldimm64 to avoid confusing error from - * verifier about "unknown opcode 00" - */ - if (is_ldimm64_insn(insn)) - bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1); - bpf_core_poison_insn(prog, relo_idx, insn_idx, insn); - return 0; - } - - orig_val = res->orig_val; - new_val = res->new_val; - - switch (class) { - case BPF_ALU: - case BPF_ALU64: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; - if (res->validate && insn->imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", - prog->name, relo_idx, - insn_idx, insn->imm, orig_val, new_val); - return -EINVAL; - } - orig_val = insn->imm; - insn->imm = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", - prog->name, relo_idx, insn_idx, - orig_val, new_val); - break; - case BPF_LDX: - case BPF_ST: - case BPF_STX: - if (res->validate && insn->off != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", - prog->name, relo_idx, insn_idx, insn->off, orig_val, new_val); - return -EINVAL; - } - if (new_val > SHRT_MAX) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", - prog->name, relo_idx, insn_idx, new_val); - return -ERANGE; - } - if (res->fail_memsz_adjust) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. 
" - "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", - prog->name, relo_idx, insn_idx); - goto poison; - } - - orig_val = insn->off; - insn->off = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", - prog->name, relo_idx, insn_idx, orig_val, new_val); - - if (res->new_sz != res->orig_sz) { - int insn_bytes_sz, insn_bpf_sz; - - insn_bytes_sz = insn_bpf_size_to_bytes(insn); - if (insn_bytes_sz != res->orig_sz) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", - prog->name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); - return -EINVAL; - } - - insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); - if (insn_bpf_sz < 0) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", - prog->name, relo_idx, insn_idx, res->new_sz); - return -EINVAL; - } - - insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); - pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", - prog->name, relo_idx, insn_idx, res->orig_sz, res->new_sz); - } - break; - case BPF_LD: { - __u64 imm; - - if (!is_ldimm64_insn(insn) || - insn[0].src_reg != 0 || insn[0].off != 0 || - insn_idx + 1 >= prog->insns_cnt || - insn[1].code != 0 || insn[1].dst_reg != 0 || - insn[1].src_reg != 0 || insn[1].off != 0) { - pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", - prog->name, relo_idx, insn_idx); - return -EINVAL; - } - - imm = insn[0].imm + ((__u64)insn[1].imm << 32); - if (res->validate && imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", - prog->name, relo_idx, - insn_idx, (unsigned long long)imm, - orig_val, new_val); - return -EINVAL; - } - - insn[0].imm = new_val; - insn[1].imm = 0; /* currently only 32-bit values are supported */ - pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", - prog->name, relo_idx, insn_idx, - (unsigned long long)imm, new_val); - break; - } - default: - pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", - prog->name, relo_idx, insn_idx, insn->code, - insn->src_reg, insn->dst_reg, insn->off, insn->imm); - return -EINVAL; - } - - return 0; -} - -/* Output spec definition in the format: - * [<type-id>] (<type-name>) + <raw-spec> => <offset>@<spec>, - * where <spec> is a C-syntax view of recorded field access, e.g.: x.a[3].b - */ -static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec) -{ - const struct btf_type *t; - const struct btf_enum *e; - const char *s; - __u32 type_id; - int i; - - type_id = spec->root_type_id; - t = btf__type_by_id(spec->btf, type_id); - s = btf__name_by_offset(spec->btf, t->name_off); - - libbpf_print(level, "[%u] %s %s", type_id, btf_kind_str(t), str_is_empty(s) ? 
"<anon>" : s); - - if (core_relo_is_type_based(spec->relo_kind)) - return; - - if (core_relo_is_enumval_based(spec->relo_kind)) { - t = skip_mods_and_typedefs(spec->btf, type_id, NULL); - e = btf_enum(t) + spec->raw_spec[0]; - s = btf__name_by_offset(spec->btf, e->name_off); - - libbpf_print(level, "::%s = %u", s, e->val); - return; - } - - if (core_relo_is_field_based(spec->relo_kind)) { - for (i = 0; i < spec->len; i++) { - if (spec->spec[i].name) - libbpf_print(level, ".%s", spec->spec[i].name); - else if (i > 0 || spec->spec[i].idx > 0) - libbpf_print(level, "[%u]", spec->spec[i].idx); - } - - libbpf_print(level, " ("); - for (i = 0; i < spec->raw_len; i++) - libbpf_print(level, "%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); - - if (spec->bit_offset % 8) - libbpf_print(level, " @ offset %u.%u)", - spec->bit_offset / 8, spec->bit_offset % 8); - else - libbpf_print(level, " @ offset %u)", spec->bit_offset / 8); - return; - } -} - static size_t bpf_core_hash_fn(const void *key, void *ctx) { return (size_t)key; @@ -6130,73 +5073,33 @@ static void *u32_as_hash_key(__u32 x) return (void *)(uintptr_t)x; } -/* - * CO-RE relocate single instruction. - * - * The outline and important points of the algorithm: - * 1. For given local type, find corresponding candidate target types. - * Candidate type is a type with the same "essential" name, ignoring - * everything after last triple underscore (___). E.g., `sample`, - * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates - * for each other. Names with triple underscore are referred to as - * "flavors" and are useful, among other things, to allow to - * specify/support incompatible variations of the same kernel struct, which - * might differ between different kernel versions and/or build - * configurations. - * - * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C - * converter, when deduplicated BTF of a kernel still contains more than - * one different types with the same name. In that case, ___2, ___3, etc - * are appended starting from second name conflict. But start flavors are - * also useful to be defined "locally", in BPF program, to extract same - * data from incompatible changes between different kernel - * versions/configurations. For instance, to handle field renames between - * kernel versions, one can use two flavors of the struct name with the - * same common name and use conditional relocations to extract that field, - * depending on target kernel version. - * 2. For each candidate type, try to match local specification to this - * candidate target type. Matching involves finding corresponding - * high-level spec accessors, meaning that all named fields should match, - * as well as all array accesses should be within the actual bounds. Also, - * types should be compatible (see bpf_core_fields_are_compat for details). - * 3. It is supported and expected that there might be multiple flavors - * matching the spec. As long as all the specs resolve to the same set of - * offsets across all candidates, there is no error. If there is any - * ambiguity, CO-RE relocation will fail. This is necessary to accomodate - * imprefection of BTF deduplication, which can cause slight duplication of - * the same BTF type, if some directly or indirectly referenced (by - * pointer) type gets resolved to different actual types in different - * object files. 
If such situation occurs, deduplicated BTF will end up - * with two (or more) structurally identical types, which differ only in - * types they refer to through pointer. This should be OK in most cases and - * is not an error. - * 4. Candidate types search is performed by linearly scanning through all - * types in target BTF. It is anticipated that this is overall more - * efficient memory-wise and not significantly worse (if not better) - * CPU-wise compared to prebuilding a map from all local type names to - * a list of candidate type names. It's also sped up by caching resolved - * list of matching candidates per each local "root" type ID, that has at - * least one bpf_core_relo associated with it. This list is shared - * between multiple relocations for the same type ID and is updated as some - * of the candidates are pruned due to structural incompatibility. - */ static int bpf_core_apply_relo(struct bpf_program *prog, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, struct hashmap *cand_cache) { - struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; const void *type_key = u32_as_hash_key(relo->type_id); - struct bpf_core_relo_res cand_res, targ_res; + struct bpf_core_cand_list *cands = NULL; + const char *prog_name = prog->name; const struct btf_type *local_type; const char *local_name; - struct core_cand_list *cands = NULL; - __u32 local_id; - const char *spec_str; - int i, j, err; + __u32 local_id = relo->type_id; + struct bpf_insn *insn; + int insn_idx, err; + + if (relo->insn_off % BPF_INSN_SZ) + return -EINVAL; + insn_idx = relo->insn_off / BPF_INSN_SZ; + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; + if (insn_idx > prog->insns_cnt) + return -EINVAL; + insn = &prog->insns[insn_idx]; - local_id = relo->type_id; local_type = btf__type_by_id(local_btf, local_id); if (!local_type) return -EINVAL; @@ -6205,51 +5108,19 @@ static int bpf_core_apply_relo(struct bpf_program *prog, if (!local_name) return -EINVAL; - spec_str = btf__name_by_offset(local_btf, relo->access_str_off); - if (str_is_empty(spec_str)) - return -EINVAL; - if (prog->obj->gen_loader) { - pr_warn("// TODO core_relo: prog %td insn[%d] %s %s kind %d\n", + pr_warn("// TODO core_relo: prog %td insn[%d] %s kind %d\n", prog - prog->obj->programs, relo->insn_off / 8, - local_name, spec_str, relo->kind); + local_name, relo->kind); return -ENOTSUP; } - err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); - if (err) { - pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", - prog->name, relo_idx, local_id, btf_kind_str(local_type), - str_is_empty(local_name) ? 
"<anon>" : local_name, - spec_str, err); - return -EINVAL; - } - pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog->name, - relo_idx, core_relo_kind_str(relo->kind), relo->kind); - bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); - libbpf_print(LIBBPF_DEBUG, "\n"); - - /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ - if (relo->kind == BPF_TYPE_ID_LOCAL) { - targ_res.validate = true; - targ_res.poison = false; - targ_res.orig_val = local_spec.root_type_id; - targ_res.new_val = local_spec.root_type_id; - goto patch_insn; - } - - /* libbpf doesn't support candidate search for anonymous types */ - if (str_is_empty(spec_str)) { - pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", - prog->name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); - return -EOPNOTSUPP; - } - - if (!hashmap__find(cand_cache, type_key, (void **)&cands)) { + if (relo->kind != BPF_TYPE_ID_LOCAL && + !hashmap__find(cand_cache, type_key, (void **)&cands)) { cands = bpf_core_find_cands(prog->obj, local_btf, local_id); if (IS_ERR(cands)) { pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", - prog->name, relo_idx, local_id, btf_kind_str(local_type), + prog_name, relo_idx, local_id, btf_kind_str(local_type), local_name, PTR_ERR(cands)); return PTR_ERR(cands); } @@ -6260,97 +5131,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, } } - for (i = 0, j = 0; i < cands->len; i++) { - err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, - cands->cands[i].id, &cand_spec); - if (err < 0) { - pr_warn("prog '%s': relo #%d: error matching candidate #%d ", - prog->name, relo_idx, i); - bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); - libbpf_print(LIBBPF_WARN, ": %d\n", err); - return err; - } - - pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog->name, - relo_idx, err == 0 ? "non-matching" : "matching", i); - bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); - libbpf_print(LIBBPF_DEBUG, "\n"); - - if (err == 0) - continue; - - err = bpf_core_calc_relo(prog, relo, relo_idx, &local_spec, &cand_spec, &cand_res); - if (err) - return err; - - if (j == 0) { - targ_res = cand_res; - targ_spec = cand_spec; - } else if (cand_spec.bit_offset != targ_spec.bit_offset) { - /* if there are many field relo candidates, they - * should all resolve to the same bit offset - */ - pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", - prog->name, relo_idx, cand_spec.bit_offset, - targ_spec.bit_offset); - return -EINVAL; - } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { - /* all candidates should result in the same relocation - * decision and value, otherwise it's dangerous to - * proceed due to ambiguity - */ - pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", - prog->name, relo_idx, - cand_res.poison ? "failure" : "success", cand_res.new_val, - targ_res.poison ? "failure" : "success", targ_res.new_val); - return -EINVAL; - } - - cands->cands[j++] = cands->cands[i]; - } - - /* - * For BPF_FIELD_EXISTS relo or when used BPF program has field - * existence checks or kernel version/config checks, it's expected - * that we might not find any candidates. In this case, if field - * wasn't found in any candidate, the list of candidates shouldn't - * change at all, we'll just handle relocating appropriately, - * depending on relo's kind. 
- */ - if (j > 0) - cands->len = j; - - /* - * If no candidates were found, it might be both a programmer error, - * as well as expected case, depending whether instruction w/ - * relocation is guarded in some way that makes it unreachable (dead - * code) if relocation can't be resolved. This is handled in - * bpf_core_patch_insn() uniformly by replacing that instruction with - * BPF helper call insn (using invalid helper ID). If that instruction - * is indeed unreachable, then it will be ignored and eliminated by - * verifier. If it was an error, then verifier will complain and point - * to a specific instruction number in its log. - */ - if (j == 0) { - pr_debug("prog '%s': relo #%d: no matching targets found\n", - prog->name, relo_idx); - - /* calculate single target relo result explicitly */ - err = bpf_core_calc_relo(prog, relo, relo_idx, &local_spec, NULL, &targ_res); - if (err) - return err; - } - -patch_insn: - /* bpf_core_patch_insn() should know how to handle missing targ_spec */ - err = bpf_core_patch_insn(prog, relo, relo_idx, &targ_res); - if (err) { - pr_warn("prog '%s': relo #%d: failed to patch insn #%zu: %d\n", - prog->name, relo_idx, relo->insn_off / BPF_INSN_SZ, err); - return -EINVAL; - } - - return 0; + return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, relo_idx, local_btf, cands); } static int @@ -7190,7 +5971,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj) for (i = 0; i < obj->nr_programs; i++) { struct bpf_program *p = &obj->programs[i]; - + if (!p->nr_reloc) continue; @@ -7554,7 +6335,7 @@ static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig; + const char *obj_name, *kconfig, *btf_tmp_path; struct bpf_program *prog; struct bpf_object *obj; char tmp_name[64]; @@ -7585,11 +6366,26 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, if (IS_ERR(obj)) return obj; + btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); + if (btf_tmp_path) { + if (strlen(btf_tmp_path) >= PATH_MAX) { + err = -ENAMETOOLONG; + goto out; + } + obj->btf_custom_path = strdup(btf_tmp_path); + if (!obj->btf_custom_path) { + err = -ENOMEM; + goto out; + } + } + kconfig = OPTS_GET(opts, kconfig, NULL); if (kconfig) { obj->kconfig = strdup(kconfig); - if (!obj->kconfig) - return ERR_PTR(-ENOMEM); + if (!obj->kconfig) { + err = -ENOMEM; + goto out; + } } err = bpf_object__elf_init(obj); @@ -8055,7 +6851,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); err = err ? : bpf_object__create_maps(obj); - err = err ? : bpf_object__relocate(obj, attr->target_btf_path); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : attr->target_btf_path); err = err ? 
: bpf_object__load_progs(obj, attr->log_level); if (obj->gen_loader) { @@ -8450,6 +7246,11 @@ const char *bpf_map__get_pin_path(const struct bpf_map *map) return map->pin_path; } +const char *bpf_map__pin_path(const struct bpf_map *map) +{ + return map->pin_path; +} + bool bpf_map__is_pinned(const struct bpf_map *map) { return map->pinned; @@ -8702,6 +7503,7 @@ void bpf_object__close(struct bpf_object *obj) for (i = 0; i < obj->nr_maps; i++) bpf_map__destroy(&obj->maps[i]); + zfree(&obj->btf_custom_path); zfree(&obj->kconfig); zfree(&obj->externs); obj->nr_extern = 0; @@ -9471,7 +8273,7 @@ static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, ret = snprintf(btf_type_name, sizeof(btf_type_name), "%s%s", prefix, name); /* snprintf returns the number of characters written excluding the - * the terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it + * terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it * indicates truncation. */ if (ret < 0 || ret >= sizeof(btf_type_name)) @@ -9495,7 +8297,7 @@ int libbpf_find_vmlinux_btf_id(const char *name, struct btf *btf; int err; - btf = libbpf_find_kernel_btf(); + btf = btf__load_vmlinux_btf(); err = libbpf_get_error(btf); if (err) { pr_warn("vmlinux BTF is not found\n"); @@ -9514,8 +8316,8 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) { struct bpf_prog_info_linear *info_linear; struct bpf_prog_info *info; - struct btf *btf = NULL; - int err = -EINVAL; + struct btf *btf; + int err; info_linear = bpf_program__get_prog_info_linear(attach_prog_fd, 0); err = libbpf_get_error(info_linear); @@ -9524,12 +8326,15 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) attach_prog_fd); return err; } + + err = -EINVAL; info = &info_linear->info; if (!info->btf_id) { pr_warn("The target program doesn't have BTF\n"); goto out; } - if (btf__get_from_id(info->btf_id, &btf)) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { pr_warn("Failed to get BTF of the program\n"); goto out; } @@ -10013,7 +8818,7 @@ struct bpf_link { int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { int ret; - + ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); return libbpf_err_errno(ret); } @@ -10304,16 +9109,25 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, return pfd; } -struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, - bool retprobe, - const char *func_name) +struct bpf_link * +bpf_program__attach_kprobe_opts(struct bpf_program *prog, + const char *func_name, + struct bpf_kprobe_opts *opts) { char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; + unsigned long offset; + bool retprobe; int pfd, err; + if (!OPTS_VALID(opts, bpf_kprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + retprobe = OPTS_GET(opts, retprobe, false); + offset = OPTS_GET(opts, offset, 0); + pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, - 0 /* offset */, -1 /* pid */); + offset, -1 /* pid */); if (pfd < 0) { pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n", prog->name, retprobe ? 
"kretprobe" : "kprobe", func_name, @@ -10332,16 +9146,47 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, return link; } +struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, + bool retprobe, + const char *func_name) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe, + ); + + return bpf_program__attach_kprobe_opts(prog, func_name, &opts); +} + static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, struct bpf_program *prog) { + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); + unsigned long offset = 0; + struct bpf_link *link; const char *func_name; - bool retprobe; + char *func; + int n, err; func_name = prog->sec_name + sec->len; - retprobe = strcmp(sec->sec, "kretprobe/") == 0; + opts.retprobe = strcmp(sec->sec, "kretprobe/") == 0; - return bpf_program__attach_kprobe(prog, retprobe, func_name); + n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%li", &func, &offset); + if (n < 1) { + err = -EINVAL; + pr_warn("kprobe name is invalid: %s\n", func_name); + return libbpf_err_ptr(err); + } + if (opts.retprobe && offset != 0) { + free(func); + err = -EINVAL; + pr_warn("kretprobes do not support offset specification\n"); + return libbpf_err_ptr(err); + } + + opts.offset = offset; + link = bpf_program__attach_kprobe_opts(prog, func, &opts); + free(func); + return link; } struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6e61342ba56c..1271d99bb7aa 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -94,8 +94,26 @@ struct bpf_object_open_opts { * system Kconfig for CONFIG_xxx externs. */ const char *kconfig; + /* Path to the custom BTF to be used for BPF CO-RE relocations. + * This custom BTF completely replaces the use of vmlinux BTF + * for the purpose of CO-RE relocations. + * NOTE: any other BPF feature (e.g., fentry/fexit programs, + * struct_ops, etc) will need actual kernel BTF at /sys/kernel/btf/vmlinux. 
+ */ + const char *btf_custom_path; }; -#define bpf_object_open_opts__last_field kconfig +#define bpf_object_open_opts__last_field btf_custom_path + +struct bpf_kprobe_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* function's offset to install kprobe to */ + unsigned long offset; + /* kprobe is return probe */ + bool retprobe; + size_t :0; +}; +#define bpf_kprobe_opts__last_field retprobe LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * @@ -243,6 +261,10 @@ LIBBPF_API struct bpf_link * bpf_program__attach_kprobe(struct bpf_program *prog, bool retprobe, const char *func_name); LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe_opts(struct bpf_program *prog, + const char *func_name, + struct bpf_kprobe_opts *opts); +LIBBPF_API struct bpf_link * bpf_program__attach_uprobe(struct bpf_program *prog, bool retprobe, pid_t pid, const char *binary_path, size_t func_offset); @@ -477,6 +499,7 @@ LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path); LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map); +LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map); LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map); LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 944c99d1ded3..58e0fb2c482f 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -371,7 +371,15 @@ LIBBPF_0.4.0 { LIBBPF_0.5.0 { global: bpf_map__initial_value; + bpf_map__pin_path; bpf_map_lookup_and_delete_elem_flags; + bpf_program__attach_kprobe_opts; bpf_object__gen_loader; + btf__load_from_kernel_by_id; + btf__load_from_kernel_by_id_split; + btf__load_into_kernel; + btf__load_module_btf; + btf__load_vmlinux_btf; + btf_dump__dump_type_data; libbpf_set_strict_mode; } LIBBPF_0.4.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 016ca7cb4f8a..f7b691d5f9eb 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -14,6 +14,7 @@ #include <errno.h> #include <linux/err.h> #include "libbpf_legacy.h" +#include "relo_core.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 @@ -366,76 +367,6 @@ struct bpf_line_info_min { __u32 line_col; }; -/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value - * has to be adjusted by relocations.
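A usage sketch for the bpf_kprobe_opts API declared above (the traced function and the offset are arbitrary examples, not part of this patch):

    DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, kopts,
            .offset = 0x10,   /* example offset into the function */
    );
    struct bpf_link *link;

    link = bpf_program__attach_kprobe_opts(prog, "tcp_sendmsg", &kopts);
    if (libbpf_get_error(link))
            /* handle attach error */;

The attach_kprobe() section parser shown earlier accepts the same thing declaratively, e.g. SEC("kprobe/tcp_sendmsg+0x10").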
- */ -enum bpf_core_relo_kind { - BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ - BPF_FIELD_BYTE_SIZE = 1, /* field size in bytes */ - BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ - BPF_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ - BPF_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ - BPF_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ - BPF_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ - BPF_TYPE_ID_TARGET = 7, /* type ID in target kernel */ - BPF_TYPE_EXISTS = 8, /* type existence in target kernel */ - BPF_TYPE_SIZE = 9, /* type size in bytes */ - BPF_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ - BPF_ENUMVAL_VALUE = 11, /* enum value integer value */ -}; - -/* The minimum bpf_core_relo checked by the loader - * - * CO-RE relocation captures the following data: - * - insn_off - instruction offset (in bytes) within a BPF program that needs - * its insn->imm field to be relocated with actual field info; - * - type_id - BTF type ID of the "root" (containing) entity of a relocatable - * type or field; - * - access_str_off - offset into corresponding .BTF string section. String - * interpretation depends on specific relocation kind: - * - for field-based relocations, string encodes an accessed field using - * a sequence of field and array indices, separated by colon (:). It's - * conceptually very close to LLVM's getelementptr ([0]) instruction's - * arguments for identifying offset to a field. - * - for type-based relocations, strings is expected to be just "0"; - * - for enum value-based relocations, string contains an index of enum - * value within its enum type; - * - * Example to provide a better feel. - * - * struct sample { - * int a; - * struct { - * int b[10]; - * }; - * }; - * - * struct sample *s = ...; - * int x = &s->a; // encoded as "0:0" (a is field #0) - * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, - * // b is field #0 inside anon struct, accessing elem #5) - * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) - * - * type_id for all relocs in this example will capture BTF type id of - * `struct sample`. - * - * Such relocation is emitted when using __builtin_preserve_access_index() - * Clang built-in, passing expression that captures field address, e.g.: - * - * bpf_probe_read(&dst, sizeof(dst), - * __builtin_preserve_access_index(&src->a.b.c)); - * - * In this case Clang will emit field relocation recording necessary data to - * be able to find offset of embedded `a.b.c` field within `src` struct. 
- * - * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction - */ -struct bpf_core_relo { - __u32 insn_off; - __u32 type_id; - __u32 access_str_off; - enum bpf_core_relo_kind kind; -}; typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); @@ -494,4 +425,14 @@ static inline void *libbpf_ptr(void *ret) return ret; } +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c new file mode 100644 index 000000000000..4016ed492d0c --- /dev/null +++ b/tools/lib/bpf/relo_core.c @@ -0,0 +1,1295 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Facebook */ + +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <ctype.h> +#include <linux/err.h> + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" + +#define BPF_CORE_SPEC_MAX_LEN 64 + +/* represents BPF CO-RE field or array element accessor */ +struct bpf_core_accessor { + __u32 type_id; /* struct/union type or array element type */ + __u32 idx; /* field index or array index */ + const char *name; /* field name or NULL for array accessor */ +}; + +struct bpf_core_spec { + const struct btf *btf; + /* high-level spec: named fields and array indices only */ + struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; + /* original unresolved (no skip_mods_or_typedefs) root type ID */ + __u32 root_type_id; + /* CO-RE relocation kind */ + enum bpf_core_relo_kind relo_kind; + /* high-level spec length */ + int len; + /* raw, low-level spec: 1-to-1 with accessor spec string */ + int raw_spec[BPF_CORE_SPEC_MAX_LEN]; + /* raw spec length */ + int raw_len; + /* field bit offset represented by spec */ + __u32 bit_offset; +}; + +static bool is_flex_arr(const struct btf *btf, + const struct bpf_core_accessor *acc, + const struct btf_array *arr) +{ + const struct btf_type *t; + + /* not a flexible array, if not inside a struct or has non-zero size */ + if (!acc->name || arr->nelems > 0) + return false; + + /* has to be the last member of enclosing struct */ + t = btf__type_by_id(btf, acc->type_id); + return acc->idx == btf_vlen(t) - 1; +} + +static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_FIELD_BYTE_OFFSET: return "byte_off"; + case BPF_FIELD_BYTE_SIZE: return "byte_sz"; + case BPF_FIELD_EXISTS: return "field_exists"; + case BPF_FIELD_SIGNED: return "signed"; + case BPF_FIELD_LSHIFT_U64: return "lshift_u64"; + case BPF_FIELD_RSHIFT_U64: return "rshift_u64"; + case BPF_TYPE_ID_LOCAL: return "local_type_id"; + case BPF_TYPE_ID_TARGET: return "target_type_id"; + case BPF_TYPE_EXISTS: return "type_exists"; + case BPF_TYPE_SIZE: return "type_size"; + case BPF_ENUMVAL_EXISTS: return "enumval_exists"; + case BPF_ENUMVAL_VALUE: return "enumval_value"; + default: return "unknown"; + } +} + +static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_FIELD_BYTE_OFFSET: + case BPF_FIELD_BYTE_SIZE: + case BPF_FIELD_EXISTS: + case BPF_FIELD_SIGNED: + case BPF_FIELD_LSHIFT_U64: + case BPF_FIELD_RSHIFT_U64: + return true; + default: + return false; + } +} + +static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_TYPE_ID_LOCAL: + 
case BPF_TYPE_ID_TARGET: + case BPF_TYPE_EXISTS: + case BPF_TYPE_SIZE: + return true; + default: + return false; + } +} + +static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_ENUMVAL_EXISTS: + case BPF_ENUMVAL_VALUE: + return true; + default: + return false; + } +} + +/* + * Turn bpf_core_relo into a low- and high-level spec representation, + * validating correctness along the way, as well as calculating resulting + * field bit offset, specified by accessor string. Low-level spec captures + * every single level of nestedness, including traversing anonymous + * struct/union members. High-level one only captures semantically meaningful + * "turning points": named fields and array indices. + * E.g., for this case: + * + * struct sample { + * int __unimportant; + * struct { + * int __1; + * int __2; + * int a[7]; + * }; + * }; + * + * struct sample *s = ...; + * + * int x = &s->a[3]; // access string = '0:1:2:3' + * + * Low-level spec has 1:1 mapping with each element of access string (it's + * just a parsed access string representation): [0, 1, 2, 3]. + * + * High-level spec will capture only 3 points: + * - initial zero-index access by pointer (&s->... is the same as &s[0]...); + * - field 'a' access (corresponds to '2' in low-level spec); + * - array element #3 access (corresponds to '3' in low-level spec). + * + * Type-based relocations (TYPE_EXISTS/TYPE_SIZE, + * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their + * spec and raw_spec are kept empty. + * + * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use the access + * string to specify the enumerator's value index that needs to be relocated. + */ +static int bpf_core_parse_spec(const struct btf *btf, + __u32 type_id, + const char *spec_str, + enum bpf_core_relo_kind relo_kind, + struct bpf_core_spec *spec) +{ + int access_idx, parsed_len, i; + struct bpf_core_accessor *acc; + const struct btf_type *t; + const char *name; + __u32 id; + __s64 sz; + + if (str_is_empty(spec_str) || *spec_str == ':') + return -EINVAL; + + memset(spec, 0, sizeof(*spec)); + spec->btf = btf; + spec->root_type_id = type_id; + spec->relo_kind = relo_kind; + + /* type-based relocations don't have a field access string */ + if (core_relo_is_type_based(relo_kind)) { + if (strcmp(spec_str, "0")) + return -EINVAL; + return 0; + } + + /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ + while (*spec_str) { + if (*spec_str == ':') + ++spec_str; + if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) + return -EINVAL; + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + spec_str += parsed_len; + spec->raw_spec[spec->raw_len++] = access_idx; + } + + if (spec->raw_len == 0) + return -EINVAL; + + t = skip_mods_and_typedefs(btf, type_id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[0]; + acc = &spec->spec[0]; + acc->type_id = id; + acc->idx = access_idx; + spec->len++; + + if (core_relo_is_enumval_based(relo_kind)) { + if (!btf_is_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) + return -EINVAL; + + /* record enumerator name in a first accessor */ + acc->name = btf__name_by_offset(btf, btf_enum(t)[access_idx].name_off); + return 0; + } + + if (!core_relo_is_field_based(relo_kind)) + return -EINVAL; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset = access_idx * sz * 8; + + for (i = 1; i < spec->raw_len; i++) { + t = skip_mods_and_typedefs(btf, id, &id); + if (!t) + return -EINVAL; + + access_idx =
spec->raw_spec[i]; + acc = &spec->spec[spec->len]; + + if (btf_is_composite(t)) { + const struct btf_member *m; + __u32 bit_offset; + + if (access_idx >= btf_vlen(t)) + return -EINVAL; + + bit_offset = btf_member_bit_offset(t, access_idx); + spec->bit_offset += bit_offset; + + m = btf_members(t) + access_idx; + if (m->name_off) { + name = btf__name_by_offset(btf, m->name_off); + if (str_is_empty(name)) + return -EINVAL; + + acc->type_id = id; + acc->idx = access_idx; + acc->name = name; + spec->len++; + } + + id = m->type; + } else if (btf_is_array(t)) { + const struct btf_array *a = btf_array(t); + bool flex; + + t = skip_mods_and_typedefs(btf, a->type, &id); + if (!t) + return -EINVAL; + + flex = is_flex_arr(btf, acc - 1, a); + if (!flex && access_idx >= a->nelems) + return -EINVAL; + + spec->spec[spec->len].type_id = id; + spec->spec[spec->len].idx = access_idx; + spec->len++; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset += access_idx * sz * 8; + } else { + pr_warn("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", + type_id, spec_str, i, id, btf_kind_str(t)); + return -EINVAL; + } + } + + return 0; +} + +/* Check two types for compatibility for the purpose of field access + * relocation. const/volatile/restrict and typedefs are skipped to ensure we + * are relocating semantically compatible entities: + * - any two STRUCTs/UNIONs are compatible and can be mixed; + * - any two FWDs are compatible, if their names match (modulo flavor suffix); + * - any two PTRs are always compatible; + * - for ENUMs, names should be the same (ignoring flavor suffix) or at + * least one of enums should be anonymous; + * - for ENUMs, check sizes, names are ignored; + * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - everything else shouldn't be ever a target of relocation. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. 
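To make the compatibility rules above concrete, a small illustration (types invented for this example): the cnt fields match because INT size and signedness are ignored, and the next fields match because any two PTRs are compatible.

    /* local (BPF-side) definition */
    struct node___local {
            unsigned int cnt;           /* INT: size/signedness ignored */
            struct node___local *next;  /* PTR: always compatible */
    };

    /* target (kernel-side) definition */
    struct node {
            long cnt;
            struct node *next;
    };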
+ */ +static int bpf_core_fields_are_compat(const struct btf *local_btf, + __u32 local_id, + const struct btf *targ_btf, + __u32 targ_id) +{ + const struct btf_type *local_type, *targ_type; + +recur: + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_is_composite(local_type) && btf_is_composite(targ_type)) + return 1; + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_PTR: + case BTF_KIND_FLOAT: + return 1; + case BTF_KIND_FWD: + case BTF_KIND_ENUM: { + const char *local_name, *targ_name; + size_t local_len, targ_len; + + local_name = btf__name_by_offset(local_btf, + local_type->name_off); + targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); + local_len = bpf_core_essential_name_len(local_name); + targ_len = bpf_core_essential_name_len(targ_name); + /* one of them is anonymous or both w/ same flavor-less names */ + return local_len == 0 || targ_len == 0 || + (local_len == targ_len && + strncmp(local_name, targ_name, local_len) == 0); + } + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && + btf_int_offset(targ_type) == 0; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + default: + pr_warn("unexpected kind %d relocated, local [%d], target [%d]\n", + btf_kind(local_type), local_id, targ_id); + return 0; + } +} + +/* + * Given single high-level named field accessor in local type, find + * corresponding high-level accessor for a target type. Along the way, + * maintain low-level spec for target as well. Also keep updating target + * bit offset. + * + * Searching is performed through recursive exhaustive enumeration of all + * fields of a struct/union. If there are any anonymous (embedded) + * structs/unions, they are recursively searched as well. If field with + * desired name is found, check compatibility between local and target types, + * before returning result. + * + * 1 is returned, if field is found. + * 0 is returned if no compatible field is found. + * <0 is returned on error. 
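For instance (hypothetical types), a local accessor naming a field a is found by recursing through the target's unnamed union member:

    struct target {
            union {         /* unnamed member: searched recursively */
                    int a;  /* local field 'a' matches here */
            };
            int b;
    };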
+ */ +static int bpf_core_match_member(const struct btf *local_btf, + const struct bpf_core_accessor *local_acc, + const struct btf *targ_btf, + __u32 targ_id, + struct bpf_core_spec *spec, + __u32 *next_targ_id) +{ + const struct btf_type *local_type, *targ_type; + const struct btf_member *local_member, *m; + const char *local_name, *targ_name; + __u32 local_id; + int i, n, found; + + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!targ_type) + return -EINVAL; + if (!btf_is_composite(targ_type)) + return 0; + + local_id = local_acc->type_id; + local_type = btf__type_by_id(local_btf, local_id); + local_member = btf_members(local_type) + local_acc->idx; + local_name = btf__name_by_offset(local_btf, local_member->name_off); + + n = btf_vlen(targ_type); + m = btf_members(targ_type); + for (i = 0; i < n; i++, m++) { + __u32 bit_offset; + + bit_offset = btf_member_bit_offset(targ_type, i); + + /* too deep struct/union/array nesting */ + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + /* speculate this member will be the good one */ + spec->bit_offset += bit_offset; + spec->raw_spec[spec->raw_len++] = i; + + targ_name = btf__name_by_offset(targ_btf, m->name_off); + if (str_is_empty(targ_name)) { + /* embedded struct/union, we need to go deeper */ + found = bpf_core_match_member(local_btf, local_acc, + targ_btf, m->type, + spec, next_targ_id); + if (found) /* either found or error */ + return found; + } else if (strcmp(local_name, targ_name) == 0) { + /* matching named field */ + struct bpf_core_accessor *targ_acc; + + targ_acc = &spec->spec[spec->len++]; + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + + *next_targ_id = m->type; + found = bpf_core_fields_are_compat(local_btf, + local_member->type, + targ_btf, m->type); + if (!found) + spec->len--; /* pop accessor */ + return found; + } + /* member turned out not to be what we looked for */ + spec->bit_offset -= bit_offset; + spec->raw_len--; + } + + return 0; +} + +/* + * Try to match local spec to a target type and, if successful, produce full + * target spec (high-level, low-level + bit offset). 
+ */ +static int bpf_core_spec_match(struct bpf_core_spec *local_spec, + const struct btf *targ_btf, __u32 targ_id, + struct bpf_core_spec *targ_spec) +{ + const struct btf_type *targ_type; + const struct bpf_core_accessor *local_acc; + struct bpf_core_accessor *targ_acc; + int i, sz, matched; + + memset(targ_spec, 0, sizeof(*targ_spec)); + targ_spec->btf = targ_btf; + targ_spec->root_type_id = targ_id; + targ_spec->relo_kind = local_spec->relo_kind; + + if (core_relo_is_type_based(local_spec->relo_kind)) { + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + } + + local_acc = &local_spec->spec[0]; + targ_acc = &targ_spec->spec[0]; + + if (core_relo_is_enumval_based(local_spec->relo_kind)) { + size_t local_essent_len, targ_essent_len; + const struct btf_enum *e; + const char *targ_name; + + /* has to resolve to an enum */ + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); + if (!btf_is_enum(targ_type)) + return 0; + + local_essent_len = bpf_core_essential_name_len(local_acc->name); + + for (i = 0, e = btf_enum(targ_type); i < btf_vlen(targ_type); i++, e++) { + targ_name = btf__name_by_offset(targ_spec->btf, e->name_off); + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + return 1; + } + } + return 0; + } + + if (!core_relo_is_field_based(local_spec->relo_kind)) + return -EINVAL; + + for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, + &targ_id); + if (!targ_type) + return -EINVAL; + + if (local_acc->name) { + matched = bpf_core_match_member(local_spec->btf, + local_acc, + targ_btf, targ_id, + targ_spec, &targ_id); + if (matched <= 0) + return matched; + } else { + /* for i=0, targ_id is already treated as array element + * type (because it's the original struct), for others + * we should find array element type first + */ + if (i > 0) { + const struct btf_array *a; + bool flex; + + if (!btf_is_array(targ_type)) + return 0; + + a = btf_array(targ_type); + flex = is_flex_arr(targ_btf, targ_acc - 1, a); + if (!flex && local_acc->idx >= a->nelems) + return 0; + if (!skip_mods_and_typedefs(targ_btf, a->type, + &targ_id)) + return -EINVAL; + } + + /* too deep struct/union/array nesting */ + if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + targ_acc->type_id = targ_id; + targ_acc->idx = local_acc->idx; + targ_acc->name = NULL; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + + sz = btf__resolve_size(targ_btf, targ_id); + if (sz < 0) + return sz; + targ_spec->bit_offset += local_acc->idx * sz * 8; + } + } + + return 1; +} + +static int bpf_core_calc_field_relo(const char *prog_name, + const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u32 *val, __u32 *field_sz, __u32 *type_id, + bool *validate) +{ + const struct bpf_core_accessor *acc; + const struct btf_type *t; + __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; + const struct btf_member *m; + const struct btf_type *mt; + bool bitfield; + __s64 sz; + + *field_sz = 0; + + if (relo->kind == BPF_FIELD_EXISTS) { + *val = spec ? 
1 : 0; + return 0; + } + + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + + acc = &spec->spec[spec->len - 1]; + t = btf__type_by_id(spec->btf, acc->type_id); + + /* a[n] accessor needs special handling */ + if (!acc->name) { + if (relo->kind == BPF_FIELD_BYTE_OFFSET) { + *val = spec->bit_offset / 8; + /* remember field size for load/store mem size */ + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *field_sz = sz; + *type_id = acc->type_id; + } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + } else { + pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", + prog_name, relo->kind, relo->insn_off / 8); + return -EINVAL; + } + if (validate) + *validate = true; + return 0; + } + + m = btf_members(t) + acc->idx; + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); + bit_off = spec->bit_offset; + bit_sz = btf_member_bitfield_size(t, acc->idx); + + bitfield = bit_sz > 0; + if (bitfield) { + byte_sz = mt->size; + byte_off = bit_off / 8 / byte_sz * byte_sz; + /* figure out smallest int size necessary for bitfield load */ + while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { + if (byte_sz >= 8) { + /* bitfield can't be read with 64-bit read */ + pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", + prog_name, relo->kind, relo->insn_off / 8); + return -E2BIG; + } + byte_sz *= 2; + byte_off = bit_off / 8 / byte_sz * byte_sz; + } + } else { + sz = btf__resolve_size(spec->btf, field_type_id); + if (sz < 0) + return -EINVAL; + byte_sz = sz; + byte_off = spec->bit_offset / 8; + bit_sz = byte_sz * 8; + } + + /* for bitfields, all the relocatable aspects are ambiguous and we + * might disagree with compiler, so turn off validation of expected + * value, except for signedness + */ + if (validate) + *validate = !bitfield; + + switch (relo->kind) { + case BPF_FIELD_BYTE_OFFSET: + *val = byte_off; + if (!bitfield) { + *field_sz = byte_sz; + *type_id = field_type_id; + } + break; + case BPF_FIELD_BYTE_SIZE: + *val = byte_sz; + break; + case BPF_FIELD_SIGNED: + /* enums will be assumed unsigned */ + *val = btf_is_enum(mt) || + (btf_int_encoding(mt) & BTF_INT_SIGNED); + if (validate) + *validate = true; /* signedness is never ambiguous */ + break; + case BPF_FIELD_LSHIFT_U64: +#if __BYTE_ORDER == __LITTLE_ENDIAN + *val = 64 - (bit_off + bit_sz - byte_off * 8); +#else + *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); +#endif + break; + case BPF_FIELD_RSHIFT_U64: + *val = 64 - bit_sz; + if (validate) + *validate = true; /* right shift is never ambiguous */ + break; + case BPF_FIELD_EXISTS: + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u32 *val) +{ + __s64 sz; + + /* type-based relos return zero when target type is not found */ + if (!spec) { + *val = 0; + return 0; + } + + switch (relo->kind) { + case BPF_TYPE_ID_TARGET: + *val = spec->root_type_id; + break; + case BPF_TYPE_EXISTS: + *val = 1; + break; + case BPF_TYPE_SIZE: + sz = btf__resolve_size(spec->btf, spec->root_type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + break; + case BPF_TYPE_ID_LOCAL: + /* BPF_TYPE_ID_LOCAL is handled specially and shouldn't get here */ + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, + const struct 
bpf_core_spec *spec, + __u32 *val) +{ + const struct btf_type *t; + const struct btf_enum *e; + + switch (relo->kind) { + case BPF_ENUMVAL_EXISTS: + *val = spec ? 1 : 0; + break; + case BPF_ENUMVAL_VALUE: + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + t = btf__type_by_id(spec->btf, spec->spec[0].type_id); + e = btf_enum(t) + spec->spec[0].idx; + *val = e->val; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +struct bpf_core_relo_res +{ + /* expected value in the instruction, unless validate == false */ + __u32 orig_val; + /* new value that needs to be patched up to */ + __u32 new_val; + /* relocation unsuccessful, poison instruction, but don't fail load */ + bool poison; + /* some relocations can't be validated against orig_val */ + bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + <off>) = rY + * rX = *(T *)(rY + <off>), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. + */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; +}; + +/* Calculate original and target relocation values, given local and target + * specs and relocation kind. These values are calculated for each candidate. + * If there are multiple candidates, resulting values should all be consistent + * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. + * If instruction has to be poisoned, *poison will be set to true. + */ +static int bpf_core_calc_relo(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct bpf_core_spec *local_spec, + const struct bpf_core_spec *targ_spec, + struct bpf_core_relo_res *res) +{ + int err = -EOPNOTSUPP; + + res->orig_val = 0; + res->new_val = 0; + res->poison = false; + res->validate = true; + res->fail_memsz_adjust = false; + res->orig_sz = res->new_sz = 0; + res->orig_type_id = res->new_type_id = 0; + + if (core_relo_is_field_based(relo->kind)) { + err = bpf_core_calc_field_relo(prog_name, relo, local_spec, + &res->orig_val, &res->orig_sz, + &res->orig_type_id, &res->validate); + err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, + &res->new_val, &res->new_sz, + &res->new_type_id, NULL); + if (err) + goto done; + /* Validate if it's safe to adjust load/store memory size. + * Adjustments are performed only if original and new memory + * sizes differ. + */ + res->fail_memsz_adjust = false; + if (res->orig_sz != res->new_sz) { + const struct btf_type *orig_t, *new_t; + + orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); + + /* There are two use cases in which it's safe to + * adjust load/store's mem size: + * - reading a 32-bit kernel pointer, while on BPF + * size pointers are always 64-bit; in this case + * it's safe to "downsize" instruction size due to + * pointer being treated as unsigned integer with + * zero-extended upper 32-bits; + * - reading unsigned integers, again due to + * zero-extension is preserving the value correctly. + * + * In all other cases it's incorrect to attempt to + * load/store field because read value will be + * incorrect, so we poison relocated instruction. 
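A sketch of the safe adjustment described above, assuming a field that was widened between kernel versions (type and field names are invented):

    /* local BTF, used at compile time */
    struct stats___old { __u32 pkts; };

    /* target (running kernel) BTF */
    struct stats { __u64 pkts; };

    /* rX = *(u32 *)(rY + <off>) is rewritten to rX = *(u64 *)(rY + <off>).
     * This is only done for pointers and unsigned integers, where
     * zero-extension preserves the value; any other combination sets
     * fail_memsz_adjust and the instruction is poisoned instead.
     */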
+ */ + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) + goto done; + if (btf_is_int(orig_t) && btf_is_int(new_t) && + btf_int_encoding(orig_t) != BTF_INT_SIGNED && + btf_int_encoding(new_t) != BTF_INT_SIGNED) + goto done; + + /* mark as invalid mem size adjustment, but this will + * only be checked for LDX/STX/ST insns + */ + res->fail_memsz_adjust = true; + } + } else if (core_relo_is_type_based(relo->kind)) { + err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val); + err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val); + } else if (core_relo_is_enumval_based(relo->kind)) { + err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val); + err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); + } + +done: + if (err == -EUCLEAN) { + /* EUCLEAN is used to signal instruction poisoning request */ + res->poison = true; + err = 0; + } else if (err == -EOPNOTSUPP) { + /* EOPNOTSUPP means unknown/unsupported relocation */ + pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), + relo->kind, relo->insn_off / 8); + } + + return err; +} + +/* + * Turn an instruction for which CO-RE relocation failed into an invalid one + * with a distinct signature. + */ +static void bpf_core_poison_insn(const char *prog_name, int relo_idx, + int insn_idx, struct bpf_insn *insn) +{ + pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", + prog_name, relo_idx, insn_idx); + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not dead code), + * verifier will complain with the following message: + * invalid func unknown#195896080 + */ + insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ +} + +static int insn_bpf_size_to_bytes(struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_DW: return 8; + case BPF_W: return 4; + case BPF_H: return 2; + case BPF_B: return 1; + default: return -1; + } +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* + * Patch relocatable BPF instruction. + * + * Patched value is determined by relocation kind and target specification. + * For existence relocations target spec will be NULL if field/type is not found. + * Expected insn->imm value is determined using relocation kind and local + * spec, and is checked before patching instruction. If actual insn->imm value + * is wrong, bail out with error. + * + * Currently supported classes of BPF instruction are: + * 1. rX = <imm> (assignment with immediate operand); + * 2. rX += <imm> (arithmetic operations with immediate operand); + * 3. rX = <imm64> (load with 64-bit immediate value); + * 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64}; + * 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64}; + * 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}.
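For example, a BPF_FIELD_BYTE_OFFSET relocation against class 4 above is patched roughly as follows (offsets invented for illustration, using the same rX/rY notation as the comment):

    r2 = *(u32 *)(r1 + 16)   /* before: 16 is the field offset in local BTF */
    r2 = *(u32 *)(r1 + 24)   /* after: insn->off was validated against
                              * res->orig_val (16), then set to
                              * res->new_val (24) from target BTF */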
+ */ +static int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res) +{ + __u32 orig_val, new_val; + __u8 class; + + class = BPF_CLASS(insn->code); + + if (res->poison) { +poison: + /* poison second part of ldimm64 to avoid confusing error from + * verifier about "unknown opcode 00" + */ + if (is_ldimm64_insn(insn)) + bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); + bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); + return 0; + } + + orig_val = res->orig_val; + new_val = res->new_val; + + switch (class) { + case BPF_ALU: + case BPF_ALU64: + if (BPF_SRC(insn->code) != BPF_K) + return -EINVAL; + if (res->validate && insn->imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", + prog_name, relo_idx, + insn_idx, insn->imm, orig_val, new_val); + return -EINVAL; + } + orig_val = insn->imm; + insn->imm = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", + prog_name, relo_idx, insn_idx, + orig_val, new_val); + break; + case BPF_LDX: + case BPF_ST: + case BPF_STX: + if (res->validate && insn->off != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", + prog_name, relo_idx, insn_idx, insn->off, orig_val, new_val); + return -EINVAL; + } + if (new_val > SHRT_MAX) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", + prog_name, relo_idx, insn_idx, new_val); + return -ERANGE; + } + if (res->fail_memsz_adjust) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. " + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", + prog_name, relo_idx, insn_idx); + goto poison; + } + + orig_val = insn->off; + insn->off = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", + prog_name, relo_idx, insn_idx, orig_val, new_val); + + if (res->new_sz != res->orig_sz) { + int insn_bytes_sz, insn_bpf_sz; + + insn_bytes_sz = insn_bpf_size_to_bytes(insn); + if (insn_bytes_sz != res->orig_sz) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", + prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + return -EINVAL; + } + + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); + if (insn_bpf_sz < 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", + prog_name, relo_idx, insn_idx, res->new_sz); + return -EINVAL; + } + + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", + prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + } + break; + case BPF_LD: { + __u64 imm; + + if (!is_ldimm64_insn(insn) || + insn[0].src_reg != 0 || insn[0].off != 0 || + insn[1].code != 0 || insn[1].dst_reg != 0 || + insn[1].src_reg != 0 || insn[1].off != 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", + prog_name, relo_idx, insn_idx); + return -EINVAL; + } + + imm = insn[0].imm + ((__u64)insn[1].imm << 32); + if (res->validate && imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", + prog_name, relo_idx, + insn_idx, (unsigned long long)imm, + orig_val, new_val); + return -EINVAL; + } + + insn[0].imm = new_val; + insn[1].imm = 0; /* 
currently only 32-bit values are supported */ + pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)imm, new_val); + break; + } + default: + pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", + prog_name, relo_idx, insn_idx, insn->code, + insn->src_reg, insn->dst_reg, insn->off, insn->imm); + return -EINVAL; + } + + return 0; +} + +/* Output spec definition in the format: + * [<type-id>] (<type-name>) + <raw-spec> => <offset>@<spec>, + * where <spec> is a C-syntax view of recorded field access, e.g.: x.a[3].b + */ +static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec) +{ + const struct btf_type *t; + const struct btf_enum *e; + const char *s; + __u32 type_id; + int i; + + type_id = spec->root_type_id; + t = btf__type_by_id(spec->btf, type_id); + s = btf__name_by_offset(spec->btf, t->name_off); + + libbpf_print(level, "[%u] %s %s", type_id, btf_kind_str(t), str_is_empty(s) ? "<anon>" : s); + + if (core_relo_is_type_based(spec->relo_kind)) + return; + + if (core_relo_is_enumval_based(spec->relo_kind)) { + t = skip_mods_and_typedefs(spec->btf, type_id, NULL); + e = btf_enum(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + + libbpf_print(level, "::%s = %u", s, e->val); + return; + } + + if (core_relo_is_field_based(spec->relo_kind)) { + for (i = 0; i < spec->len; i++) { + if (spec->spec[i].name) + libbpf_print(level, ".%s", spec->spec[i].name); + else if (i > 0 || spec->spec[i].idx > 0) + libbpf_print(level, "[%u]", spec->spec[i].idx); + } + + libbpf_print(level, " ("); + for (i = 0; i < spec->raw_len; i++) + libbpf_print(level, "%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); + + if (spec->bit_offset % 8) + libbpf_print(level, " @ offset %u.%u)", + spec->bit_offset / 8, spec->bit_offset % 8); + else + libbpf_print(level, " @ offset %u)", spec->bit_offset / 8); + return; + } +} + +/* + * CO-RE relocate single instruction. + * + * The outline and important points of the algorithm: + * 1. For given local type, find corresponding candidate target types. + * Candidate type is a type with the same "essential" name, ignoring + * everything after last triple underscore (___). E.g., `sample`, + * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates + * for each other. Names with triple underscore are referred to as + * "flavors" and are useful, among other things, to allow one to + * specify/support incompatible variations of the same kernel struct, which + * might differ between different kernel versions and/or build + * configurations. + * + * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C + * converter, when deduplicated BTF of a kernel still contains more than + * one distinct type with the same name. In that case, ___2, ___3, etc + * are appended starting from second name conflict. But struct flavors are + * also useful when defined "locally", in a BPF program, to extract the same + * data across incompatible changes between different kernel + * versions/configurations. For instance, to handle field renames between + * kernel versions, one can define two flavors of the struct with the + * same common name and use conditional relocations to extract that field, + * depending on the target kernel version. + * 2. For each candidate type, try to match local specification to this + * candidate target type.
Matching involves finding corresponding + * high-level spec accessors, meaning that all named fields should match, + * as well as all array accesses should be within the actual bounds. Also, + * types should be compatible (see bpf_core_fields_are_compat for details). + * 3. It is supported and expected that there might be multiple flavors + * matching the spec. As long as all the specs resolve to the same set of + * offsets across all candidates, there is no error. If there is any + * ambiguity, CO-RE relocation will fail. This is necessary to accommodate + * imperfection of BTF deduplication, which can cause slight duplication of + * the same BTF type, if some directly or indirectly referenced (by + * pointer) type gets resolved to different actual types in different + * object files. If such a situation occurs, deduplicated BTF will end up + * with two (or more) structurally identical types, which differ only in + * types they refer to through pointer. This should be OK in most cases and + * is not an error. + * 4. Candidate types search is performed by linearly scanning through all + * types in target BTF. It is anticipated that this is overall more + * efficient memory-wise and not significantly worse (if not better) + * CPU-wise compared to prebuilding a map from all local type names to + * a list of candidate type names. It's also sped up by caching the resolved + * list of matching candidates for each local "root" type ID that has at + * least one bpf_core_relo associated with it. This list is shared + * between multiple relocations for the same type ID and is updated as some + * of the candidates are pruned due to structural incompatibility. + */ +int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands) +{ + struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; + struct bpf_core_relo_res cand_res, targ_res; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id; + const char *spec_str; + int i, j, err; + + local_id = relo->type_id; + local_type = btf__type_by_id(local_btf, local_id); + if (!local_type) + return -EINVAL; + + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + spec_str = btf__name_by_offset(local_btf, relo->access_str_off); + if (str_is_empty(spec_str)) + return -EINVAL; + + err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); + if (err) { + pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + str_is_empty(local_name) ?
"<anon>" : local_name, + spec_str, err); + return -EINVAL; + } + + pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, + relo_idx, core_relo_kind_str(relo->kind), relo->kind); + bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); + libbpf_print(LIBBPF_DEBUG, "\n"); + + /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ + if (relo->kind == BPF_TYPE_ID_LOCAL) { + targ_res.validate = true; + targ_res.poison = false; + targ_res.orig_val = local_spec.root_type_id; + targ_res.new_val = local_spec.root_type_id; + goto patch_insn; + } + + /* libbpf doesn't support candidate search for anonymous types */ + if (str_is_empty(spec_str)) { + pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + return -EOPNOTSUPP; + } + + + for (i = 0, j = 0; i < cands->len; i++) { + err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, + cands->cands[i].id, &cand_spec); + if (err < 0) { + pr_warn("prog '%s': relo #%d: error matching candidate #%d ", + prog_name, relo_idx, i); + bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); + libbpf_print(LIBBPF_WARN, ": %d\n", err); + return err; + } + + pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, + relo_idx, err == 0 ? "non-matching" : "matching", i); + bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); + libbpf_print(LIBBPF_DEBUG, "\n"); + + if (err == 0) + continue; + + err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, &cand_spec, &cand_res); + if (err) + return err; + + if (j == 0) { + targ_res = cand_res; + targ_spec = cand_spec; + } else if (cand_spec.bit_offset != targ_spec.bit_offset) { + /* if there are many field relo candidates, they + * should all resolve to the same bit offset + */ + pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", + prog_name, relo_idx, cand_spec.bit_offset, + targ_spec.bit_offset); + return -EINVAL; + } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { + /* all candidates should result in the same relocation + * decision and value, otherwise it's dangerous to + * proceed due to ambiguity + */ + pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", + prog_name, relo_idx, + cand_res.poison ? "failure" : "success", cand_res.new_val, + targ_res.poison ? "failure" : "success", targ_res.new_val); + return -EINVAL; + } + + cands->cands[j++] = cands->cands[i]; + } + + /* + * For BPF_FIELD_EXISTS relo or when used BPF program has field + * existence checks or kernel version/config checks, it's expected + * that we might not find any candidates. In this case, if field + * wasn't found in any candidate, the list of candidates shouldn't + * change at all, we'll just handle relocating appropriately, + * depending on relo's kind. + */ + if (j > 0) + cands->len = j; + + /* + * If no candidates were found, it might be both a programmer error, + * as well as expected case, depending whether instruction w/ + * relocation is guarded in some way that makes it unreachable (dead + * code) if relocation can't be resolved. This is handled in + * bpf_core_patch_insn() uniformly by replacing that instruction with + * BPF helper call insn (using invalid helper ID). If that instruction + * is indeed unreachable, then it will be ignored and eliminated by + * verifier. If it was an error, then verifier will complain and point + * to a specific instruction number in its log. 
+ */ + if (j == 0) { + pr_debug("prog '%s': relo #%d: no matching targets found\n", + prog_name, relo_idx); + + /* calculate single target relo result explicitly */ + err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, NULL, &targ_res); + if (err) + return err; + } + +patch_insn: + /* bpf_core_patch_insn() should know how to handle missing targ_spec */ + err = bpf_core_patch_insn(prog_name, insn, insn_idx, relo, relo_idx, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to patch insn #%u: %d\n", + prog_name, relo_idx, relo->insn_off / 8, err); + return -EINVAL; + } + + return 0; +} diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h new file mode 100644 index 000000000000..3b9f8f18346c --- /dev/null +++ b/tools/lib/bpf/relo_core.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2019 Facebook */ + +#ifndef __RELO_CORE_H +#define __RELO_CORE_H + +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. + */ +enum bpf_core_relo_kind { + BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_TYPE_SIZE = 9, /* type size in bytes */ + BPF_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_ENUMVAL_VALUE = 11, /* enum value integer value */ +}; + +/* The minimum bpf_core_relo checked by the loader + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, string is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * + * Example to provide a better feel. + * + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int x = &s->a; // encoded as "0:0" (a is field #0) + * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`.
+ * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + +struct bpf_core_cand { + const struct btf *btf; + const struct btf_type *t; + const char *name; + __u32 id; +}; + +/* dynamically sized list of type IDs and its associated struct btf */ +struct bpf_core_cand_list { + struct bpf_core_cand *cands; + int len; +}; + +int bpf_core_apply_relo_insn(const char *prog_name, + struct bpf_insn *insn, int insn_idx, + const struct bpf_core_relo *relo, int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands); +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id); + +size_t bpf_core_essential_name_len(const char *name); +#endif diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index cdecda1ddd36..996d025b8ed8 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -223,10 +223,10 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, free(info_linear); return -1; } - if (btf__get_from_id(info->btf_id, &btf)) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id); err = -1; - btf = NULL; goto out; } perf_env__fetch_btf(env, info->btf_id, btf); @@ -296,7 +296,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, out: free(info_linear); - free(btf); + btf__free(btf); return err ? 
-1 : 0; } @@ -478,7 +478,8 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) if (btf_id == 0) goto out; - if (btf__get_from_id(btf_id, &btf)) { + btf = btf__load_from_kernel_by_id(btf_id); + if (libbpf_get_error(btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, btf_id); goto out; @@ -486,7 +487,7 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) perf_env__fetch_btf(env, btf_id, btf); out: - free(btf); + btf__free(btf); close(fd); } diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 8150e03367bb..ba0f20853651 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -64,8 +64,8 @@ static char *bpf_target_prog_name(int tgt_fd) struct bpf_prog_info_linear *info_linear; struct bpf_func_info *func_info; const struct btf_type *t; + struct btf *btf = NULL; char *name = NULL; - struct btf *btf; info_linear = bpf_program__get_prog_info_linear( tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO); @@ -74,12 +74,17 @@ static char *bpf_target_prog_name(int tgt_fd) return NULL; } - if (info_linear->info.btf_id == 0 || - btf__get_from_id(info_linear->info.btf_id, &btf)) { + if (info_linear->info.btf_id == 0) { pr_debug("prog FD %d doesn't have valid btf\n", tgt_fd); goto out; } + btf = btf__load_from_kernel_by_id(info_linear->info.btf_id); + if (libbpf_get_error(btf)) { + pr_debug("failed to load btf for prog FD %d\n", tgt_fd); + goto out; + } + func_info = u64_to_ptr(info_linear->info.func_info); t = btf__type_by_id(btf, func_info[0].type_id); if (!t) { @@ -89,6 +94,7 @@ static char *bpf_target_prog_name(int tgt_fd) } name = strdup(btf__name_by_offset(btf, t->name_off)); out: + btf__free(btf); free(info_linear); return name; } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index fb010a35d61a..da9e8b699e42 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -38,6 +38,7 @@ TARGETS += mount_setattr TARGETS += mqueue TARGETS += nci TARGETS += net +TARGETS += net/af_unix TARGETS += net/forwarding TARGETS += net/mptcp TARGETS += netfilter diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index addcfd8b615e..433f8bef261e 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,7 +23,6 @@ test_skb_cgroup_id_user test_cgroup_storage test_flow_dissector flow_dissector_load -test_netcnt test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f405b20c1e6c..2a58b7b5aea4 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -38,7 +38,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_verifier_log test_dev_cgroup \ test_sock test_sockmap get_cgroup_id_user \ test_cgroup_storage \ - test_netcnt test_tcpnotify_user test_sysctl \ + test_tcpnotify_user test_sysctl \ test_progs-no_alu32 # Also test bpf-gcc, if present @@ -197,7 +197,6 @@ $(OUTPUT)/test_sockmap: cgroup_helpers.c $(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c -$(OUTPUT)/test_netcnt: cgroup_helpers.c $(OUTPUT)/test_sock_fields: cgroup_helpers.c $(OUTPUT)/test_sysctl: cgroup_helpers.c diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 8deec1ca9150..9b17f2867488 100644 
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -19,6 +19,13 @@ the CI. It builds the kernel (without overwriting your existing Kconfig), recomp
 bpf selftests, runs them (by default ``tools/testing/selftests/bpf/test_progs``)
 and saves the resulting output (by default in ``~/.bpf_selftests``).
+Script dependencies:
+- clang (preferably built from sources, https://github.com/llvm/llvm-project);
+- pahole (preferably built from sources, https://git.kernel.org/pub/scm/devel/pahole/pahole.git/);
+- qemu;
+- docutils (for ``rst2man``);
+- libcap-devel.
+
 For more information about using the script, run:
 .. code-block:: console
diff --git a/tools/testing/selftests/bpf/netcnt_common.h b/tools/testing/selftests/bpf/netcnt_common.h
index 81084c1c2c23..0ab1c88041cd 100644
--- a/tools/testing/selftests/bpf/netcnt_common.h
+++ b/tools/testing/selftests/bpf/netcnt_common.h
@@ -6,19 +6,39 @@
 #define MAX_PERCPU_PACKETS 32
-struct percpu_net_cnt {
- __u64 packets;
- __u64 bytes;
+/* sizeof(struct bpf_local_storage_elem):
+ *
+ * It really is about 128 bytes on x86_64, but allocate more to account for
+ * possible layout changes, different architectures, etc.
+ * The kernel will wrap up to PAGE_SIZE internally anyway.
+ */
+#define SIZEOF_BPF_LOCAL_STORAGE_ELEM 256
- __u64 prev_ts;
+/* Try to estimate kernel's BPF_LOCAL_STORAGE_MAX_VALUE_SIZE: */
+#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE (0xFFFF - \
+ SIZEOF_BPF_LOCAL_STORAGE_ELEM)
- __u64 prev_packets;
- __u64 prev_bytes;
+#define PCPU_MIN_UNIT_SIZE 32768
+
+union percpu_net_cnt {
+ struct {
+ __u64 packets;
+ __u64 bytes;
+
+ __u64 prev_ts;
+
+ __u64 prev_packets;
+ __u64 prev_bytes;
+ };
+ __u8 data[PCPU_MIN_UNIT_SIZE];
 };
-struct net_cnt {
- __u64 packets;
- __u64 bytes;
+union net_cnt {
+ struct {
+ __u64 packets;
+ __u64 bytes;
+ };
+ __u8 data[BPF_LOCAL_STORAGE_MAX_VALUE_SIZE];
 };
 #endif
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 2060bc122c53..d6857683397f 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -66,17 +66,13 @@ int settimeo(int fd, int timeout_ms)
 #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; })
-int start_server(int family, int type, const char *addr_str, __u16 port,
- int timeout_ms)
+static int __start_server(int type, const struct sockaddr *addr,
+ socklen_t addrlen, int timeout_ms, bool reuseport)
 {
- struct sockaddr_storage addr = {};
- socklen_t len;
+ int on = 1;
 int fd;
- if (make_sockaddr(family, addr_str, port, &addr, &len))
- return -1;
-
- fd = socket(family, type, 0);
+ fd = socket(addr->sa_family, type, 0);
 if (fd < 0) {
 log_err("Failed to create server socket");
 return -1;
@@ -85,7 +81,13 @@ int start_server(int family, int type, const char *addr_str, __u16 port,
 if (settimeo(fd, timeout_ms))
 goto error_close;
- if (bind(fd, (const struct sockaddr *)&addr, len) < 0) {
+ if (reuseport &&
+ setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) {
+ log_err("Failed to set SO_REUSEPORT");
+ goto error_close;
+ }
+
+ if (bind(fd, addr, addrlen) < 0) {
 log_err("Failed to bind socket");
 goto error_close;
 }
@@ -104,6 +106,69 @@ error_close:
 return -1;
 }
+int start_server(int family, int type, const char *addr_str, __u16 port,
+ int timeout_ms)
+{
+ struct sockaddr_storage addr;
+ socklen_t addrlen;
+
+ if (make_sockaddr(family, addr_str, port, &addr, &addrlen))
+ return -1;
+
+ return __start_server(type, (struct sockaddr
*)&addr, + addrlen, timeout_ms, false); +} + +int *start_reuseport_server(int family, int type, const char *addr_str, + __u16 port, int timeout_ms, unsigned int nr_listens) +{ + struct sockaddr_storage addr; + unsigned int nr_fds = 0; + socklen_t addrlen; + int *fds; + + if (!nr_listens) + return NULL; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return NULL; + + fds = malloc(sizeof(*fds) * nr_listens); + if (!fds) + return NULL; + + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, + timeout_ms, true); + if (fds[0] == -1) + goto close_fds; + nr_fds = 1; + + if (getsockname(fds[0], (struct sockaddr *)&addr, &addrlen)) + goto close_fds; + + for (; nr_fds < nr_listens; nr_fds++) { + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, + addrlen, timeout_ms, true); + if (fds[nr_fds] == -1) + goto close_fds; + } + + return fds; + +close_fds: + free_fds(fds, nr_fds); + return NULL; +} + +void free_fds(int *fds, unsigned int nr_close_fds) +{ + if (fds) { + while (nr_close_fds) + close(fds[--nr_close_fds]); + free(fds); + } +} + int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms) { @@ -217,6 +282,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, if (family == AF_INET) { struct sockaddr_in *sin = (void *)addr; + memset(addr, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_port = htons(port); if (addr_str && @@ -230,6 +296,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, } else if (family == AF_INET6) { struct sockaddr_in6 *sin6 = (void *)addr; + memset(addr, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(port); if (addr_str && @@ -243,3 +310,15 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, } return -1; } + +char *ping_command(int family) +{ + if (family == AF_INET6) { + /* On some systems 'ping' doesn't support IPv6, so use ping6 if it is present. 
*/ + if (!system("which ping6 >/dev/null 2>&1")) + return "ping6"; + else + return "ping -6"; + } + return "ping"; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5e0d51c07b63..c59a8f6d770b 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -36,11 +36,16 @@ extern struct ipv6_packet pkt_v6; int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); +int *start_reuseport_server(int family, int type, const char *addr_str, + __u16 port, int timeout_ms, + unsigned int nr_listens); +void free_fds(int *fds, unsigned int nr_close_fds); int connect_to_fd(int server_fd, int timeout_ms); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms); int make_sockaddr(int family, const char *addr_str, __u16 port, struct sockaddr_storage *addr, socklen_t *len); +char *ping_command(int family); #endif diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c new file mode 100644 index 000000000000..85babb0487b3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include <sched.h> +#include <test_progs.h> +#include "network_helpers.h" +#include "bpf_dctcp.skel.h" +#include "bpf_cubic.skel.h" +#include "bpf_iter_setsockopt.skel.h" + +static int create_netns(void) +{ + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns")) + return -1; + + if (!ASSERT_OK(system("ip link set dev lo up"), "bring up lo")) + return -1; + + return 0; +} + +static unsigned int set_bpf_cubic(int *fds, unsigned int nr_fds) +{ + unsigned int i; + + for (i = 0; i < nr_fds; i++) { + if (setsockopt(fds[i], SOL_TCP, TCP_CONGESTION, "bpf_cubic", + sizeof("bpf_cubic"))) + return i; + } + + return nr_fds; +} + +static unsigned int check_bpf_dctcp(int *fds, unsigned int nr_fds) +{ + char tcp_cc[16]; + socklen_t optlen = sizeof(tcp_cc); + unsigned int i; + + for (i = 0; i < nr_fds; i++) { + if (getsockopt(fds[i], SOL_TCP, TCP_CONGESTION, + tcp_cc, &optlen) || + strcmp(tcp_cc, "bpf_dctcp")) + return i; + } + + return nr_fds; +} + +static int *make_established(int listen_fd, unsigned int nr_est, + int **paccepted_fds) +{ + int *est_fds, *accepted_fds; + unsigned int i; + + est_fds = malloc(sizeof(*est_fds) * nr_est); + if (!est_fds) + return NULL; + + accepted_fds = malloc(sizeof(*accepted_fds) * nr_est); + if (!accepted_fds) { + free(est_fds); + return NULL; + } + + for (i = 0; i < nr_est; i++) { + est_fds[i] = connect_to_fd(listen_fd, 0); + if (est_fds[i] == -1) + break; + if (set_bpf_cubic(&est_fds[i], 1) != 1) { + close(est_fds[i]); + break; + } + + accepted_fds[i] = accept(listen_fd, NULL, 0); + if (accepted_fds[i] == -1) { + close(est_fds[i]); + break; + } + } + + if (!ASSERT_EQ(i, nr_est, "create established fds")) { + free_fds(accepted_fds, i); + free_fds(est_fds, i); + return NULL; + } + + *paccepted_fds = accepted_fds; + return est_fds; +} + +static unsigned short get_local_port(int fd) +{ + struct sockaddr_in6 addr; + socklen_t addrlen = sizeof(addr); + + if (!getsockname(fd, &addr, &addrlen)) + return ntohs(addr.sin6_port); + + return 0; +} + +static void do_bpf_iter_setsockopt(struct bpf_iter_setsockopt *iter_skel, + bool 
random_retry) +{ + int *reuse_listen_fds = NULL, *accepted_fds = NULL, *est_fds = NULL; + unsigned int nr_reuse_listens = 256, nr_est = 256; + int err, iter_fd = -1, listen_fd = -1; + char buf; + + /* Prepare non-reuseport listen_fd */ + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(listen_fd, 0, "start_server")) + return; + if (!ASSERT_EQ(set_bpf_cubic(&listen_fd, 1), 1, + "set listen_fd to cubic")) + goto done; + iter_skel->bss->listen_hport = get_local_port(listen_fd); + if (!ASSERT_NEQ(iter_skel->bss->listen_hport, 0, + "get_local_port(listen_fd)")) + goto done; + + /* Connect to non-reuseport listen_fd */ + est_fds = make_established(listen_fd, nr_est, &accepted_fds); + if (!ASSERT_OK_PTR(est_fds, "create established")) + goto done; + + /* Prepare reuseport listen fds */ + reuse_listen_fds = start_reuseport_server(AF_INET6, SOCK_STREAM, + "::1", 0, 0, + nr_reuse_listens); + if (!ASSERT_OK_PTR(reuse_listen_fds, "start_reuseport_server")) + goto done; + if (!ASSERT_EQ(set_bpf_cubic(reuse_listen_fds, nr_reuse_listens), + nr_reuse_listens, "set reuse_listen_fds to cubic")) + goto done; + iter_skel->bss->reuse_listen_hport = get_local_port(reuse_listen_fds[0]); + if (!ASSERT_NEQ(iter_skel->bss->reuse_listen_hport, 0, + "get_local_port(reuse_listen_fds[0])")) + goto done; + + /* Run bpf tcp iter to switch from bpf_cubic to bpf_dctcp */ + iter_skel->bss->random_retry = random_retry; + iter_fd = bpf_iter_create(bpf_link__fd(iter_skel->links.change_tcp_cc)); + if (!ASSERT_GE(iter_fd, 0, "create iter_fd")) + goto done; + + while ((err = read(iter_fd, &buf, sizeof(buf))) == -1 && + errno == EAGAIN) + ; + if (!ASSERT_OK(err, "read iter error")) + goto done; + + /* Check reuseport listen fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(reuse_listen_fds, nr_reuse_listens), + nr_reuse_listens, + "check reuse_listen_fds dctcp"); + + /* Check non reuseport listen fd for dctcp */ + ASSERT_EQ(check_bpf_dctcp(&listen_fd, 1), 1, + "check listen_fd dctcp"); + + /* Check established fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(est_fds, nr_est), nr_est, + "check est_fds dctcp"); + + /* Check accepted fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(accepted_fds, nr_est), nr_est, + "check accepted_fds dctcp"); + +done: + if (iter_fd != -1) + close(iter_fd); + if (listen_fd != -1) + close(listen_fd); + free_fds(reuse_listen_fds, nr_reuse_listens); + free_fds(accepted_fds, nr_est); + free_fds(est_fds, nr_est); +} + +void test_bpf_iter_setsockopt(void) +{ + struct bpf_iter_setsockopt *iter_skel = NULL; + struct bpf_cubic *cubic_skel = NULL; + struct bpf_dctcp *dctcp_skel = NULL; + struct bpf_link *cubic_link = NULL; + struct bpf_link *dctcp_link = NULL; + + if (create_netns()) + return; + + /* Load iter_skel */ + iter_skel = bpf_iter_setsockopt__open_and_load(); + if (!ASSERT_OK_PTR(iter_skel, "iter_skel")) + return; + iter_skel->links.change_tcp_cc = bpf_program__attach_iter(iter_skel->progs.change_tcp_cc, NULL); + if (!ASSERT_OK_PTR(iter_skel->links.change_tcp_cc, "attach iter")) + goto done; + + /* Load bpf_cubic */ + cubic_skel = bpf_cubic__open_and_load(); + if (!ASSERT_OK_PTR(cubic_skel, "cubic_skel")) + goto done; + cubic_link = bpf_map__attach_struct_ops(cubic_skel->maps.cubic); + if (!ASSERT_OK_PTR(cubic_link, "cubic_link")) + goto done; + + /* Load bpf_dctcp */ + dctcp_skel = bpf_dctcp__open_and_load(); + if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) + goto done; + dctcp_link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); + if (!ASSERT_OK_PTR(dctcp_link, "dctcp_link")) + 
goto done; + + do_bpf_iter_setsockopt(iter_skel, true); + do_bpf_iter_setsockopt(iter_skel, false); + +done: + bpf_link__destroy(cubic_link); + bpf_link__destroy(dctcp_link); + bpf_cubic__destroy(cubic_skel); + bpf_dctcp__destroy(dctcp_skel); + bpf_iter_setsockopt__destroy(iter_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 857e3f26086f..649f87382c8d 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4350,7 +4350,8 @@ static void do_test_file(unsigned int test_num) goto done; } - err = btf__get_from_id(info.btf_id, &btf); + btf = btf__load_from_kernel_by_id(info.btf_id); + err = libbpf_get_error(btf); if (CHECK(err, "cannot get btf from kernel, err: %d", err)) goto done; @@ -4386,6 +4387,7 @@ skip: fprintf(stderr, "OK"); done: + btf__free(btf); free(func_info); bpf_object__close(obj); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 1b90e684ff13..52ccf0cf35e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -232,7 +232,593 @@ err_out: btf__free(btf); } +#define STRSIZE 4096 + +static void btf_dump_snprintf(void *ctx, const char *fmt, va_list args) +{ + char *s = ctx, new[STRSIZE]; + + vsnprintf(new, STRSIZE, fmt, args); + if (strlen(s) < STRSIZE) + strncat(s, new, STRSIZE - strlen(s) - 1); +} + +static int btf_dump_data(struct btf *btf, struct btf_dump *d, + char *name, char *prefix, __u64 flags, void *ptr, + size_t ptr_sz, char *str, const char *expected_val) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + size_t type_sz; + __s32 type_id; + int ret = 0; + + if (flags & BTF_F_COMPACT) + opts.compact = true; + if (flags & BTF_F_NONAME) + opts.skip_names = true; + if (flags & BTF_F_ZERO) + opts.emit_zeroes = true; + if (prefix) { + ASSERT_STRNEQ(name, prefix, strlen(prefix), + "verify prefix match"); + name += strlen(prefix) + 1; + } + type_id = btf__find_by_name(btf, name); + if (!ASSERT_GE(type_id, 0, "find type id")) + return -ENOENT; + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + ret = btf_dump__dump_type_data(d, type_id, ptr, ptr_sz, &opts); + if (type_sz <= ptr_sz) { + if (!ASSERT_EQ(ret, type_sz, "failed/unexpected type_sz")) + return -EINVAL; + } else { + if (!ASSERT_EQ(ret, -E2BIG, "failed to return -E2BIG")) + return -EINVAL; + } + if (!ASSERT_STREQ(str, expected_val, "ensure expected/actual match")) + return -EFAULT; + return 0; +} + +#define TEST_BTF_DUMP_DATA(_b, _d, _prefix, _str, _type, _flags, \ + _expected, ...) \ + do { \ + char __ptrtype[64] = #_type; \ + char *_ptrtype = (char *)__ptrtype; \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _ptrtype, _prefix, _flags, \ + _ptr, sizeof(_type), _str, \ + _expected); \ + } while (0) + +/* Use where expected data string matches its stringified declaration */ +#define TEST_BTF_DUMP_DATA_C(_b, _d, _prefix, _str, _type, _flags, \ + ...) \ + TEST_BTF_DUMP_DATA(_b, _d, _prefix, _str, _type, _flags, \ + "(" #_type ")" #__VA_ARGS__, __VA_ARGS__) + +/* overflow test; pass typesize < expected type size, ensure E2BIG returned */ +#define TEST_BTF_DUMP_DATA_OVER(_b, _d, _prefix, _str, _type, _type_sz, \ + _expected, ...) 
\ + do { \ + char __ptrtype[64] = #_type; \ + char *_ptrtype = (char *)__ptrtype; \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _ptrtype, _prefix, 0, \ + _ptr, _type_sz, _str, _expected); \ + } while (0) + +#define TEST_BTF_DUMP_VAR(_b, _d, _prefix, _str, _var, _type, _flags, \ + _expected, ...) \ + do { \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _var, _prefix, _flags, \ + _ptr, sizeof(_type), _str, \ + _expected); \ + } while (0) + +static void test_btf_dump_int_data(struct btf *btf, struct btf_dump *d, + char *str) +{ +#ifdef __SIZEOF_INT128__ + __int128 i = 0xffffffffffffffff; + + /* this dance is required because we cannot directly initialize + * a 128-bit value to anything larger than a 64-bit value. + */ + i = (i << 64) | (i - 1); +#endif + /* simple int */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, int, BTF_F_COMPACT, 1234); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "1234", 1234); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, 0, "(int)1234", 1234); + + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT, "(int)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_ZERO, + "(int)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, int, BTF_F_COMPACT, -4567); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "-4567", -4567); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, 0, "(int)-4567", -4567); + + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, int, sizeof(int)-1, "", 1); + +#ifdef __SIZEOF_INT128__ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, __int128, BTF_F_COMPACT, + "(__int128)0xffffffffffffffff", + 0xffffffffffffffff); + ASSERT_OK(btf_dump_data(btf, d, "__int128", NULL, 0, &i, 16, str, + "(__int128)0xfffffffffffffffffffffffffffffffe"), + "dump __int128"); +#endif +} + +static void test_btf_dump_float_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + float t1 = 1.234567; + float t2 = -1.234567; + float t3 = 0.0; + double t4 = 5.678912; + double t5 = -5.678912; + double t6 = 0.0; + long double t7 = 9.876543; + long double t8 = -9.876543; + long double t9 = 0.0; + + /* since the kernel does not likely have any float types in its BTF, we + * will need to add some of various sizes. 
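+ *
+ * The libbpf API contract assumed here: btf__add_float(btf, name, sz)
+ * appends a BTF_KIND_FLOAT type of 'sz' bytes and returns the new
+ * type's ID (positive) on success or a negative error code, which is
+ * why ASSERT_GT(..., 0, ...) is used as the success check below.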
+ */ + + ASSERT_GT(btf__add_float(btf, "test_float", 4), 0, "add float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t1, 4, str, + "(test_float)1.234567"), "dump float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t2, 4, str, + "(test_float)-1.234567"), "dump float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t3, 4, str, + "(test_float)0.000000"), "dump float"); + + ASSERT_GT(btf__add_float(btf, "test_double", 8), 0, "add_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t4, 8, str, + "(test_double)5.678912"), "dump double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t5, 8, str, + "(test_double)-5.678912"), "dump double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t6, 8, str, + "(test_double)0.000000"), "dump double"); + + ASSERT_GT(btf__add_float(btf, "test_long_double", 16), 0, "add long double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t7, 16, + str, "(test_long_double)9.876543"), + "dump long_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t8, 16, + str, "(test_long_double)-9.876543"), + "dump long_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t9, 16, + str, "(test_long_double)0.000000"), + "dump long_double"); +} + +static void test_btf_dump_char_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* simple char */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, char, BTF_F_COMPACT, 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME, + "100", 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, 0, "(char)100", 100); + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT, + "(char)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_ZERO, + "(char)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, 0, "(char)0", 0); + + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, char, sizeof(char)-1, "", 100); +} + +static void test_btf_dump_typedef_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* simple typedef */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, uint64_t, BTF_F_COMPACT, 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_NONAME, + "1", 1); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, 0, "(u64)1", 1); + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT, "(u64)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_ZERO, + "(u64)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, 0, "(u64)0", 0); + + /* typedef struct */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, atomic_t, BTF_F_COMPACT, + {.counter = (int)1,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_NONAME, + "{1,}", { .counter = 1 }); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, 0, +"(atomic_t){\n" +" .counter = (int)1,\n" +"}", + {.counter = 1,}); + /* typedef with 0 value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT, "(atomic_t){}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, 
atomic_t, BTF_F_COMPACT | BTF_F_NONAME, + "{}", {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, 0, +"(atomic_t){\n" +"}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_ZERO, + "(atomic_t){.counter = (int)0,}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "{0,}", {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_ZERO, +"(atomic_t){\n" +" .counter = (int)0,\n" +"}", + { .counter = 0,}); + + /* overflow should show type but not value since it overflows */ + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, atomic_t, sizeof(atomic_t)-1, + "(atomic_t){\n", { .counter = 1}); +} + +static void test_btf_dump_enum_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* enum where enum value does (and does not) exist */ + TEST_BTF_DUMP_DATA_C(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, + "(enum bpf_cmd)BPF_MAP_CREATE", 0); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME, + "BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, 0, + "(enum bpf_cmd)BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "BPF_MAP_CREATE", 0); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_ZERO, + "(enum bpf_cmd)BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "BPF_MAP_CREATE", BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA_C(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, 2000); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME, + "2000", 2000); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, 0, + "(enum bpf_cmd)2000", 2000); + + TEST_BTF_DUMP_DATA_OVER(btf, d, "enum", str, enum bpf_cmd, + sizeof(enum bpf_cmd) - 1, "", BPF_MAP_CREATE); +} + +static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + char zero_data[512] = { }; + char type_data[512]; + void *fops = type_data; + void *skb = type_data; + size_t type_sz; + __s32 type_id; + char *cmpstr; + int ret; + + memset(type_data, 255, sizeof(type_data)); + + /* simple struct */ + TEST_BTF_DUMP_DATA_C(btf, d, "struct", str, struct btf_enum, BTF_F_COMPACT, + {.name_off = (__u32)3,.val = (__s32)-1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{3,-1,}", + { .name_off = 3, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, 0, +"(struct btf_enum){\n" +" .name_off = (__u32)3,\n" +" .val = (__s32)-1,\n" +"}", + { .name_off = 3, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{-1,}", + { .name_off = 0, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "{0,-1,}", + { .name_off = 0, .val = -1,}); + /* empty struct should be printed */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, BTF_F_COMPACT, + "(struct btf_enum){}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{}", + { .name_off = 0, .val = 0,}); + 
TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, 0,
+"(struct btf_enum){\n"
+"}",
+ { .name_off = 0, .val = 0,});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum,
+ BTF_F_COMPACT | BTF_F_ZERO,
+ "(struct btf_enum){.name_off = (__u32)0,.val = (__s32)0,}",
+ { .name_off = 0, .val = 0,});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum,
+ BTF_F_ZERO,
+"(struct btf_enum){\n"
+" .name_off = (__u32)0,\n"
+" .val = (__s32)0,\n"
+"}",
+ { .name_off = 0, .val = 0,});
+
+ /* struct with pointers */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, BTF_F_COMPACT,
+ "(struct list_head){.next = (struct list_head *)0x1,}",
+ { .next = (struct list_head *)1 });
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, 0,
+"(struct list_head){\n"
+" .next = (struct list_head *)0x1,\n"
+"}",
+ { .next = (struct list_head *)1 });
+ /* NULL pointer should not be displayed */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, BTF_F_COMPACT,
+ "(struct list_head){}",
+ { .next = (struct list_head *)0 });
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, 0,
+"(struct list_head){\n"
+"}",
+ { .next = (struct list_head *)0 });
+
+ /* struct with function pointers */
+ type_id = btf__find_by_name(btf, "file_operations");
+ if (ASSERT_GT(type_id, 0, "find type id")) {
+ type_sz = btf__resolve_size(btf, type_id);
+ str[0] = '\0';
+
+ ret = btf_dump__dump_type_data(d, type_id, fops, type_sz, &opts);
+ ASSERT_EQ(ret, type_sz,
+ "unexpected return value dumping file_operations");
+ cmpstr =
+"(struct file_operations){\n"
+" .owner = (struct module *)0xffffffffffffffff,\n"
+" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,";
+
+ ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations");
+ }
+
+ /* struct with char array */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT,
+ "(struct bpf_prog_info){.name = (char[16])['f','o','o',],}",
+ { .name = "foo",});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info,
+ BTF_F_COMPACT | BTF_F_NONAME,
+ "{['f','o','o',],}",
+ {.name = "foo",});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, 0,
+"(struct bpf_prog_info){\n"
+" .name = (char[16])[\n"
+" 'f',\n"
+" 'o',\n"
+" 'o',\n"
+" ],\n"
+"}",
+ {.name = "foo",});
+ /* leading null char means do not display string */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT,
+ "(struct bpf_prog_info){}",
+ {.name = {'\0', 'f', 'o', 'o'}});
+ /* handle non-printable characters */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT,
+ "(struct bpf_prog_info){.name = (char[16])[1,2,3,],}",
+ { .name = {1, 2, 3, 0}});
+
+ /* struct with non-char array */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, BTF_F_COMPACT,
+ "(struct __sk_buff){.cb = (__u32[5])[1,2,3,4,5,],}",
+ { .cb = {1, 2, 3, 4, 5,},});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff,
+ BTF_F_COMPACT | BTF_F_NONAME,
+ "{[1,2,3,4,5,],}",
+ { .cb = { 1, 2, 3, 4, 5},});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, 0,
+"(struct __sk_buff){\n"
+" .cb = (__u32[5])[\n"
+" 1,\n"
+" 2,\n"
+" 3,\n"
+" 4,\n"
+" 5,\n"
+" ],\n"
+"}",
+ { .cb = { 1, 2, 3, 4, 5},});
+ /* For non-char arrays, show non-zero values only */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, BTF_F_COMPACT,
+ "(struct __sk_buff){.cb = (__u32[5])[0,0,1,0,0,],}",
+ { .cb = { 0, 0, 1, 0, 0},});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct
__sk_buff, 0,
+"(struct __sk_buff){\n"
+" .cb = (__u32[5])[\n"
+" 0,\n"
+" 0,\n"
+" 1,\n"
+" 0,\n"
+" 0,\n"
+" ],\n"
+"}",
+ { .cb = { 0, 0, 1, 0, 0},});
+
+ /* struct with bitfields */
+ TEST_BTF_DUMP_DATA_C(btf, d, "struct", str, struct bpf_insn, BTF_F_COMPACT,
+ {.code = (__u8)1,.dst_reg = (__u8)0x2,.src_reg = (__u8)0x3,.off = (__s16)4,.imm = (__s32)5,});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn,
+ BTF_F_COMPACT | BTF_F_NONAME,
+ "{1,0x2,0x3,4,5,}",
+ { .code = 1, .dst_reg = 0x2, .src_reg = 0x3, .off = 4,
+ .imm = 5,});
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, 0,
+"(struct bpf_insn){\n"
+" .code = (__u8)1,\n"
+" .dst_reg = (__u8)0x2,\n"
+" .src_reg = (__u8)0x3,\n"
+" .off = (__s16)4,\n"
+" .imm = (__s32)5,\n"
+"}",
+ {.code = 1, .dst_reg = 2, .src_reg = 3, .off = 4, .imm = 5});
+
+ /* zeroed bitfields should not be displayed */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, BTF_F_COMPACT,
+ "(struct bpf_insn){.dst_reg = (__u8)0x1,}",
+ { .code = 0, .dst_reg = 1});
+
+ /* struct with enum bitfield */
+ type_id = btf__find_by_name(btf, "fs_context");
+ if (ASSERT_GT(type_id, 0, "find fs_context")) {
+ type_sz = btf__resolve_size(btf, type_id);
+ str[0] = '\0';
+
+ opts.emit_zeroes = true;
+ ret = btf_dump__dump_type_data(d, type_id, zero_data, type_sz, &opts);
+ ASSERT_EQ(ret, type_sz,
+ "unexpected return value dumping fs_context");
+
+ ASSERT_NEQ(strstr(str, "FS_CONTEXT_FOR_MOUNT"), NULL,
+ "bitfield value not present");
+ }
+
+ /* struct with nested anon union */
+ TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_sock_ops, BTF_F_COMPACT,
+ "(struct bpf_sock_ops){.op = (__u32)1,(union){.args = (__u32[4])[1,2,3,4,],.reply = (__u32)1,.replylong = (__u32[4])[1,2,3,4,],},}",
+ { .op = 1, .args = { 1, 2, 3, 4}});
+
+ /* union with nested struct */
+ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT,
+ "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}",
+ { .map = { .map_fd = 1 }});
+
+ /* struct skb with nested structs/unions; because type output is so
+ * complex, we don't do a string comparison, just verify we return
+ * the type size as the amount of data displayed.
+ */
+ type_id = btf__find_by_name(btf, "sk_buff");
+ if (ASSERT_GT(type_id, 0, "find struct sk_buff")) {
+ type_sz = btf__resolve_size(btf, type_id);
+ str[0] = '\0';
+
+ ret = btf_dump__dump_type_data(d, type_id, skb, type_sz, &opts);
+ ASSERT_EQ(ret, type_sz,
+ "unexpected return value dumping sk_buff");
+ }
+
+ /* overflow bpf_sock_ops struct with final element nonzero/zero.
+ * Regardless of the value of the final field, we don't have all the
+ * data we need to display it, so we should trigger an overflow.
+ * In other words, overflow checking should trump "is field zero?"
+ * checks, because if we've overflowed, it shouldn't matter what the
+ * field is - we can't trust its value, so we shouldn't display it.
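+ *
+ * Concretely, for the two checks below: with the passed-in size one
+ * byte short of sizeof(struct bpf_sock_ops), the dump emits only what
+ * it could safely read (just ".op" here) and btf_dump__dump_type_data()
+ * returns -E2BIG instead of guessing at the truncated tail.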
+ */ + TEST_BTF_DUMP_DATA_OVER(btf, d, "struct", str, struct bpf_sock_ops, + sizeof(struct bpf_sock_ops) - 1, + "(struct bpf_sock_ops){\n\t.op = (__u32)1,\n", + { .op = 1, .skb_tcp_flags = 2}); + TEST_BTF_DUMP_DATA_OVER(btf, d, "struct", str, struct bpf_sock_ops, + sizeof(struct bpf_sock_ops) - 1, + "(struct bpf_sock_ops){\n\t.op = (__u32)1,\n", + { .op = 1, .skb_tcp_flags = 0}); +} + +static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, + "static int cpu_profile_flip = (int)2", 2); +} + +static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, + const char *name, const char *expected_val, + void *data, size_t data_sz) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + int ret = 0, cmp; + size_t secsize; + __s32 type_id; + + opts.compact = true; + + type_id = btf__find_by_name(btf, name); + if (!ASSERT_GT(type_id, 0, "find type id")) + return; + + secsize = btf__resolve_size(btf, type_id); + ASSERT_EQ(secsize, 0, "verify section size"); + + str[0] = '\0'; + ret = btf_dump__dump_type_data(d, type_id, data, data_sz, &opts); + ASSERT_EQ(ret, 0, "unexpected return value"); + + cmp = strcmp(str, expected_val); + ASSERT_EQ(cmp, 0, "ensure expected/actual match"); +} + +static void test_btf_dump_datasec_data(char *str) +{ + struct btf *btf = btf__parse("xdping_kern.o", NULL); + struct btf_dump_opts opts = { .ctx = str }; + char license[4] = "GPL"; + struct btf_dump *d; + + if (!ASSERT_OK_PTR(btf, "xdping_kern.o BTF not found")) + return; + + d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf); + if (!ASSERT_OK_PTR(d, "could not create BTF dump")) + return; + + test_btf_datasec(btf, d, str, "license", + "SEC(\"license\") char[4] _license = (char[4])['G','P','L',];", + license, sizeof(license)); +} + void test_btf_dump() { + char str[STRSIZE]; + struct btf_dump_opts opts = { .ctx = str }; + struct btf_dump *d; + struct btf *btf; int i; for (i = 0; i < ARRAY_SIZE(btf_dump_test_cases); i++) { @@ -245,4 +831,33 @@ void test_btf_dump() { } if (test__start_subtest("btf_dump: incremental")) test_btf_dump_incremental(); + + btf = libbpf_find_kernel_btf(); + if (!ASSERT_OK_PTR(btf, "no kernel BTF found")) + return; + + d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf); + if (!ASSERT_OK_PTR(d, "could not create BTF dump")) + return; + + /* Verify type display for various types. 
*/ + if (test__start_subtest("btf_dump: int_data")) + test_btf_dump_int_data(btf, d, str); + if (test__start_subtest("btf_dump: float_data")) + test_btf_dump_float_data(btf, d, str); + if (test__start_subtest("btf_dump: char_data")) + test_btf_dump_char_data(btf, d, str); + if (test__start_subtest("btf_dump: typedef_data")) + test_btf_dump_typedef_data(btf, d, str); + if (test__start_subtest("btf_dump: enum_data")) + test_btf_dump_enum_data(btf, d, str); + if (test__start_subtest("btf_dump: struct_data")) + test_btf_dump_struct_data(btf, d, str); + if (test__start_subtest("btf_dump: var_data")) + test_btf_dump_var_data(btf, d, str); + btf_dump__free(d); + btf__free(btf); + + if (test__start_subtest("btf_dump: datasec_data")) + test_btf_dump_datasec_data(str); } diff --git a/tools/testing/selftests/bpf/prog_tests/core_autosize.c b/tools/testing/selftests/bpf/prog_tests/core_autosize.c index 981c251453d9..3d4b2a358d47 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_autosize.c +++ b/tools/testing/selftests/bpf/prog_tests/core_autosize.c @@ -53,8 +53,8 @@ void test_core_autosize(void) char btf_file[] = "/tmp/core_autosize.btf.XXXXXX"; int err, fd = -1, zero = 0; int char_id, short_id, int_id, long_long_id, void_ptr_id, id; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); struct test_core_autosize* skel = NULL; - struct bpf_object_load_attr load_attr = {}; struct bpf_program *prog; struct bpf_map *bss_map; struct btf *btf = NULL; @@ -125,9 +125,10 @@ void test_core_autosize(void) fd = -1; /* open and load BPF program with custom BTF as the kernel BTF */ - skel = test_core_autosize__open(); + open_opts.btf_custom_path = btf_file; + skel = test_core_autosize__open_opts(&open_opts); if (!ASSERT_OK_PTR(skel, "skel_open")) - return; + goto cleanup; /* disable handle_signed() for now */ prog = bpf_object__find_program_by_name(skel->obj, "handle_signed"); @@ -135,9 +136,7 @@ void test_core_autosize(void) goto cleanup; bpf_program__set_autoload(prog, false); - load_attr.obj = skel->obj; - load_attr.target_btf_path = btf_file; - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(skel->obj); if (!ASSERT_OK(err, "prog_load")) goto cleanup; @@ -204,14 +203,13 @@ void test_core_autosize(void) skel = NULL; /* now re-load with handle_signed() enabled, it should fail loading */ - skel = test_core_autosize__open(); + open_opts.btf_custom_path = btf_file; + skel = test_core_autosize__open_opts(&open_opts); if (!ASSERT_OK_PTR(skel, "skel_open")) - return; + goto cleanup; - load_attr.obj = skel->obj; - load_attr.target_btf_path = btf_file; - err = bpf_object__load_xattr(&load_attr); - if (!ASSERT_ERR(err, "bad_prog_load")) + err = test_core_autosize__load(skel); + if (!ASSERT_ERR(err, "skel_load")) goto cleanup; cleanup: diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index d02e064c535f..4739b15b2a97 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -816,7 +816,7 @@ static size_t roundup_page(size_t sz) void test_core_reloc(void) { const size_t mmap_sz = roundup_page(sizeof(struct data)); - struct bpf_object_load_attr load_attr = {}; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); struct core_reloc_test_case *test_case; const char *tp_name, *probe_name; int err, i, equal; @@ -846,9 +846,16 @@ void test_core_reloc(void) continue; } - obj = bpf_object__open_file(test_case->bpf_obj_file, NULL); + if (test_case->btf_src_file) { + err 
= access(test_case->btf_src_file, R_OK);
+ if (!ASSERT_OK(err, "btf_src_file"))
+ goto cleanup;
+ }
+
+ open_opts.btf_custom_path = test_case->btf_src_file;
+ obj = bpf_object__open_file(test_case->bpf_obj_file, &open_opts);
 if (!ASSERT_OK_PTR(obj, "obj_open"))
- continue;
+ goto cleanup;
 probe_name = "raw_tracepoint/sys_enter";
 tp_name = "sys_enter";
@@ -862,17 +869,7 @@ void test_core_reloc(void)
 "prog '%s' not found\n", probe_name))
 goto cleanup;
-
- if (test_case->btf_src_file) {
- err = access(test_case->btf_src_file, R_OK);
- if (!ASSERT_OK(err, "btf_src_file"))
- goto cleanup;
- }
-
- load_attr.obj = obj;
- load_attr.log_level = 0;
- load_attr.target_btf_path = test_case->btf_src_file;
- err = bpf_object__load_xattr(&load_attr);
+ err = bpf_object__load(obj);
 if (err) {
 if (!test_case->fails)
 ASSERT_OK(err, "obj_load");
diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
new file mode 100644
index 000000000000..02a465f36d59
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "get_func_ip_test.skel.h"
+
+void test_get_func_ip_test(void)
+{
+ struct get_func_ip_test *skel = NULL;
+ __u32 duration = 0, retval;
+ int err, prog_fd;
+
+ skel = get_func_ip_test__open();
+ if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open"))
+ return;
+
+ /* test6 and test7 are x86_64 specific because of the instruction
+ * offsets, disabling them for all other archs
+ */
+#ifndef __x86_64__
+ bpf_program__set_autoload(skel->progs.test6, false);
+ bpf_program__set_autoload(skel->progs.test7, false);
+#endif
+
+ err = get_func_ip_test__load(skel);
+ if (!ASSERT_OK(err, "get_func_ip_test__load"))
+ goto cleanup;
+
+ err = get_func_ip_test__attach(skel);
+ if (!ASSERT_OK(err, "get_func_ip_test__attach"))
+ goto cleanup;
+
+ prog_fd = bpf_program__fd(skel->progs.test1);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+ NULL, NULL, &retval, &duration);
+ ASSERT_OK(err, "test_run");
+ ASSERT_EQ(retval, 0, "test_run");
+
+ prog_fd = bpf_program__fd(skel->progs.test5);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+ NULL, NULL, &retval, &duration);
+
+ ASSERT_OK(err, "test_run");
+
+ ASSERT_EQ(skel->bss->test1_result, 1, "test1_result");
+ ASSERT_EQ(skel->bss->test2_result, 1, "test2_result");
+ ASSERT_EQ(skel->bss->test3_result, 1, "test3_result");
+ ASSERT_EQ(skel->bss->test4_result, 1, "test4_result");
+ ASSERT_EQ(skel->bss->test5_result, 1, "test5_result");
+#ifdef __x86_64__
+ ASSERT_EQ(skel->bss->test6_result, 1, "test6_result");
+ ASSERT_EQ(skel->bss->test7_result, 1, "test7_result");
+#endif
+
+cleanup:
+ get_func_ip_test__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/netcnt.c b/tools/testing/selftests/bpf/prog_tests/netcnt.c
new file mode 100644
index 000000000000..6ede48bde91b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/netcnt.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/sysinfo.h>
+#include <test_progs.h>
+#include "network_helpers.h"
+#include "netcnt_prog.skel.h"
+#include "netcnt_common.h"
+
+#define CG_NAME "/netcnt"
+
+void test_netcnt(void)
+{
+ union percpu_net_cnt *percpu_netcnt = NULL;
+ struct bpf_cgroup_storage_key key;
+ int map_fd, percpu_map_fd;
+ struct netcnt_prog *skel;
+ unsigned long packets;
+ union net_cnt netcnt;
+ unsigned long bytes;
+ int cpu, nproc;
+ int cg_fd = -1;
+ char cmd[128];
+
+ skel = netcnt_prog__open_and_load();
+ if
(!ASSERT_OK_PTR(skel, "netcnt_prog__open_and_load"))
+ return;
+
+ nproc = get_nprocs_conf();
+ percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc);
+ if (!ASSERT_OK_PTR(percpu_netcnt, "malloc(percpu_netcnt)"))
+ goto err;
+
+ cg_fd = test__join_cgroup(CG_NAME);
+ if (!ASSERT_GE(cg_fd, 0, "test__join_cgroup"))
+ goto err;
+
+ skel->links.bpf_nextcnt = bpf_program__attach_cgroup(skel->progs.bpf_nextcnt, cg_fd);
+ if (!ASSERT_OK_PTR(skel->links.bpf_nextcnt,
+ "attach_cgroup(bpf_nextcnt)"))
+ goto err;
+
+ snprintf(cmd, sizeof(cmd), "%s ::1 -A -c 10000 -q > /dev/null", ping_command(AF_INET6));
+ ASSERT_OK(system(cmd), cmd);
+
+ map_fd = bpf_map__fd(skel->maps.netcnt);
+ if (!ASSERT_OK(bpf_map_get_next_key(map_fd, NULL, &key), "bpf_map_get_next_key"))
+ goto err;
+
+ if (!ASSERT_OK(bpf_map_lookup_elem(map_fd, &key, &netcnt), "bpf_map_lookup_elem(netcnt)"))
+ goto err;
+
+ percpu_map_fd = bpf_map__fd(skel->maps.percpu_netcnt);
+ if (!ASSERT_OK(bpf_map_lookup_elem(percpu_map_fd, &key, &percpu_netcnt[0]),
+ "bpf_map_lookup_elem(percpu_netcnt)"))
+ goto err;
+
+ /* Some packets can still be in the per-cpu cache, but not more than
+ * MAX_PERCPU_PACKETS.
+ */
+ packets = netcnt.packets;
+ bytes = netcnt.bytes;
+ for (cpu = 0; cpu < nproc; cpu++) {
+ ASSERT_LE(percpu_netcnt[cpu].packets, MAX_PERCPU_PACKETS, "MAX_PERCPU_PACKETS");
+
+ packets += percpu_netcnt[cpu].packets;
+ bytes += percpu_netcnt[cpu].bytes;
+ }
+
+ /* No packets should be lost */
+ ASSERT_EQ(packets, 10000, "packets");
+
+ /* Let's check that the bytes counter matches the number of packets
+ * multiplied by the size of an IPv6 ICMP packet.
+ */
+ ASSERT_EQ(bytes, packets * 104, "bytes");
+
+err:
+ if (cg_fd != -1)
+ close(cg_fd);
+ free(percpu_netcnt);
+ netcnt_prog__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c
index fcf54b3a1dd0..d4b953ae3407 100644
--- a/tools/testing/selftests/bpf/prog_tests/pinning.c
+++ b/tools/testing/selftests/bpf/prog_tests/pinning.c
@@ -125,6 +125,10 @@ void test_pinning(void)
 if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno))
 goto out;
+ /* get pinning path */
+ if (!ASSERT_STREQ(bpf_map__pin_path(map), pinpath, "get pin path"))
+ goto out;
+
 /* set pinning path of other map and re-pin all */
 map = bpf_object__find_map_by_name(obj, "nopinmap");
 if (CHECK(!map, "find map", "NULL map"))
@@ -134,6 +138,11 @@ void test_pinning(void)
 if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno))
 goto out;
+ /* get pinning path after set */
+ if (!ASSERT_STREQ(bpf_map__pin_path(map), custpinpath,
+ "get pin path after set"))
+ goto out;
+
 /* should only pin the one unpinned map */
 err = bpf_object__pin_maps(obj, NULL);
 if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno))
diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
index de2688166696..4e91f4d6466c 100644
--- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
+++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
@@ -34,8 +34,8 @@ void test_reference_tracking(void)
 if (!test__start_subtest(title))
 continue;
- /* Expect verifier failure if test name has 'fail' */
- if (strstr(title, "fail") != NULL) {
+ /* Expect verifier failure if test name has 'err_' */
+ if (strstr(title, "err_") != NULL) {
 libbpf_print_fn_t old_print_fn;
 old_print_fn = libbpf_set_print(NULL);
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 515229f24a93..a9f1bf9d5dff 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -351,9 +351,11 @@ static void test_insert_opened(int family, int sotype, int mapfd) errno = 0; value = s; err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); - if (!err || errno != EOPNOTSUPP) - FAIL_ERRNO("map_update: expected EOPNOTSUPP"); - + if (sotype == SOCK_STREAM) { + if (!err || errno != EOPNOTSUPP) + FAIL_ERRNO("map_update: expected EOPNOTSUPP"); + } else if (err) + FAIL_ERRNO("map_update: expected success"); xclose(s); } @@ -919,6 +921,23 @@ static const char *redir_mode_str(enum redir_mode mode) } } +static int add_to_sockmap(int sock_mapfd, int fd1, int fd2) +{ + u64 value; + u32 key; + int err; + + key = 0; + value = fd1; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + return err; + + key = 1; + value = fd2; + return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); +} + static void redir_to_connected(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { @@ -928,7 +947,6 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, unsigned int pass; socklen_t len; int err, n; - u64 value; u32 key; char b; @@ -965,15 +983,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, if (p1 < 0) goto close_cli1; - key = 0; - value = p0; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); - if (err) - goto close_peer1; - - key = 1; - value = p1; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + err = add_to_sockmap(sock_mapfd, p0, p1); if (err) goto close_peer1; @@ -1061,7 +1071,6 @@ static void redir_to_listening(int family, int sotype, int sock_mapfd, int s, c, p, err, n; unsigned int drop; socklen_t len; - u64 value; u32 key; zero_verdict_count(verd_mapfd); @@ -1086,15 +1095,7 @@ static void redir_to_listening(int family, int sotype, int sock_mapfd, if (p < 0) goto close_cli; - key = 0; - value = s; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); - if (err) - goto close_peer; - - key = 1; - value = p; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + err = add_to_sockmap(sock_mapfd, s, p); if (err) goto close_peer; @@ -1346,7 +1347,6 @@ static void test_reuseport_mixed_groups(int family, int sotype, int sock_map, int s1, s2, c, err; unsigned int drop; socklen_t len; - u64 value; u32 key; zero_verdict_count(verd_map); @@ -1360,16 +1360,10 @@ static void test_reuseport_mixed_groups(int family, int sotype, int sock_map, if (s2 < 0) goto close_srv1; - key = 0; - value = s1; - err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + err = add_to_sockmap(sock_map, s1, s2); if (err) goto close_srv2; - key = 1; - value = s2; - err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); - /* Connect to s2, reuseport BPF selects s1 via sock_map[0] */ len = sizeof(addr); err = xgetsockname(s2, sockaddr(&addr), &len); @@ -1441,6 +1435,8 @@ static const char *family_str(sa_family_t family) return "IPv4"; case AF_INET6: return "IPv6"; + case AF_UNIX: + return "Unix"; default: return "unknown"; } @@ -1563,6 +1559,99 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, } } +static void unix_redir_to_connected(int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, 
p1; + unsigned int pass; + int retries = 100; + int err, n; + int sfd[2]; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + if (socketpair(AF_UNIX, sotype | SOCK_NONBLOCK, 0, sfd)) + return; + c0 = sfd[0], p0 = sfd[1]; + + if (socketpair(AF_UNIX, sotype | SOCK_NONBLOCK, 0, sfd)) + goto close0; + c1 = sfd[0], p1 = sfd[1]; + + err = add_to_sockmap(sock_mapfd, p0, p1); + if (err) + goto close; + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + +again: + n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); + if (n < 0) { + if (errno == EAGAIN && retries--) + goto again; + FAIL_ERRNO("%s: read", log_prefix); + } + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close: + xclose(c1); + xclose(p1); +close0: + xclose(c0); + xclose(p0); +} + +static void unix_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); + if (err) + return; + + skel->bss->test_ingress = false; + unix_redir_to_connected(sotype, sock_map, verdict_map, REDIR_EGRESS); + skel->bss->test_ingress = true; + unix_redir_to_connected(sotype, sock_map, verdict_map, REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int sotype) +{ + const char *family_name, *map_name; + char s[MAX_TEST_NAME]; + + family_name = family_str(AF_UNIX); + map_name = map_type_str(map); + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); + if (!test__start_subtest(s)) + return; + unix_skb_redir_to_connected(skel, map, sotype); +} + static void test_reuseport(struct test_sockmap_listen *skel, struct bpf_map *map, int family, int sotype) { @@ -1603,33 +1692,27 @@ static void test_reuseport(struct test_sockmap_listen *skel, } } -static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, - int verd_mapfd, enum redir_mode mode) +static int udp_socketpair(int family, int *s, int *c) { - const char *log_prefix = redir_mode_str(mode); struct sockaddr_storage addr; - int c0, c1, p0, p1; - unsigned int pass; - int retries = 100; socklen_t len; - int err, n; - u64 value; - u32 key; - char b; - - zero_verdict_count(verd_mapfd); + int p0, c0; + int err; - p0 = socket_loopback(family, sotype | SOCK_NONBLOCK); + p0 = socket_loopback(family, SOCK_DGRAM | SOCK_NONBLOCK); if (p0 < 0) - return; + return p0; + len = sizeof(addr); err = xgetsockname(p0, sockaddr(&addr), &len); if (err) goto close_peer0; - c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0); - if (c0 < 0) + c0 = xsocket(family, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (c0 < 0) { + err = c0; goto close_peer0; + } err = xconnect(c0, sockaddr(&addr), len); if (err) goto close_cli0; @@ -1640,35 +1723,131 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, if (err) goto close_cli0; - p1 = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (p1 < 0) - goto close_cli0; - err = xgetsockname(p1, sockaddr(&addr), &len); + *s = p0; + *c = c0; + return 0; + +close_cli0: + 
xclose(c0); +close_peer0: + xclose(p0); + return err; +} + +static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, + enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, p1; + unsigned int pass; + int retries = 100; + int err, n; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + err = udp_socketpair(family, &p0, &c0); + if (err) + return; + err = udp_socketpair(family, &p1, &c1); if (err) goto close_cli0; - c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0); - if (c1 < 0) - goto close_peer1; - err = xconnect(c1, sockaddr(&addr), len); + err = add_to_sockmap(sock_mapfd, p0, p1); if (err) goto close_cli1; - err = xgetsockname(c1, sockaddr(&addr), &len); - if (err) + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) goto close_cli1; - err = xconnect(p1, sockaddr(&addr), len); + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); if (err) goto close_cli1; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); - key = 0; - value = p0; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); +again: + n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); + if (n < 0) { + if (errno == EAGAIN && retries--) + goto again; + FAIL_ERRNO("%s: read", log_prefix); + } + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close_cli1: + xclose(c1); + xclose(p1); +close_cli0: + xclose(c0); + xclose(p0); +} + +static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); if (err) - goto close_cli1; + return; - key = 1; - value = p1; - err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + skel->bss->test_ingress = false; + udp_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS); + skel->bss->test_ingress = true; + udp_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) +{ + const char *family_name, *map_name; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); + if (!test__start_subtest(s)) + return; + udp_skb_redir_to_connected(skel, map, family); +} + +static void udp_unix_redir_to_connected(int family, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, p1; + unsigned int pass; + int retries = 100; + int err, n; + int sfd[2]; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + return; + c0 = sfd[0], p0 = sfd[1]; + + err = udp_socketpair(family, &p1, &c1); + if (err) + goto close; + + err = add_to_sockmap(sock_mapfd, p0, p1); if (err) goto close_cli1; @@ -1699,16 +1878,89 @@ again: close_cli1: xclose(c1); -close_peer1: + xclose(p1); +close: + xclose(c0); + xclose(p0); +} + +static void udp_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) +{ + int verdict = 
bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); + if (err) + return; + + skel->bss->test_ingress = false; + udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS); + skel->bss->test_ingress = true; + udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void unix_udp_redir_to_connected(int family, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + int c0, c1, p0, p1; + unsigned int pass; + int err, n; + int sfd[2]; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + err = udp_socketpair(family, &p0, &c0); + if (err) + return; + + if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + goto close_cli0; + c1 = sfd[0], p1 = sfd[1]; + + err = add_to_sockmap(sock_mapfd, p0, p1); + if (err) + goto close; + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); + if (n < 0) + FAIL_ERRNO("%s: read", log_prefix); + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close: + xclose(c1); xclose(p1); close_cli0: xclose(c0); -close_peer0: xclose(p0); + } -static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, - struct bpf_map *inner_map, int family) +static void unix_udp_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) { int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); int verdict_map = bpf_map__fd(skel->maps.verdict_map); @@ -1720,17 +1972,15 @@ static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, return; skel->bss->test_ingress = false; - udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, - REDIR_EGRESS); + unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS); skel->bss->test_ingress = true; - udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, - REDIR_INGRESS); + unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS); xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } -static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, - int family) +static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) { const char *family_name, *map_name; char s[MAX_TEST_NAME]; @@ -1740,7 +1990,8 @@ static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); if (!test__start_subtest(s)) return; - udp_skb_redir_to_connected(skel, map, family); + udp_unix_skb_redir_to_connected(skel, map, family); + unix_udp_skb_redir_to_connected(skel, map, family); } static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, @@ -1752,6 +2003,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, test_reuseport(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_DGRAM); test_udp_redir(skel, map, family); + test_udp_unix_redir(skel, map, family); } 
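The AF_UNIX and UDP redirection tests above all funnel through the same two-slot sockmap layout. As a reading aid, here is a minimal sketch of what the shared add_to_sockmap() helper has to do (hypothetical helper name; it mirrors the open-coded bpf_map_update_elem() calls this patch deletes from udp_redir_to_connected()):

::

    /* Place the two peer sockets into slots 0 and 1 of the sockmap
     * (or sockhash) so the verdict program can redirect between them.
     * Sockmap values are u64-sized socket fds keyed by a u32 index.
     */
    #include <linux/bpf.h>
    #include <bpf/bpf.h>

    static int add_pair_to_sockmap(int sock_mapfd, int p0, int p1)
    {
        __u64 value;
        __u32 key;
        int err;

        key = 0;
        value = p0;
        err = bpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
        if (err)
            return err;

        key = 1;
        value = p1;
        return bpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
    }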
void test_sockmap_listen(void) @@ -1767,10 +2019,12 @@ void test_sockmap_listen(void) skel->bss->test_sockmap = true; run_tests(skel, skel->maps.sock_map, AF_INET); run_tests(skel, skel->maps.sock_map, AF_INET6); + test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM); skel->bss->test_sockmap = false; run_tests(skel, skel->maps.sock_hash, AF_INET); run_tests(skel, skel->maps.sock_hash, AF_INET6); + test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM); test_sockmap_listen__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 5703c918812b..e7201ba29ccd 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -13,15 +13,16 @@ #define _GNU_SOURCE #include <arpa/inet.h> +#include <linux/if.h> +#include <linux/if_tun.h> #include <linux/limits.h> #include <linux/sysctl.h> -#include <linux/if_tun.h> -#include <linux/if.h> #include <sched.h> #include <stdbool.h> #include <stdio.h> -#include <sys/stat.h> #include <sys/mount.h> +#include <sys/stat.h> +#include <unistd.h> #include "test_progs.h" #include "network_helpers.h" @@ -391,9 +392,7 @@ done: static int test_ping(int family, const char *addr) { - const char *ping = family == AF_INET6 ? "ping6" : "ping"; - - SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping, addr); + SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping_command(family), addr); return 0; fail: return -1; } diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c new file mode 100644 index 000000000000..25f40e1b9967 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include "timer.skel.h" + +static int timer(struct timer *timer_skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + + err = timer__attach(timer_skel); + if (!ASSERT_OK(err, "timer_attach")) + return err; + + ASSERT_EQ(timer_skel->data->callback_check, 52, "callback_check1"); + ASSERT_EQ(timer_skel->data->callback2_check, 52, "callback2_check1"); + + prog_fd = bpf_program__fd(timer_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); + timer__detach(timer_skel); + + usleep(50); /* 10 usecs should be enough, but give it extra */ + /* check that timer_cb2() was executed 10+10 times */ + ASSERT_EQ(timer_skel->data->callback_check, 42, "callback_check2"); + ASSERT_EQ(timer_skel->data->callback2_check, 42, "callback2_check2"); + + /* check that timer_cb1() was executed twice */ + ASSERT_EQ(timer_skel->bss->bss_data, 10, "bss_data"); + + /* check that there were no errors in timer execution */ + ASSERT_EQ(timer_skel->bss->err, 0, "err"); + + /* check that code paths completed */ + ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); + + return 0; +} + +void test_timer(void) +{ + struct timer *timer_skel = NULL; + int err; + + timer_skel = timer__open_and_load(); + if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) + goto cleanup; + + err = timer(timer_skel); + ASSERT_OK(err, "timer"); +cleanup: + timer__destroy(timer_skel); +}
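Both timer tests drive the BPF side the same way: attach the skeleton, then fire the fentry hook once through BPF_PROG_TEST_RUN. A hedged sketch of that pattern, assuming the timer.skel.h skeleton from this patch (error handling trimmed):

::

    #include <test_progs.h>
    #include "timer.skel.h"

    /* Run the fentry program "test1" exactly once. The test run of
     * bpf_fentry_test1 is what arms the first bpf_timer; everything
     * after that happens asynchronously in timer callbacks.
     */
    static int run_test1_once(struct timer *skel)
    {
        __u32 duration = 0, retval = 0;
        int prog_fd = bpf_program__fd(skel->progs.test1);

        return bpf_prog_test_run(prog_fd, 1, NULL, 0,
                                 NULL, NULL, &retval, &duration);
    }

diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c new file mode 100644 index 000000000000..f5acbcbe33a4 --- /dev/null +++ 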
b/tools/testing/selftests/bpf/prog_tests/timer_mim.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include "timer_mim.skel.h" +#include "timer_mim_reject.skel.h" + +static int timer_mim(struct timer_mim *timer_skel) +{ + __u32 duration = 0, retval; + __u64 cnt1, cnt2; + int err, prog_fd, key1 = 1; + + err = timer_mim__attach(timer_skel); + if (!ASSERT_OK(err, "timer_attach")) + return err; + + prog_fd = bpf_program__fd(timer_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, &retval, &duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); + timer_mim__detach(timer_skel); + + /* check that timer_cb[12] are incrementing 'cnt' */ + cnt1 = READ_ONCE(timer_skel->bss->cnt); + usleep(200); /* 100 times more than interval */ + cnt2 = READ_ONCE(timer_skel->bss->cnt); + ASSERT_GT(cnt2, cnt1, "cnt"); + + ASSERT_EQ(timer_skel->bss->err, 0, "err"); + /* check that code paths completed */ + ASSERT_EQ(timer_skel->bss->ok, 1 | 2, "ok"); + + close(bpf_map__fd(timer_skel->maps.inner_htab)); + err = bpf_map_delete_elem(bpf_map__fd(timer_skel->maps.outer_arr), &key1); + ASSERT_EQ(err, 0, "delete inner map"); + + /* check that timer_cb[12] are no longer running */ + cnt1 = READ_ONCE(timer_skel->bss->cnt); + usleep(200); + cnt2 = READ_ONCE(timer_skel->bss->cnt); + ASSERT_EQ(cnt2, cnt1, "cnt"); + + return 0; +} + +void test_timer_mim(void) +{ + struct timer_mim_reject *timer_reject_skel = NULL; + libbpf_print_fn_t old_print_fn = NULL; + struct timer_mim *timer_skel = NULL; + int err; + + old_print_fn = libbpf_set_print(NULL); + timer_reject_skel = timer_mim_reject__open_and_load(); + libbpf_set_print(old_print_fn); + if (!ASSERT_ERR_PTR(timer_reject_skel, "timer_reject_skel_load")) + goto cleanup; + + timer_skel = timer_mim__open_and_load(); + if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) + goto cleanup; + + err = timer_mim(timer_skel); + ASSERT_OK(err, "timer_mim"); +cleanup: + timer_mim__destroy(timer_skel); + timer_mim_reject__destroy(timer_reject_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c new file mode 100644 index 000000000000..6b186b4238d0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: GPL-2.0 + +/** + * Test XDP bonding support + * + * Sets up two bonded veth pairs between two fresh namespaces + * and verifies that an XDP_TX program loaded on a bond device + * is correctly loaded onto the slave devices and that XDP_TX'd + * packets are balanced using bonding.
+ */ + +#define _GNU_SOURCE +#include <sched.h> +#include <net/if.h> +#include <linux/if_link.h> +#include "test_progs.h" +#include "network_helpers.h" +#include <linux/if_bonding.h> +#include <linux/limits.h> +#include <linux/udp.h> + +#include "xdp_dummy.skel.h" +#include "xdp_redirect_multi_kern.skel.h" +#include "xdp_tx.skel.h" + +#define BOND1_MAC {0x00, 0x11, 0x22, 0x33, 0x44, 0x55} +#define BOND1_MAC_STR "00:11:22:33:44:55" +#define BOND2_MAC {0x00, 0x22, 0x33, 0x44, 0x55, 0x66} +#define BOND2_MAC_STR "00:22:33:44:55:66" +#define NPACKETS 100 + +static int root_netns_fd = -1; + +static void restore_root_netns(void) +{ + ASSERT_OK(setns(root_netns_fd, CLONE_NEWNET), "restore_root_netns"); +} + +static int setns_by_name(char *name) +{ + int nsfd, err; + char nspath[PATH_MAX]; + + snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); + nsfd = open(nspath, O_RDONLY | O_CLOEXEC); + if (nsfd < 0) + return -1; + + err = setns(nsfd, CLONE_NEWNET); + close(nsfd); + return err; +} + +static int get_rx_packets(const char *iface) +{ + FILE *f; + char line[512]; + int iface_len = strlen(iface); + + f = fopen("/proc/net/dev", "r"); + if (!f) + return -1; + + while (fgets(line, sizeof(line), f)) { + char *p = line; + + while (*p == ' ') + p++; /* skip whitespace */ + if (!strncmp(p, iface, iface_len)) { + p += iface_len; + if (*p++ != ':') + continue; + while (*p == ' ') + p++; /* skip whitespace */ + while (*p && *p != ' ') + p++; /* skip rx bytes */ + while (*p == ' ') + p++; /* skip whitespace */ + fclose(f); + return atoi(p); + } + } + fclose(f); + return -1; +} + +#define MAX_BPF_LINKS 8 + +struct skeletons { + struct xdp_dummy *xdp_dummy; + struct xdp_tx *xdp_tx; + struct xdp_redirect_multi_kern *xdp_redirect_multi_kern; + + int nlinks; + struct bpf_link *links[MAX_BPF_LINKS]; +}; + +static int xdp_attach(struct skeletons *skeletons, struct bpf_program *prog, char *iface) +{ + struct bpf_link *link; + int ifindex; + + ifindex = if_nametoindex(iface); + if (!ASSERT_GT(ifindex, 0, "get ifindex")) + return -1; + + if (!ASSERT_LE(skeletons->nlinks+1, MAX_BPF_LINKS, "too many XDP programs attached")) + return -1; + + link = bpf_program__attach_xdp(prog, ifindex); + if (!ASSERT_OK_PTR(link, "attach xdp program")) + return -1; + + skeletons->links[skeletons->nlinks++] = link; + return 0; +} + +enum { + BOND_ONE_NO_ATTACH = 0, + BOND_BOTH_AND_ATTACH, +}; + +static const char * const mode_names[] = { + [BOND_MODE_ROUNDROBIN] = "balance-rr", + [BOND_MODE_ACTIVEBACKUP] = "active-backup", + [BOND_MODE_XOR] = "balance-xor", + [BOND_MODE_BROADCAST] = "broadcast", + [BOND_MODE_8023AD] = "802.3ad", + [BOND_MODE_TLB] = "balance-tlb", + [BOND_MODE_ALB] = "balance-alb", +}; + +static const char * const xmit_policy_names[] = { + [BOND_XMIT_POLICY_LAYER2] = "layer2", + [BOND_XMIT_POLICY_LAYER34] = "layer3+4", + [BOND_XMIT_POLICY_LAYER23] = "layer2+3", + [BOND_XMIT_POLICY_ENCAP23] = "encap2+3", + [BOND_XMIT_POLICY_ENCAP34] = "encap3+4", +}; + +static int bonding_setup(struct skeletons *skeletons, int mode, int xmit_policy, + int bond_both_attach) +{ +#define SYS(fmt, ...) 
\ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + if (!ASSERT_OK(system(cmd), cmd)) \ + return -1; \ + }) + + SYS("ip netns add ns_dst"); + SYS("ip link add veth1_1 type veth peer name veth2_1 netns ns_dst"); + SYS("ip link add veth1_2 type veth peer name veth2_2 netns ns_dst"); + + SYS("ip link add bond1 type bond mode %s xmit_hash_policy %s", + mode_names[mode], xmit_policy_names[xmit_policy]); + SYS("ip link set bond1 up address " BOND1_MAC_STR " addrgenmode none"); + SYS("ip -netns ns_dst link add bond2 type bond mode %s xmit_hash_policy %s", + mode_names[mode], xmit_policy_names[xmit_policy]); + SYS("ip -netns ns_dst link set bond2 up address " BOND2_MAC_STR " addrgenmode none"); + + SYS("ip link set veth1_1 master bond1"); + if (bond_both_attach == BOND_BOTH_AND_ATTACH) { + SYS("ip link set veth1_2 master bond1"); + } else { + SYS("ip link set veth1_2 up addrgenmode none"); + + if (xdp_attach(skeletons, skeletons->xdp_dummy->progs.xdp_dummy_prog, "veth1_2")) + return -1; + } + + SYS("ip -netns ns_dst link set veth2_1 master bond2"); + + if (bond_both_attach == BOND_BOTH_AND_ATTACH) + SYS("ip -netns ns_dst link set veth2_2 master bond2"); + else + SYS("ip -netns ns_dst link set veth2_2 up addrgenmode none"); + + /* Load a dummy program on the sending side as the veth peer needs to + * have an XDP program loaded as well. + */ + if (xdp_attach(skeletons, skeletons->xdp_dummy->progs.xdp_dummy_prog, "bond1")) + return -1; + + if (bond_both_attach == BOND_BOTH_AND_ATTACH) { + if (!ASSERT_OK(setns_by_name("ns_dst"), "set netns to ns_dst")) + return -1; + + if (xdp_attach(skeletons, skeletons->xdp_tx->progs.xdp_tx, "bond2")) + return -1; + + restore_root_netns(); + } + + return 0; + +#undef SYS +} + +static void bonding_cleanup(struct skeletons *skeletons) +{ + restore_root_netns(); + while (skeletons->nlinks) { + skeletons->nlinks--; + bpf_link__destroy(skeletons->links[skeletons->nlinks]); + } + ASSERT_OK(system("ip link delete bond1"), "delete bond1"); + ASSERT_OK(system("ip link delete veth1_1"), "delete veth1_1"); + ASSERT_OK(system("ip link delete veth1_2"), "delete veth1_2"); + ASSERT_OK(system("ip netns delete ns_dst"), "delete ns_dst"); +} + +static int send_udp_packets(int vary_dst_ip) +{ + struct ethhdr eh = { + .h_source = BOND1_MAC, + .h_dest = BOND2_MAC, + .h_proto = htons(ETH_P_IP), + }; + uint8_t buf[128] = {}; + struct iphdr *iph = (struct iphdr *)(buf + sizeof(eh)); + struct udphdr *uh = (struct udphdr *)(buf + sizeof(eh) + sizeof(*iph)); + int i, s = -1; + int ifindex; + + s = socket(AF_PACKET, SOCK_RAW, IPPROTO_RAW); + if (!ASSERT_GE(s, 0, "socket")) + goto err; + + ifindex = if_nametoindex("bond1"); + if (!ASSERT_GT(ifindex, 0, "get bond1 ifindex")) + goto err; + + memcpy(buf, &eh, sizeof(eh)); + iph->ihl = 5; + iph->version = 4; + iph->tos = 16; + iph->id = 1; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->saddr = 1; + iph->daddr = 2; + iph->tot_len = htons(sizeof(buf) - ETH_HLEN); + iph->check = 0; + + for (i = 1; i <= NPACKETS; i++) { + int n; + struct sockaddr_ll saddr_ll = { + .sll_ifindex = ifindex, + .sll_halen = ETH_ALEN, + .sll_addr = BOND2_MAC, + }; + + /* vary the UDP destination port for even distribution with roundrobin/xor modes */ + uh->dest++; + + if (vary_dst_ip) + iph->daddr++; + + n = sendto(s, buf, sizeof(buf), 0, (struct sockaddr *)&saddr_ll, sizeof(saddr_ll)); + if (!ASSERT_EQ(n, sizeof(buf), "sendto")) + goto err; + } + + return 0; + +err: + if (s >= 0) + close(s); + return -1; +}
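send_udp_packets() deliberately leaves iph->check at zero: nothing on the XDP/veth path in this test validates it. If a stricter receiver were ever added, the standard RFC 1071 header checksum could be filled in; a sketch under that assumption (hypothetical helper, not part of the patch):

::

    #include <linux/ip.h>

    /* Sum the IPv4 header as 16-bit words, fold the carries and
     * return the one's-complement result for iph->check.
     */
    static __u16 ipv4_csum(struct iphdr *iph)
    {
        const __u16 *p = (const __u16 *)iph;
        __u32 sum = 0;
        int i;

        iph->check = 0;
        for (i = 0; i < iph->ihl * 2; i++)
            sum += p[i];
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (__u16)~sum;
    }

It would be used as iph->check = ipv4_csum(iph); once the other header fields are set.

+ +static void 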
test_xdp_bonding_with_mode(struct skeletons *skeletons, int mode, int xmit_policy) +{ + int bond1_rx; + + if (bonding_setup(skeletons, mode, xmit_policy, BOND_BOTH_AND_ATTACH)) + goto out; + + if (send_udp_packets(xmit_policy != BOND_XMIT_POLICY_LAYER34)) + goto out; + + bond1_rx = get_rx_packets("bond1"); + ASSERT_EQ(bond1_rx, NPACKETS, "expected more received packets"); + + switch (mode) { + case BOND_MODE_ROUNDROBIN: + case BOND_MODE_XOR: { + int veth1_rx = get_rx_packets("veth1_1"); + int veth2_rx = get_rx_packets("veth1_2"); + int diff = abs(veth1_rx - veth2_rx); + + ASSERT_GE(veth1_rx + veth2_rx, NPACKETS, "expected more packets"); + + switch (xmit_policy) { + case BOND_XMIT_POLICY_LAYER2: + ASSERT_GE(diff, NPACKETS, + "expected packets on only one of the interfaces"); + break; + case BOND_XMIT_POLICY_LAYER23: + case BOND_XMIT_POLICY_LAYER34: + ASSERT_LT(diff, NPACKETS/2, + "expected even distribution of packets"); + break; + default: + PRINT_FAIL("Unimplemented xmit_policy=%d\n", xmit_policy); + break; + } + break; + } + case BOND_MODE_ACTIVEBACKUP: { + int veth1_rx = get_rx_packets("veth1_1"); + int veth2_rx = get_rx_packets("veth1_2"); + int diff = abs(veth1_rx - veth2_rx); + + ASSERT_GE(diff, NPACKETS, + "expected packets on only one of the interfaces"); + break; + } + default: + PRINT_FAIL("Unimplemented mode=%d\n", mode); + break; + } + +out: + bonding_cleanup(skeletons); +} + +/* Test the broadcast redirection using xdp_redirect_map_multi_prog: add all + * the interfaces to its devmap and check that broadcasting won't send the + * packet back to either the ingress bond device (bond2) or its slave (veth2_1). + */ +static void test_xdp_bonding_redirect_multi(struct skeletons *skeletons) +{ + static const char * const ifaces[] = {"bond2", "veth2_1", "veth2_2"}; + int veth1_1_rx, veth1_2_rx; + int err; + + if (bonding_setup(skeletons, BOND_MODE_ROUNDROBIN, BOND_XMIT_POLICY_LAYER23, + BOND_ONE_NO_ATTACH)) + goto out; + + + if (!ASSERT_OK(setns_by_name("ns_dst"), "could not set netns to ns_dst")) + goto out; + + /* populate the devmap with the relevant interfaces */ + for (int i = 0; i < ARRAY_SIZE(ifaces); i++) { + int ifindex = if_nametoindex(ifaces[i]); + int map_fd = bpf_map__fd(skeletons->xdp_redirect_multi_kern->maps.map_all); + + if (!ASSERT_GT(ifindex, 0, "could not get interface index")) + goto out; + + err = bpf_map_update_elem(map_fd, &ifindex, &ifindex, 0); + if (!ASSERT_OK(err, "add interface to map_all")) + goto out; + } + + if (xdp_attach(skeletons, + skeletons->xdp_redirect_multi_kern->progs.xdp_redirect_map_multi_prog, + "bond2")) + goto out; + + restore_root_netns(); + + if (send_udp_packets(BOND_MODE_ROUNDROBIN)) + goto out; + + veth1_1_rx = get_rx_packets("veth1_1"); + veth1_2_rx = get_rx_packets("veth1_2"); + + ASSERT_EQ(veth1_1_rx, 0, "expected no packets on veth1_1"); + ASSERT_GE(veth1_2_rx, NPACKETS, "expected packets on veth1_2"); + +out: + restore_root_netns(); + bonding_cleanup(skeletons); +}
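The broadcast behaviour this subtest checks comes from the devmap redirect flags, not from the bonding driver itself. A sketch of the kind of program xdp_redirect_multi_kern provides as xdp_redirect_map_multi_prog (the map layout here is illustrative; only the flags matter):

::

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));
        __uint(max_entries, 32);
    } map_all SEC(".maps");

    SEC("xdp")
    int xdp_broadcast(struct xdp_md *ctx)
    {
        /* Replicate the frame to every device in map_all except the
         * one it arrived on; the key is ignored with BPF_F_BROADCAST.
         */
        return bpf_redirect_map(&map_all, 0,
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
    }

    char _license[] SEC("license") = "GPL";

BPF_F_EXCLUDE_INGRESS is what the assertions rely on: veth1_1, sitting behind the ingress side, must see nothing while veth1_2 receives the broadcast copies.

+ +/* Test that XDP programs cannot be attached to both the bond master and slaves simultaneously */ +static void test_xdp_bonding_attach(struct skeletons *skeletons) +{ + struct bpf_link *link = NULL; + struct bpf_link *link2 = NULL; + int veth, bond; + int err; + + if (!ASSERT_OK(system("ip link add veth type veth"), "add veth")) + goto out; + if (!ASSERT_OK(system("ip link add bond type bond"), "add bond")) + goto out; + + veth = if_nametoindex("veth"); + if (!ASSERT_GE(veth, 0, "if_nametoindex veth")) + goto out; + bond = if_nametoindex("bond"); + if 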
(!ASSERT_GE(bond, 0, "if_nametoindex bond")) + goto out; + + /* enslaving with an XDP program loaded fails */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); + if (!ASSERT_OK_PTR(link, "attach program to veth")) + goto out; + + err = system("ip link set veth master bond"); + if (!ASSERT_NEQ(err, 0, "attaching slave with xdp program expected to fail")) + goto out; + + bpf_link__destroy(link); + link = NULL; + + err = system("ip link set veth master bond"); + if (!ASSERT_OK(err, "set veth master")) + goto out; + + /* attaching to slave when master has no program is allowed */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); + if (!ASSERT_OK_PTR(link, "attach program to slave when enslaved")) + goto out; + + /* attaching to master not allowed when slave has program loaded */ + link2 = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + if (!ASSERT_ERR_PTR(link2, "attach program to master when slave has program")) + goto out; + + bpf_link__destroy(link); + link = NULL; + + /* attaching XDP program to master allowed when slave has no program */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + if (!ASSERT_OK_PTR(link, "attach program to master")) + goto out; + + /* attaching to slave not allowed when master has program loaded */ + link2 = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); + ASSERT_ERR_PTR(link2, "attach program to slave when master has program"); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + + system("ip link del veth"); + system("ip link del bond"); +} + +static int libbpf_debug_print(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (level != LIBBPF_WARN) + vprintf(format, args); + return 0; +} + +struct bond_test_case { + char *name; + int mode; + int xmit_policy; +}; + +static struct bond_test_case bond_test_cases[] = { + { "xdp_bonding_roundrobin", BOND_MODE_ROUNDROBIN, BOND_XMIT_POLICY_LAYER23, }, + { "xdp_bonding_activebackup", BOND_MODE_ACTIVEBACKUP, BOND_XMIT_POLICY_LAYER23 }, + + { "xdp_bonding_xor_layer2", BOND_MODE_XOR, BOND_XMIT_POLICY_LAYER2, }, + { "xdp_bonding_xor_layer23", BOND_MODE_XOR, BOND_XMIT_POLICY_LAYER23, }, + { "xdp_bonding_xor_layer34", BOND_MODE_XOR, BOND_XMIT_POLICY_LAYER34, }, +}; + +void test_xdp_bonding(void) +{ + libbpf_print_fn_t old_print_fn; + struct skeletons skeletons = {}; + int i; + + old_print_fn = libbpf_set_print(libbpf_debug_print); + + root_netns_fd = open("/proc/self/ns/net", O_RDONLY); + if (!ASSERT_GE(root_netns_fd, 0, "open /proc/self/ns/net")) + goto out; + + skeletons.xdp_dummy = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_dummy, "xdp_dummy__open_and_load")) + goto out; + + skeletons.xdp_tx = xdp_tx__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_tx, "xdp_tx__open_and_load")) + goto out; + + skeletons.xdp_redirect_multi_kern = xdp_redirect_multi_kern__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_redirect_multi_kern, + "xdp_redirect_multi_kern__open_and_load")) + goto out; + + if (test__start_subtest("xdp_bonding_attach")) + test_xdp_bonding_attach(&skeletons); + + for (i = 0; i < ARRAY_SIZE(bond_test_cases); i++) { + struct bond_test_case *test_case = &bond_test_cases[i]; + + if (test__start_subtest(test_case->name)) + test_xdp_bonding_with_mode( + &skeletons, + test_case->mode, + test_case->xmit_policy); + } + + if (test__start_subtest("xdp_bonding_redirect_multi")) + 
test_xdp_bonding_redirect_multi(&skeletons); + +out: + xdp_dummy__destroy(skeletons.xdp_dummy); + xdp_tx__destroy(skeletons.xdp_tx); + xdp_redirect_multi_kern__destroy(skeletons.xdp_redirect_multi_kern); + + libbpf_set_print(old_print_fn); + if (root_netns_fd >= 0) + close(root_netns_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c new file mode 100644 index 000000000000..ab4952b9fb1d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> +#include "test_xdp_context_test_run.skel.h" + +void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts, + __u32 data_meta, __u32 data, __u32 data_end, + __u32 ingress_ifindex, __u32 rx_queue_index, + __u32 egress_ifindex) +{ + struct xdp_md ctx = { + .data = data, + .data_end = data_end, + .data_meta = data_meta, + .ingress_ifindex = ingress_ifindex, + .rx_queue_index = rx_queue_index, + .egress_ifindex = egress_ifindex, + }; + int err; + + opts.ctx_in = &ctx; + opts.ctx_size_in = sizeof(ctx); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_EQ(errno, EINVAL, "errno-EINVAL"); + ASSERT_ERR(err, "bpf_prog_test_run"); +} + +void test_xdp_context_test_run(void) +{ + struct test_xdp_context_test_run *skel = NULL; + char data[sizeof(pkt_v4) + sizeof(__u32)]; + char bad_ctx[sizeof(struct xdp_md) + 1]; + struct xdp_md ctx_in, ctx_out; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data, + .data_size_in = sizeof(data), + .ctx_out = &ctx_out, + .ctx_size_out = sizeof(ctx_out), + .repeat = 1, + ); + int err, prog_fd; + + skel = test_xdp_context_test_run__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + prog_fd = bpf_program__fd(skel->progs.xdp_context); + + /* Data past the end of the kernel's struct xdp_md must be 0 */ + bad_ctx[sizeof(bad_ctx) - 1] = 1; + opts.ctx_in = bad_ctx; + opts.ctx_size_in = sizeof(bad_ctx); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_EQ(errno, E2BIG, "extradata-errno"); + ASSERT_ERR(err, "bpf_prog_test_run(extradata)"); + + *(__u32 *)data = XDP_PASS; + *(struct ipv4_packet *)(data + sizeof(__u32)) = pkt_v4; + opts.ctx_in = &ctx_in; + opts.ctx_size_in = sizeof(ctx_in); + memset(&ctx_in, 0, sizeof(ctx_in)); + ctx_in.data_meta = 0; + ctx_in.data = sizeof(__u32); + ctx_in.data_end = ctx_in.data + sizeof(pkt_v4); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "bpf_prog_test_run(valid)"); + ASSERT_EQ(opts.retval, XDP_PASS, "valid-retval"); + ASSERT_EQ(opts.data_size_out, sizeof(pkt_v4), "valid-datasize"); + ASSERT_EQ(opts.ctx_size_out, opts.ctx_size_in, "valid-ctxsize"); + ASSERT_EQ(ctx_out.data_meta, 0, "valid-datameta"); + ASSERT_EQ(ctx_out.data, 0, "valid-data"); + ASSERT_EQ(ctx_out.data_end, sizeof(pkt_v4), "valid-dataend"); + + /* Meta data's size must be a multiple of 4 */ + test_xdp_context_error(prog_fd, opts, 0, 1, sizeof(data), 0, 0, 0); + + /* data_meta must reference the start of data */ + test_xdp_context_error(prog_fd, opts, 4, sizeof(__u32), sizeof(data), + 0, 0, 0); + + /* Meta data must be 32 bytes or smaller */ + test_xdp_context_error(prog_fd, opts, 0, 36, sizeof(data), 0, 0, 0); + + /* Total size of data must match data_end - data_meta */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), + sizeof(data) - 1, 0, 0, 0); + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), + sizeof(data) + 1, 0, 0, 0); 
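A note on the offsets used in these error cases: in an xdp_md passed via ctx_in, data_meta/data/data_end are offsets into data_in, so "data = sizeof(__u32)" means the first four bytes of the buffer are XDP metadata. A hedged sketch of a minimal valid setup, mirroring the call pattern of this test (prog_fd is a placeholder; pkt_v4 comes from network_helpers.h):

::

    /* Offsets are relative to the start of data_in, and the kernel
     * requires data_end - data_meta == data_size_in.
     */
    char buf[sizeof(__u32) + sizeof(pkt_v4)];
    struct xdp_md ctx_in = {
        .data_meta = 0,             /* metadata starts the buffer */
        .data = sizeof(__u32),      /* packet begins after 4 bytes */
        .data_end = sizeof(buf),
    };
    DECLARE_LIBBPF_OPTS(bpf_test_run_opts, topts,
        .data_in = buf,
        .data_size_in = sizeof(buf),
        .ctx_in = &ctx_in,
        .ctx_size_in = sizeof(ctx_in),
        .repeat = 1,
    );
    int err = bpf_prog_test_run_opts(prog_fd, &topts);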
+ + /* RX queue cannot be specified without specifying an ingress */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), + 0, 1, 0); + + /* Interface 1 is always the loopback interface which always has only + * one RX queue (index 0). This makes index 1 an invalid rx queue index + * for interface 1. + */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), + 1, 1, 0); + + /* The egress cannot be specified */ + test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), + 0, 0, 1); + + test_xdp_context_test_run__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index 0176573fe4e7..8755effd80b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -7,64 +7,53 @@ #define IFINDEX_LO 1 -void test_xdp_with_cpumap_helpers(void) +void test_xdp_cpumap_attach(void) { struct test_xdp_with_cpumap_helpers *skel; struct bpf_prog_info info = {}; + __u32 len = sizeof(info); struct bpf_cpumap_val val = { .qsize = 192, }; - __u32 duration = 0, idx = 0; - __u32 len = sizeof(info); int err, prog_fd, map_fd; + __u32 idx = 0; skel = test_xdp_with_cpumap_helpers__open_and_load(); - if (CHECK_FAIL(!skel)) { - perror("test_xdp_with_cpumap_helpers__open_and_load"); + if (!ASSERT_OK_PTR(skel, "test_xdp_with_cpumap_helpers__open_and_load")) return; - } - /* can not attach program with cpumaps that allow programs - * as xdp generic - */ prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog); err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Generic attach of program with 8-byte CPUMAP", - "should have failed\n"); + if (!ASSERT_OK(err, "Generic attach of program with 8-byte CPUMAP")) + goto out_close; + + err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); + ASSERT_OK(err, "XDP program detach"); prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm); map_fd = bpf_map__fd(skel->maps.cpu_map); err = bpf_obj_get_info_by_fd(prog_fd, &info, &len); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd")) goto out_close; val.bpf_prog.fd = prog_fd; err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err, "Add program to cpumap entry", "err %d errno %d\n", - err, errno); + ASSERT_OK(err, "Add program to cpumap entry"); err = bpf_map_lookup_elem(map_fd, &idx, &val); - CHECK(err, "Read cpumap entry", "err %d errno %d\n", err, errno); - CHECK(info.id != val.bpf_prog.id, "Expected program id in cpumap entry", - "expected %u read %u\n", info.id, val.bpf_prog.id); + ASSERT_OK(err, "Read cpumap entry"); + ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id"); /* can not attach BPF_XDP_CPUMAP program to a device */ err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Attach of BPF_XDP_CPUMAP program", - "should have failed\n"); + if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_CPUMAP program")) + bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); val.qsize = 192; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err == 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry", - "should have failed\n"); + ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); out_close: test_xdp_with_cpumap_helpers__destroy(skel); } - -void test_xdp_cpumap_attach(void) -{ - if (test__start_subtest("cpumap_with_progs")) 
- test_xdp_with_cpumap_helpers(); -} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index 88ef3ec8ac4c..c72af030ff10 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -16,50 +16,45 @@ void test_xdp_with_devmap_helpers(void) .ifindex = IFINDEX_LO, }; __u32 len = sizeof(info); - __u32 duration = 0, idx = 0; int err, dm_fd, map_fd; + __u32 idx = 0; skel = test_xdp_with_devmap_helpers__open_and_load(); - if (CHECK_FAIL(!skel)) { - perror("test_xdp_with_devmap_helpers__open_and_load"); + if (!ASSERT_OK_PTR(skel, "test_xdp_with_devmap_helpers__open_and_load")) return; - } - /* can not attach program with DEVMAPs that allow programs - * as xdp generic - */ dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog); err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Generic attach of program with 8-byte devmap", - "should have failed\n"); + if (!ASSERT_OK(err, "Generic attach of program with 8-byte devmap")) + goto out_close; + + err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); + ASSERT_OK(err, "XDP program detach"); dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm); map_fd = bpf_map__fd(skel->maps.dm_ports); err = bpf_obj_get_info_by_fd(dm_fd, &info, &len); - if (CHECK_FAIL(err)) + if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd")) goto out_close; val.bpf_prog.fd = dm_fd; err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err, "Add program to devmap entry", - "err %d errno %d\n", err, errno); + ASSERT_OK(err, "Add program to devmap entry"); err = bpf_map_lookup_elem(map_fd, &idx, &val); - CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno); - CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry", - "expected %u read %u\n", info.id, val.bpf_prog.id); + ASSERT_OK(err, "Read devmap entry"); + ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to devmap entry prog_id"); /* can not attach BPF_XDP_DEVMAP program to a device */ err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); - CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program", - "should have failed\n"); + if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_DEVMAP program")) + bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE); val.ifindex = 1; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry", - "should have failed\n"); + ASSERT_NEQ(err, 0, "Add non-BPF_XDP_DEVMAP program to devmap entry"); out_close: test_xdp_with_devmap_helpers__destroy(skel); @@ -68,12 +63,10 @@ out_close: void test_neg_xdp_devmap_helpers(void) { struct test_xdp_devmap_helpers *skel; - __u32 duration = 0; skel = test_xdp_devmap_helpers__open_and_load(); - if (CHECK(skel, - "Load of XDP program accessing egress ifindex without attach type", - "should have failed\n")) { + if (!ASSERT_EQ(skel, NULL, + "Load of XDP program accessing egress ifindex without attach type")) { test_xdp_devmap_helpers__destroy(skel); } } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c new file mode 100644 index 000000000000..b77adfd55d73 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "bpf_iter.h" +#include 
"bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define bpf_tcp_sk(skc) ({ \ + struct sock_common *_skc = skc; \ + sk = NULL; \ + tp = NULL; \ + if (_skc) { \ + tp = bpf_skc_to_tcp_sock(_skc); \ + sk = (struct sock *)tp; \ + } \ + tp; \ +}) + +unsigned short reuse_listen_hport = 0; +unsigned short listen_hport = 0; +char cubic_cc[TCP_CA_NAME_MAX] = "bpf_cubic"; +char dctcp_cc[TCP_CA_NAME_MAX] = "bpf_dctcp"; +bool random_retry = false; + +static bool tcp_cc_eq(const char *a, const char *b) +{ + int i; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (a[i] != b[i]) + return false; + if (!a[i]) + break; + } + + return true; +} + +SEC("iter/tcp") +int change_tcp_cc(struct bpf_iter__tcp *ctx) +{ + char cur_cc[TCP_CA_NAME_MAX]; + struct tcp_sock *tp; + struct sock *sk; + int ret; + + if (!bpf_tcp_sk(ctx->sk_common)) + return 0; + + if (sk->sk_family != AF_INET6 || + (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_ESTABLISHED) || + (sk->sk_num != reuse_listen_hport && + sk->sk_num != listen_hport && + bpf_ntohs(sk->sk_dport) != listen_hport)) + return 0; + + if (bpf_getsockopt(tp, SOL_TCP, TCP_CONGESTION, + cur_cc, sizeof(cur_cc))) + return 0; + + if (!tcp_cc_eq(cur_cc, cubic_cc)) + return 0; + + if (random_retry && bpf_get_prandom_u32() % 4 == 1) + return 1; + + bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, dctcp_cc, sizeof(dctcp_cc)); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c index 2e4775c35414..92267abb462f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c @@ -121,7 +121,7 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp, } BPF_SEQ_PRINTF(seq, "%4d: %08X:%04X %08X:%04X ", - seq_num, src, srcp, destp, destp); + seq_num, src, srcp, dest, destp); BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d ", state, tp->write_seq - tp->snd_una, rx_queue, diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 01378911252b..3af0998a0623 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -5,6 +5,10 @@ #define AF_INET 2 #define AF_INET6 10 +#define SOL_TCP 6 +#define TCP_CONGESTION 13 +#define TCP_CA_NAME_MAX 16 + #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 #define ICSK_TIME_LOSS_PROBE 5 @@ -32,6 +36,8 @@ #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr +#define sk_num __sk_common.skc_num +#define sk_dport __sk_common.skc_dport #define sk_family __sk_common.skc_family #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c new file mode 100644 index 000000000000..a587aeca5ae0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +extern const void bpf_fentry_test1 __ksym; +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_modify_return_test __ksym; +extern const void 
bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; + +__u64 test1_result = 0; +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test1_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} + +__u64 test2_result = 0; +SEC("fexit/bpf_fentry_test2") +int BPF_PROG(test2, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test2_result = (const void *) addr == &bpf_fentry_test2; + return 0; +} + +__u64 test3_result = 0; +SEC("kprobe/bpf_fentry_test3") +int test3(struct pt_regs *ctx) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test3_result = (const void *) addr == &bpf_fentry_test3; + return 0; +} + +__u64 test4_result = 0; +SEC("kretprobe/bpf_fentry_test4") +int BPF_KRETPROBE(test4) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test4_result = (const void *) addr == &bpf_fentry_test4; + return 0; +} + +__u64 test5_result = 0; +SEC("fmod_ret/bpf_modify_return_test") +int BPF_PROG(test5, int a, int *b, int ret) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test5_result = (const void *) addr == &bpf_modify_return_test; + return ret; +} + +__u64 test6_result = 0; +SEC("kprobe/bpf_fentry_test6+0x5") +int test6(struct pt_regs *ctx) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test6_result = (const void *) addr == &bpf_fentry_test6 + 5; + return 0; +} + +__u64 test7_result = 0; +SEC("kprobe/bpf_fentry_test7+5") +int test7(struct pt_regs *ctx) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test7_result = (const void *) addr == &bpf_fentry_test7 + 5; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/netcnt_prog.c b/tools/testing/selftests/bpf/progs/netcnt_prog.c index d071adf178bd..43649bce4c54 100644 --- a/tools/testing/selftests/bpf/progs/netcnt_prog.c +++ b/tools/testing/selftests/bpf/progs/netcnt_prog.c @@ -13,21 +13,21 @@ struct { __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct percpu_net_cnt); + __type(value, union percpu_net_cnt); } percpu_netcnt SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct net_cnt); + __type(value, union net_cnt); } netcnt SEC(".maps"); SEC("cgroup/skb") int bpf_nextcnt(struct __sk_buff *skb) { - struct percpu_net_cnt *percpu_cnt; + union percpu_net_cnt *percpu_cnt; char fmt[] = "%d %llu %llu\n"; - struct net_cnt *cnt; + union net_cnt *cnt; __u64 ts, dt; int ret; diff --git a/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c b/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c new file mode 100644 index 000000000000..703c08e06442 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Isovalent, Inc. 
*/ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct inner { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, int); + __uint(max_entries, 4); +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 0); /* This will make map creation fail */ + __uint(key_size, sizeof(__u32)); + __array(values, struct inner); +} mim SEC(".maps"); + +SEC("xdp") +int xdp_noop0(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index e83d0b48d80c..8249075f088f 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -91,7 +91,7 @@ int bpf_sk_lookup_test1(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_use_after_free") +SEC("classifier/err_use_after_free") int bpf_sk_lookup_uaf(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -106,7 +106,7 @@ int bpf_sk_lookup_uaf(struct __sk_buff *skb) return family; } -SEC("classifier/fail_modify_sk_pointer") +SEC("classifier/err_modify_sk_pointer") int bpf_sk_lookup_modptr(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -121,7 +121,7 @@ int bpf_sk_lookup_modptr(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_modify_sk_or_null_pointer") +SEC("classifier/err_modify_sk_or_null_pointer") int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -135,7 +135,7 @@ int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_no_release") +SEC("classifier/err_no_release") int bpf_sk_lookup_test2(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -144,7 +144,7 @@ int bpf_sk_lookup_test2(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_release_twice") +SEC("classifier/err_release_twice") int bpf_sk_lookup_test3(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -156,7 +156,7 @@ int bpf_sk_lookup_test3(struct __sk_buff *skb) return 0; } -SEC("classifier/fail_release_unchecked") +SEC("classifier/err_release_unchecked") int bpf_sk_lookup_test4(struct __sk_buff *skb) { struct bpf_sock_tuple tuple = {}; @@ -173,7 +173,7 @@ void lookup_no_release(struct __sk_buff *skb) bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0); } -SEC("classifier/fail_no_release_subcall") +SEC("classifier/err_no_release_subcall") int bpf_sk_lookup_test5(struct __sk_buff *skb) { lookup_no_release(skb); diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 84cd63259554..a0e7762b1e5a 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -528,7 +528,6 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb) static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { - char buf[sizeof(struct v6hdr)]; struct gre_hdr greh; struct udphdr udph; int olen = len; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_context_test_run.c b/tools/testing/selftests/bpf/progs/test_xdp_context_test_run.c new file mode 100644 index 000000000000..d7b88cd05afd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_context_test_run.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("xdp") +int xdp_context(struct xdp_md *xdp) +{ + void *data = (void 
*)(long)xdp->data; + __u32 *metadata = (void *)(long)xdp->data_meta; + __u32 ret; + + if (metadata + 1 > data) + return XDP_ABORTED; + ret = *metadata; + if (bpf_xdp_adjust_meta(xdp, 4)) + return XDP_ABORTED; + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c new file mode 100644 index 000000000000..5f5309791649 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <time.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +struct hmap_elem { + int counter; + struct bpf_timer timer; + struct bpf_spin_lock lock; /* unused */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap_malloc SEC(".maps"); + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +__u64 bss_data; +__u64 err; +__u64 ok; +__u64 callback_check = 52; +__u64 callback2_check = 52; + +#define ARRAY 1 +#define HTAB 2 +#define HTAB_MALLOC 3 +#define LRU 4 + +/* callback for array and lru timers */ +static int timer_cb1(void *map, int *key, struct bpf_timer *timer) +{ + /* increment bss variable twice. + * Once via array timer callback and once via lru timer callback + */ + bss_data += 5; + + /* *key == 1 - the callback was called for array timer. + * *key == 4 - the callback was called from lru timer. + */ + if (*key == ARRAY) { + struct bpf_timer *lru_timer; + int lru_key = LRU; + + /* rearm array timer to be called again in ~35 seconds */ + if (bpf_timer_start(timer, 1ull << 35, 0) != 0) + err |= 1; + + lru_timer = bpf_map_lookup_elem(&lru, &lru_key); + if (!lru_timer) + return 0; + bpf_timer_set_callback(lru_timer, timer_cb1); + if (bpf_timer_start(lru_timer, 0, 0) != 0) + err |= 2; + } else if (*key == LRU) { + int lru_key, i; + + for (i = LRU + 1; + i <= 100 /* for current LRU eviction algorithm this number + * should be larger than ~ lru->max_entries * 2 + */; + i++) { + struct elem init = {}; + + /* lru_key cannot be used as loop induction variable + * otherwise the loop will be unbounded. 
+ */ + lru_key = i; + + /* add more elements into lru map to push out current + * element and force deletion of this timer + */ + bpf_map_update_elem(map, &lru_key, &init, 0); + /* look it up to bump it into active list */ + bpf_map_lookup_elem(map, &lru_key); + + /* keep adding until *key changes underneath, + * which means that key/timer memory was reused + */ + if (*key != LRU) + break; + } + + /* check that the timer was removed */ + if (bpf_timer_cancel(timer) != -EINVAL) + err |= 4; + ok |= 1; + } + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + struct bpf_timer *arr_timer, *lru_timer; + struct elem init = {}; + int lru_key = LRU; + int array_key = ARRAY; + + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC); + + bpf_map_update_elem(&lru, &lru_key, &init, 0); + lru_timer = bpf_map_lookup_elem(&lru, &lru_key); + if (!lru_timer) + return 0; + bpf_timer_init(lru_timer, &lru, CLOCK_MONOTONIC); + + bpf_timer_set_callback(arr_timer, timer_cb1); + bpf_timer_start(arr_timer, 0 /* call timer_cb1 asap */, 0); + + /* init more timers to check that array destruction + * doesn't leak timer memory. + */ + array_key = 0; + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC); + return 0; +} + +/* callback for prealloc and non-prealloc hashtab timers */ +static int timer_cb2(void *map, int *key, struct hmap_elem *val) +{ + if (*key == HTAB) + callback_check--; + else + callback2_check--; + if (val->counter > 0 && --val->counter) { + /* re-arm the timer to execute again after 1 usec */ + bpf_timer_start(&val->timer, 1000, 0); + } else if (*key == HTAB) { + struct bpf_timer *arr_timer; + int array_key = ARRAY; + + /* cancel arr_timer otherwise bpf_fentry_test1 prog + * will stay alive forever. + */ + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + if (bpf_timer_cancel(arr_timer) != 1) + /* bpf_timer_cancel should return 1 to indicate + * that arr_timer was active at this time + */ + err |= 8; + + /* try to cancel ourself. It shouldn't deadlock. */ + if (bpf_timer_cancel(&val->timer) != -EDEADLK) + err |= 16; + + /* delete this key and this timer anyway. + * It shouldn't deadlock either. + */ + bpf_map_delete_elem(map, key); + + /* in preallocated hashmap both 'key' and 'val' could have been + * reused to store another map element (like in LRU above), + * but in controlled test environment the below test works. + * It's not a use-after-free. The memory is owned by the map. + */ + if (bpf_timer_start(&val->timer, 1000, 0) != -EINVAL) + err |= 32; + ok |= 2; + } else { + if (*key != HTAB_MALLOC) + err |= 64; + + /* try to cancel ourself. It shouldn't deadlock. */ + if (bpf_timer_cancel(&val->timer) != -EDEADLK) + err |= 128; + + /* delete this key and this timer anyway. + * It shouldn't deadlock either. + */ + bpf_map_delete_elem(map, key); + + /* in non-preallocated hashmap both 'key' and 'val' are RCU + * protected and still valid though this element was deleted + * from the map. Arm this timer for ~35 seconds. When callback + * finishes the call_rcu will invoke: + * htab_elem_free_rcu + * check_and_free_timer + * bpf_timer_cancel_and_free + * to cancel this 35 second sleep and delete the timer for real. 
+ */ + if (bpf_timer_start(&val->timer, 1ull << 35, 0) != 0) + err |= 256; + ok |= 4; + } + return 0; +} + +int bpf_timer_test(void) +{ + struct hmap_elem *val; + int key = HTAB, key_malloc = HTAB_MALLOC; + + val = bpf_map_lookup_elem(&hmap, &key); + if (val) { + if (bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME) != 0) + err |= 512; + bpf_timer_set_callback(&val->timer, timer_cb2); + bpf_timer_start(&val->timer, 1000, 0); + } + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) { + if (bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME) != 0) + err |= 1024; + bpf_timer_set_callback(&val->timer, timer_cb2); + bpf_timer_start(&val->timer, 1000, 0); + } + return 0; +} + +SEC("fentry/bpf_fentry_test2") +int BPF_PROG(test2, int a, int b) +{ + struct hmap_elem init = {}, *val; + int key = HTAB, key_malloc = HTAB_MALLOC; + + init.counter = 10; /* number of times to trigger timer_cb2 */ + bpf_map_update_elem(&hmap, &key, &init, 0); + val = bpf_map_lookup_elem(&hmap, &key); + if (val) + bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME); + /* update the same key to free the timer */ + bpf_map_update_elem(&hmap, &key, &init, 0); + + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) + bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME); + /* update the same key to free the timer */ + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + + /* init more timers to check that htab operations + * don't leak timer memory. + */ + key = 0; + bpf_map_update_elem(&hmap, &key, &init, 0); + val = bpf_map_lookup_elem(&hmap, &key); + if (val) + bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME); + bpf_map_delete_elem(&hmap, &key); + bpf_map_update_elem(&hmap, &key, &init, 0); + val = bpf_map_lookup_elem(&hmap, &key); + if (val) + bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME); + + /* and with non-prealloc htab */ + key_malloc = 0; + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) + bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME); + bpf_map_delete_elem(&hmap_malloc, &key_malloc); + bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0); + val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc); + if (val) + bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME); + + return bpf_timer_test(); +} diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c new file mode 100644 index 000000000000..2fee7ab105ef --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_mim.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <time.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +struct hmap_elem { + int pad; /* unused */ + struct bpf_timer timer; +}; + +struct inner_map { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, int); + __type(value, struct hmap_elem); +} inner_htab SEC(".maps"); + +#define ARRAY_KEY 1 +#define HASH_KEY 1234 + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 2); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct inner_map); +} outer_arr SEC(".maps") = { + .values = { [ARRAY_KEY] = &inner_htab }, +}; + +__u64 err; +__u64 ok; +__u64 cnt; + +static int timer_cb1(void *map, int *key, struct hmap_elem 
*val); + +static int timer_cb2(void *map, int *key, struct hmap_elem *val) +{ + cnt++; + bpf_timer_set_callback(&val->timer, timer_cb1); + if (bpf_timer_start(&val->timer, 1000, 0)) + err |= 1; + ok |= 1; + return 0; +} + +/* callback for inner hash map */ +static int timer_cb1(void *map, int *key, struct hmap_elem *val) +{ + cnt++; + bpf_timer_set_callback(&val->timer, timer_cb2); + if (bpf_timer_start(&val->timer, 1000, 0)) + err |= 2; + /* Do a lookup to make sure 'map' and 'key' pointers are correct */ + bpf_map_lookup_elem(map, key); + ok |= 2; + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + struct hmap_elem init = {}; + struct bpf_map *inner_map; + struct hmap_elem *val; + int array_key = ARRAY_KEY; + int hash_key = HASH_KEY; + + inner_map = bpf_map_lookup_elem(&outer_arr, &array_key); + if (!inner_map) + return 0; + + bpf_map_update_elem(inner_map, &hash_key, &init, 0); + val = bpf_map_lookup_elem(inner_map, &hash_key); + if (!val) + return 0; + + bpf_timer_init(&val->timer, inner_map, CLOCK_MONOTONIC); + if (bpf_timer_set_callback(&val->timer, timer_cb1)) + err |= 4; + if (bpf_timer_start(&val->timer, 0, 0)) + err |= 8; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c new file mode 100644 index 000000000000..5d648e3d8a41 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_mim_reject.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <time.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +struct hmap_elem { + int pad; /* unused */ + struct bpf_timer timer; +}; + +struct inner_map { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, int); + __type(value, struct hmap_elem); +} inner_htab SEC(".maps"); + +#define ARRAY_KEY 1 +#define ARRAY_KEY2 2 +#define HASH_KEY 1234 + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 2); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __array(values, struct inner_map); +} outer_arr SEC(".maps") = { + .values = { [ARRAY_KEY] = &inner_htab }, +}; + +__u64 err; +__u64 ok; +__u64 cnt; + +/* callback for inner hash map */ +static int timer_cb(void *map, int *key, struct hmap_elem *val) +{ + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test1, int a) +{ + struct hmap_elem init = {}; + struct bpf_map *inner_map, *inner_map2; + struct hmap_elem *val; + int array_key = ARRAY_KEY; + int array_key2 = ARRAY_KEY2; + int hash_key = HASH_KEY; + + inner_map = bpf_map_lookup_elem(&outer_arr, &array_key); + if (!inner_map) + return 0; + + inner_map2 = bpf_map_lookup_elem(&outer_arr, &array_key2); + if (!inner_map2) + return 0; + bpf_map_update_elem(inner_map, &hash_key, &init, 0); + val = bpf_map_lookup_elem(inner_map, &hash_key); + if (!val) + return 0; + + bpf_timer_init(&val->timer, inner_map2, CLOCK_MONOTONIC); + if (bpf_timer_set_callback(&val->timer, timer_cb)) + err |= 4; + if (bpf_timer_start(&val->timer, 0, 0)) + err |= 8; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/xdp_tx.c b/tools/testing/selftests/bpf/progs/xdp_tx.c index 94e6c2b281cb..5f725c720e00 100644 --- a/tools/testing/selftests/bpf/progs/xdp_tx.c +++ b/tools/testing/selftests/bpf/progs/xdp_tx.c @@ -3,7 +3,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -SEC("tx") +SEC("xdp") int xdp_tx(struct xdp_md 
*xdp) { return XDP_TX; diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py new file mode 100755 index 000000000000..be54b7335a76 --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -0,0 +1,586 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# +# Copyright (C) 2021 Isovalent, Inc. + +import argparse +import re +import os, sys + +LINUX_ROOT = os.path.abspath(os.path.join(__file__, + os.pardir, os.pardir, os.pardir, os.pardir, os.pardir)) +BPFTOOL_DIR = os.path.join(LINUX_ROOT, 'tools/bpf/bpftool') +retval = 0 + +class BlockParser(object): + """ + A parser for extracting a set of values from blocks such as enums. + @reader: a pointer to the open file to parse + """ + def __init__(self, reader): + self.reader = reader + + def search_block(self, start_marker): + """ + Search for a given structure in a file. + @start_marker: regex marking the beginning of a structure to parse + """ + offset = self.reader.tell() + array_start = re.search(start_marker, self.reader.read()) + if array_start is None: + raise Exception('Failed to find start of block') + self.reader.seek(offset + array_start.start()) + + def parse(self, pattern, end_marker): + """ + Parse a block and return a set of values. Values to extract must be + on separate lines in the file. + @pattern: pattern used to identify the values to extract + @end_marker: regex marking the end of the block to parse + """ + entries = set() + while True: + line = self.reader.readline() + if not line or re.match(end_marker, line): + break + capture = pattern.search(line) + if capture and pattern.groups >= 1: + entries.add(capture.group(1)) + return entries + +class ArrayParser(BlockParser): + """ + A parser for extracting dictionaries of values from some BPF-related arrays. + @reader: a pointer to the open file to parse + @array_name: name of the array to parse + """ + end_marker = re.compile('^};') + + def __init__(self, reader, array_name): + self.array_name = array_name + self.start_marker = re.compile(f'(static )?const char \* const {self.array_name}\[.*\] = {{\n') + super().__init__(reader) + + def search_block(self): + """ + Search for the given array in a file. + """ + super().search_block(self.start_marker) + + def parse(self): + """ + Parse a block and return data as a dictionary. Items to extract must be + on separate lines in the file. + """ + pattern = re.compile('\[(BPF_\w*)\]\s*= "(.*)",?$') + entries = {} + while True: + line = self.reader.readline() + if line == '' or re.match(self.end_marker, line): + break + capture = pattern.search(line) + if capture: + entries[capture.group(1)] = capture.group(2) + return entries + +class InlineListParser(BlockParser): + """ + A parser for extracting a set of values from inline lists. + """ + def parse(self, pattern, end_marker): + """ + Parse a block and return a set of values. Multiple values to extract + can be on the same line in the file. + @pattern: pattern used to identify the values to extract + @end_marker: regex marking the end of the block to parse + """ + entries = set() + while True: + line = self.reader.readline() + if not line: + break + entries.update(pattern.findall(line)) + if re.search(end_marker, line): + break + return entries + +class FileExtractor(object): + """ + A generic reader for extracting data from a given file. This class contains + several helper methods that wrap around parser objects to extract values + from different structures.
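+ + As a purely illustrative (hypothetical) sketch, a child class only needs + to define the file to read: + + class ExampleExtractor(FileExtractor): + filename = '/path/to/some/file.h'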
+ This class does not offer a way to set a filename itself; as in the + sketch above, children classes are expected to define it. + """ + def __init__(self): + self.reader = open(self.filename, 'r') + + def close(self): + """ + Close the file used by the parser. + """ + self.reader.close() + + def reset_read(self): + """ + Reset the file position indicator for this parser. This is useful when + parsing several structures in the file without respecting the order in + which those structures appear in the file. + """ + self.reader.seek(0) + + def get_types_from_array(self, array_name): + """ + Search for and parse an array associating names to BPF_* enum members, + for example: + + const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + }; + + Return a dictionary with the enum member names as keys and the + associated names as values, for example: + + {'BPF_PROG_TYPE_UNSPEC': 'unspec', + 'BPF_PROG_TYPE_SOCKET_FILTER': 'socket_filter', + 'BPF_PROG_TYPE_KPROBE': 'kprobe'} + + @array_name: name of the array to parse + """ + array_parser = ArrayParser(self.reader, array_name) + array_parser.search_block() + return array_parser.parse() + + def get_enum(self, enum_name): + """ + Search for and parse an enum containing BPF_* members, for example: + + enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + }; + + Return a set containing all member names, for example: + + {'BPF_PROG_TYPE_UNSPEC', + 'BPF_PROG_TYPE_SOCKET_FILTER', + 'BPF_PROG_TYPE_KPROBE'} + + @enum_name: name of the enum to parse + """ + start_marker = re.compile(f'enum {enum_name} {{\n') + pattern = re.compile('^\s*(BPF_\w+),?$') + end_marker = re.compile('^};') + parser = BlockParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) + + def __get_description_list(self, start_marker, pattern, end_marker): + parser = InlineListParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) + + def get_rst_list(self, block_name): + """ + Search for and parse a list of type names from RST documentation, for + example: + + | *TYPE* := { + | **socket** | **kprobe** | + | **kretprobe** + | } + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the block to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'\*{block_name}\* := {{') + pattern = re.compile('\*\*([\w/-]+)\*\*') + end_marker = re.compile('}\n') + return self.__get_description_list(start_marker, pattern, end_marker) + + def get_help_list(self, block_name): + """ + Search for and parse a list of type names from a help message in + bpftool, for example: + + " TYPE := { socket | kprobe |\\n" + " kretprobe }\\n" + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the block to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'"\s*{block_name} := {{') + pattern = re.compile('([\w/]+) [|}]') + end_marker = re.compile('}') + return self.__get_description_list(start_marker, pattern, end_marker) + + def get_help_list_macro(self, macro): + """ + Search for and parse a list of values from a help message starting with + a macro in bpftool, for example: + + " " HELP_SPEC_OPTIONS " |\\n" + " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} }\\n" + + Return a set containing all item names, for example: + + {'-f', '--bpffs',
'-m', '--mapcompat', '-n', '--nomount'} + + @macro: macro starting the block, 'HELP_SPEC_OPTIONS' in the example + """ + start_marker = re.compile(f'"\s*{macro}\s*" [|}}]') + pattern = re.compile('([\w-]+) ?(?:\||}[ }\]])') + end_marker = re.compile('}\\\\n') + return self.__get_description_list(start_marker, pattern, end_marker) + + def default_options(self): + """ + Return the default options contained in HELP_SPEC_OPTIONS + """ + return { '-j', '--json', '-p', '--pretty', '-d', '--debug' } + + def get_bashcomp_list(self, block_name): + """ + Search for and parse a list of type names from a variable in bash + completion file, for example: + + local BPFTOOL_PROG_LOAD_TYPES='socket kprobe \\ + kretprobe' + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the block to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'local {block_name}=\'') + pattern = re.compile('(?:.*=\')?([\w/]+)') + end_marker = re.compile('\'$') + return self.__get_description_list(start_marker, pattern, end_marker) + +class SourceFileExtractor(FileExtractor): + """ + An abstract extractor for a source file with usage message. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def get_options(self): + return self.default_options().union(self.get_help_list_macro('HELP_SPEC_OPTIONS')) + +class ProgFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's prog.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'prog.c') + + def get_prog_types(self): + return self.get_types_from_array('prog_type_name') + + def get_attach_types(self): + return self.get_types_from_array('attach_type_strings') + + def get_prog_attach_help(self): + return self.get_help_list('ATTACH_TYPE') + +class MapFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's map.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'map.c') + + def get_map_types(self): + return self.get_types_from_array('map_type_name') + + def get_map_help(self): + return self.get_help_list('TYPE') + +class CgroupFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's cgroup.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'cgroup.c') + + def get_prog_attach_help(self): + return self.get_help_list('ATTACH_TYPE') + +class CommonFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's common.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'common.c') + + def __init__(self): + super().__init__() + self.attach_types = {} + + def get_attach_types(self): + if not self.attach_types: + self.attach_types = self.get_types_from_array('attach_type_name') + return self.attach_types + + def get_cgroup_attach_types(self): + if not self.attach_types: + self.get_attach_types() + cgroup_types = {} + for (key, value) in self.attach_types.items(): + if key.find('BPF_CGROUP') != -1: + cgroup_types[key] = value + return cgroup_types + +class GenericSourceExtractor(SourceFileExtractor): + """ + An extractor for generic source code files. + """ + filename = "" + + def __init__(self, filename): + self.filename = os.path.join(BPFTOOL_DIR, filename) + super().__init__() + +class BpfHeaderExtractor(FileExtractor): + """ + An extractor for the UAPI BPF header.
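+ The enum members found in this header are used as the reference values + against which the types listed in bpftool's sources, documentation and + bash completion are checked.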
+ """ + filename = os.path.join(LINUX_ROOT, 'tools/include/uapi/linux/bpf.h') + + def get_prog_types(self): + return self.get_enum('bpf_prog_type') + + def get_map_types(self): + return self.get_enum('bpf_map_type') + + def get_attach_types(self): + return self.get_enum('bpf_attach_type') + +class ManPageExtractor(FileExtractor): + """ + An abstract extractor for an RST documentation page. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def get_options(self): + return self.get_rst_list('OPTIONS') + +class ManProgExtractor(ManPageExtractor): + """ + An extractor for bpftool-prog.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-prog.rst') + + def get_attach_types(self): + return self.get_rst_list('ATTACH_TYPE') + +class ManMapExtractor(ManPageExtractor): + """ + An extractor for bpftool-map.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-map.rst') + + def get_map_types(self): + return self.get_rst_list('TYPE') + +class ManCgroupExtractor(ManPageExtractor): + """ + An extractor for bpftool-cgroup.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-cgroup.rst') + + def get_attach_types(self): + return self.get_rst_list('ATTACH_TYPE') + +class ManGenericExtractor(ManPageExtractor): + """ + An extractor for generic RST documentation pages. + """ + filename = "" + + def __init__(self, filename): + self.filename = os.path.join(BPFTOOL_DIR, filename) + super().__init__() + +class BashcompExtractor(FileExtractor): + """ + An extractor for bpftool's bash completion file. + """ + filename = os.path.join(BPFTOOL_DIR, 'bash-completion/bpftool') + + def get_prog_attach_types(self): + return self.get_bashcomp_list('BPFTOOL_PROG_ATTACH_TYPES') + + def get_map_types(self): + return self.get_bashcomp_list('BPFTOOL_MAP_CREATE_TYPES') + + def get_cgroup_attach_types(self): + return self.get_bashcomp_list('BPFTOOL_CGROUP_ATTACH_TYPES') + +def verify(first_set, second_set, message): + """ + Print all values that differ between two sets. + @first_set: one set to compare + @second_set: another set to compare + @message: message to print for values belonging to only one of the sets + """ + global retval + diff = first_set.symmetric_difference(second_set) + if diff: + print(message, diff) + retval = 1 + +def main(): + # No arguments supported at this time, but print usage for -h|--help + argParser = argparse.ArgumentParser(description=""" + Verify that bpftool's code, help messages, documentation and bash + completion are all in sync on program types, map types, attach types, and + options. Also check that bpftool is in sync with the UAPI BPF header. 
+ """) + args = argParser.parse_args() + + # Map types (enum) + + bpf_info = BpfHeaderExtractor() + ref = bpf_info.get_map_types() + + map_info = MapFileExtractor() + source_map_items = map_info.get_map_types() + map_types_enum = set(source_map_items.keys()) + + verify(ref, map_types_enum, + f'Comparing BPF header (enum bpf_map_type) and {MapFileExtractor.filename} (map_type_name):') + + # Map types (names) + + source_map_types = set(source_map_items.values()) + source_map_types.discard('unspec') + + help_map_types = map_info.get_map_help() + help_map_options = map_info.get_options() + map_info.close() + + man_map_info = ManMapExtractor() + man_map_options = man_map_info.get_options() + man_map_types = man_map_info.get_map_types() + man_map_info.close() + + bashcomp_info = BashcompExtractor() + bashcomp_map_types = bashcomp_info.get_map_types() + + verify(source_map_types, help_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {MapFileExtractor.filename} (do_help() TYPE):') + verify(source_map_types, man_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {ManMapExtractor.filename} (TYPE):') + verify(help_map_options, man_map_options, + f'Comparing {MapFileExtractor.filename} (do_help() OPTIONS) and {ManMapExtractor.filename} (OPTIONS):') + verify(source_map_types, bashcomp_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') + + # Program types (enum) + + ref = bpf_info.get_prog_types() + + prog_info = ProgFileExtractor() + prog_types = set(prog_info.get_prog_types().keys()) + + verify(ref, prog_types, + f'Comparing BPF header (enum bpf_prog_type) and {ProgFileExtractor.filename} (prog_type_name):') + + # Attach types (enum) + + ref = bpf_info.get_attach_types() + bpf_info.close() + + common_info = CommonFileExtractor() + attach_types = common_info.get_attach_types() + + verify(ref, attach_types, + f'Comparing BPF header (enum bpf_attach_type) and {CommonFileExtractor.filename} (attach_type_name):') + + # Attach types (names) + + source_prog_attach_types = set(prog_info.get_attach_types().values()) + + help_prog_attach_types = prog_info.get_prog_attach_help() + help_prog_options = prog_info.get_options() + prog_info.close() + + man_prog_info = ManProgExtractor() + man_prog_options = man_prog_info.get_options() + man_prog_attach_types = man_prog_info.get_attach_types() + man_prog_info.close() + + bashcomp_info.reset_read() # We stopped at map types, rewind + bashcomp_prog_attach_types = bashcomp_info.get_prog_attach_types() + + verify(source_prog_attach_types, help_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') + verify(source_prog_attach_types, man_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ManProgExtractor.filename} (ATTACH_TYPE):') + verify(help_prog_options, man_prog_options, + f'Comparing {ProgFileExtractor.filename} (do_help() OPTIONS) and {ManProgExtractor.filename} (OPTIONS):') + verify(source_prog_attach_types, bashcomp_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_PROG_ATTACH_TYPES):') + + # Cgroup attach types + + source_cgroup_attach_types = set(common_info.get_cgroup_attach_types().values()) + common_info.close() + + cgroup_info = CgroupFileExtractor() + help_cgroup_attach_types = cgroup_info.get_prog_attach_help() + help_cgroup_options 
= cgroup_info.get_options() + cgroup_info.close() + + man_cgroup_info = ManCgroupExtractor() + man_cgroup_options = man_cgroup_info.get_options() + man_cgroup_attach_types = man_cgroup_info.get_attach_types() + man_cgroup_info.close() + + bashcomp_cgroup_attach_types = bashcomp_info.get_cgroup_attach_types() + bashcomp_info.close() + + verify(source_cgroup_attach_types, help_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') + verify(source_cgroup_attach_types, man_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') + verify(help_cgroup_options, man_cgroup_options, + f'Comparing {CgroupFileExtractor.filename} (do_help() OPTIONS) and {ManCgroupExtractor.filename} (OPTIONS):') + verify(source_cgroup_attach_types, bashcomp_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') + + # Options for remaining commands + + for cmd in [ 'btf', 'feature', 'gen', 'iter', 'link', 'net', 'perf', 'struct_ops', ]: + source_info = GenericSourceExtractor(cmd + '.c') + help_cmd_options = source_info.get_options() + source_info.close() + + man_cmd_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool-' + cmd + '.rst')) + man_cmd_options = man_cmd_info.get_options() + man_cmd_info.close() + + verify(help_cmd_options, man_cmd_options, + f'Comparing {source_info.filename} (do_help() OPTIONS) and {man_cmd_info.filename} (OPTIONS):') + + source_main_info = GenericSourceExtractor('main.c') + help_main_options = source_main_info.get_options() + source_main_info.close() + + man_main_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool.rst')) + man_main_options = man_main_info.get_options() + man_main_info.close() + + verify(help_main_options, man_main_options, + f'Comparing {source_main_info.filename} (do_help() OPTIONS) and {man_main_info.filename} (OPTIONS):') + + sys.exit(retval) + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 30cbf5d98f7d..14cea869235b 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -764,8 +764,8 @@ static void test_sockmap(unsigned int tasks, void *data) udp = socket(AF_INET, SOCK_DGRAM, 0); i = 0; err = bpf_map_update_elem(fd, &i, &udp, BPF_ANY); - if (!err) { - printf("Failed socket SOCK_DGRAM allowed '%i:%i'\n", + if (err) { + printf("Failed socket update SOCK_DGRAM '%i:%i'\n", i, udp); goto out_sockmap; } @@ -1153,12 +1153,17 @@ out_sockmap: } #define MAPINMAP_PROG "./test_map_in_map.o" +#define MAPINMAP_INVALID_PROG "./test_map_in_map_invalid.o" static void test_map_in_map(void) { struct bpf_object *obj; struct bpf_map *map; int mim_fd, fd, err; int pos = 0; + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + __u32 id = 0; + libbpf_print_fn_t old_print_fn; obj = bpf_object__open(MAPINMAP_PROG); @@ -1228,11 +1233,72 @@ static void test_map_in_map(void) } close(fd); + fd = -1; bpf_object__close(obj); + + /* Test that failing bpf_object__create_map() destroys the inner map */ + obj = bpf_object__open(MAPINMAP_INVALID_PROG); + err = libbpf_get_error(obj); + if (err) { + printf("Failed to load %s program: %d %d", + MAPINMAP_INVALID_PROG, err, errno); + goto out_map_in_map; + } + + map = bpf_object__find_map_by_name(obj, "mim"); + if 
(!map) { + printf("Failed to load array of maps from test prog\n"); + goto out_map_in_map; + } + + old_print_fn = libbpf_set_print(NULL); + + err = bpf_object__load(obj); + if (!err) { + printf("Loading obj supposed to fail\n"); + goto out_map_in_map; + } + + libbpf_set_print(old_print_fn); + + /* Iterate over all maps to check whether the internal map + * ("mim.inner") has been destroyed. + */ + while (true) { + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + printf("Failed to get next map: %d", errno); + goto out_map_in_map; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + printf("Failed to get map by id %u: %d", id, errno); + goto out_map_in_map; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + printf("Failed to get map info by fd %d: %d", fd, + errno); + goto out_map_in_map; + } + + if (!strcmp(info.name, "mim.inner")) { + printf("Inner map mim.inner was not destroyed\n"); + goto out_map_in_map; + } + } + return; out_map_in_map: - close(fd); + if (fd >= 0) + close(fd); exit(1); } diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c deleted file mode 100644 index a7b9a69f4fd5..000000000000 --- a/tools/testing/selftests/bpf/test_netcnt.c +++ /dev/null @@ -1,148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> -#include <assert.h> -#include <sys/sysinfo.h> -#include <sys/time.h> - -#include <linux/bpf.h> -#include <bpf/bpf.h> -#include <bpf/libbpf.h> - -#include "cgroup_helpers.h" -#include "bpf_rlimit.h" -#include "netcnt_common.h" - -#define BPF_PROG "./netcnt_prog.o" -#define TEST_CGROUP "/test-network-counters/" - -static int bpf_find_map(const char *test, struct bpf_object *obj, - const char *name) -{ - struct bpf_map *map; - - map = bpf_object__find_map_by_name(obj, name); - if (!map) { - printf("%s:FAIL:map '%s' not found\n", test, name); - return -1; - } - return bpf_map__fd(map); -} - -int main(int argc, char **argv) -{ - struct percpu_net_cnt *percpu_netcnt; - struct bpf_cgroup_storage_key key; - int map_fd, percpu_map_fd; - int error = EXIT_FAILURE; - struct net_cnt netcnt; - struct bpf_object *obj; - int prog_fd, cgroup_fd; - unsigned long packets; - unsigned long bytes; - int cpu, nproc; - __u32 prog_cnt; - - nproc = get_nprocs_conf(); - percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc); - if (!percpu_netcnt) { - printf("Not enough memory for per-cpu area (%d cpus)\n", nproc); - goto err; - } - - if (bpf_prog_load(BPF_PROG, BPF_PROG_TYPE_CGROUP_SKB, - &obj, &prog_fd)) { - printf("Failed to load bpf program\n"); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (cgroup_fd < 0) - goto err; - - /* Attach bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { - printf("Failed to attach bpf program"); - goto err; - } - - if (system("which ping6 &>/dev/null") == 0) - assert(!system("ping6 ::1 -c 10000 -f -q > /dev/null")); - else - assert(!system("ping -6 ::1 -c 10000 -f -q > /dev/null")); - - if (bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS, 0, NULL, NULL, - &prog_cnt)) { - printf("Failed to query attached programs"); - goto err; - } - - map_fd = bpf_find_map(__func__, obj, "netcnt"); - if (map_fd < 0) { - printf("Failed to find bpf map with net counters"); - goto err; - } - - percpu_map_fd = bpf_find_map(__func__, obj, "percpu_netcnt"); - if (percpu_map_fd < 0) { - printf("Failed to find bpf map
with percpu net counters"); - goto err; - } - - if (bpf_map_get_next_key(map_fd, NULL, &key)) { - printf("Failed to get key in cgroup storage\n"); - goto err; - } - - if (bpf_map_lookup_elem(map_fd, &key, &netcnt)) { - printf("Failed to lookup cgroup storage\n"); - goto err; - } - - if (bpf_map_lookup_elem(percpu_map_fd, &key, &percpu_netcnt[0])) { - printf("Failed to lookup percpu cgroup storage\n"); - goto err; - } - - /* Some packets can be still in per-cpu cache, but not more than - * MAX_PERCPU_PACKETS. - */ - packets = netcnt.packets; - bytes = netcnt.bytes; - for (cpu = 0; cpu < nproc; cpu++) { - if (percpu_netcnt[cpu].packets > MAX_PERCPU_PACKETS) { - printf("Unexpected percpu value: %llu\n", - percpu_netcnt[cpu].packets); - goto err; - } - - packets += percpu_netcnt[cpu].packets; - bytes += percpu_netcnt[cpu].bytes; - } - - /* No packets should be lost */ - if (packets != 10000) { - printf("Unexpected packet count: %lu\n", packets); - goto err; - } - - /* Let's check that bytes counter matches the number of packets - * multiplied by the size of ipv6 ICMP packet. - */ - if (bytes != packets * 104) { - printf("Unexpected bytes count: %lu\n", bytes); - goto err; - } - - error = 0; - printf("test_netcnt:PASS\n"); - -err: - cleanup_cgroup_environment(); - free(percpu_netcnt); - -out: - return error; -} diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 8ef7f334e715..c8c2bf878f67 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -221,6 +221,18 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_STRNEQ(actual, expected, len, name) ({ \ + static int duration = 0; \ + const char *___act = actual; \ + const char *___exp = expected; \ + int ___len = len; \ + bool ___ok = strncmp(___act, ___exp, ___len) == 0; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual '%.*s' != expected '%.*s'\n", \ + (name), ___len, ___act, ___len, ___exp); \ + ___ok; \ +}) + #define ASSERT_OK(res, name) ({ \ static int duration = 0; \ long long ___res = (res); \ diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index c9dde9b9d987..088fcad138c9 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -69,7 +69,7 @@ cleanup() { } server_listen() { - ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" & + ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & server_pid=$! 
sleep 0.2 } diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh index ba8ffcdaac30..995278e684b6 100755 --- a/tools/testing/selftests/bpf/test_xdp_veth.sh +++ b/tools/testing/selftests/bpf/test_xdp_veth.sh @@ -108,7 +108,7 @@ ip link set dev veth2 xdp pinned $BPF_DIR/progs/redirect_map_1 ip link set dev veth3 xdp pinned $BPF_DIR/progs/redirect_map_2 ip -n ns1 link set dev veth11 xdp obj xdp_dummy.o sec xdp_dummy -ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec tx +ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec xdp ip -n ns3 link set dev veth33 xdp obj xdp_dummy.o sec xdp_dummy trap cleanup EXIT diff --git a/tools/testing/selftests/nci/nci_dev.c b/tools/testing/selftests/nci/nci_dev.c index 57b505cb1561..e1bf55dabdf6 100644 --- a/tools/testing/selftests/nci/nci_dev.c +++ b/tools/testing/selftests/nci/nci_dev.c @@ -57,6 +57,29 @@ const __u8 nci_init_rsp_v2[] = {0x40, 0x01, 0x1c, 0x00, 0x1a, 0x7e, 0x06, const __u8 nci_rf_disc_map_rsp[] = {0x41, 0x00, 0x01, 0x00}; const __u8 nci_rf_disc_rsp[] = {0x41, 0x03, 0x01, 0x00}; const __u8 nci_rf_deact_rsp[] = {0x41, 0x06, 0x01, 0x00}; +const __u8 nci_rf_deact_ntf[] = {0x61, 0x06, 0x02, 0x00, 0x00}; +const __u8 nci_rf_activate_ntf[] = {0x61, 0x05, 0x1D, 0x01, 0x02, 0x04, 0x00, + 0xFF, 0xFF, 0x0C, 0x44, 0x03, 0x07, 0x04, + 0x62, 0x26, 0x11, 0x80, 0x1D, 0x80, 0x01, + 0x20, 0x00, 0x00, 0x00, 0x06, 0x05, 0x75, + 0x77, 0x81, 0x02, 0x80}; +const __u8 nci_t4t_select_cmd[] = {0x00, 0x00, 0x0C, 0x00, 0xA4, 0x04, 0x00, + 0x07, 0xD2, 0x76, 0x00, 0x00, 0x85, 0x01, 0x01}; +const __u8 nci_t4t_select_cmd2[] = {0x00, 0x00, 0x07, 0x00, 0xA4, 0x00, 0x0C, 0x02, + 0xE1, 0x03}; +const __u8 nci_t4t_select_cmd3[] = {0x00, 0x00, 0x07, 0x00, 0xA4, 0x00, 0x0C, 0x02, + 0xE1, 0x04}; +const __u8 nci_t4t_read_cmd[] = {0x00, 0x00, 0x05, 0x00, 0xB0, 0x00, 0x00, 0x0F}; +const __u8 nci_t4t_read_rsp[] = {0x00, 0x00, 0x11, 0x00, 0x0F, 0x20, 0x00, 0x3B, + 0x00, 0x34, 0x04, 0x06, 0xE1, 0x04, 0x08, 0x00, + 0x00, 0x00, 0x90, 0x00}; +const __u8 nci_t4t_read_cmd2[] = {0x00, 0x00, 0x05, 0x00, 0xB0, 0x00, 0x00, 0x02}; +const __u8 nci_t4t_read_rsp2[] = {0x00, 0x00, 0x04, 0x00, 0x0F, 0x90, 0x00}; +const __u8 nci_t4t_read_cmd3[] = {0x00, 0x00, 0x05, 0x00, 0xB0, 0x00, 0x02, 0x0F}; +const __u8 nci_t4t_read_rsp3[] = {0x00, 0x00, 0x11, 0xD1, 0x01, 0x0B, 0x54, 0x02, + 0x65, 0x6E, 0x4E, 0x46, 0x43, 0x20, 0x54, 0x45, + 0x53, 0x54, 0x90, 0x00}; +const __u8 nci_t4t_rsp_ok[] = {0x00, 0x00, 0x02, 0x90, 0x00}; struct msgtemplate { struct nlmsghdr n; @@ -87,7 +110,7 @@ error: static int send_cmd_mt_nla(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, __u8 genl_cmd, int nla_num, __u16 nla_type[], - void *nla_data[], int nla_len[]) + void *nla_data[], int nla_len[], __u16 flags) { struct sockaddr_nl nladdr; struct msgtemplate msg; @@ -98,7 +121,7 @@ static int send_cmd_mt_nla(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); msg.n.nlmsg_type = nlmsg_type; - msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_flags = flags; msg.n.nlmsg_seq = 0; msg.n.nlmsg_pid = nlmsg_pid; msg.g.cmd = genl_cmd; @@ -110,11 +133,11 @@ static int send_cmd_mt_nla(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, na->nla_type = nla_type[cnt]; na->nla_len = nla_len[cnt] + NLA_HDRLEN; - if (nla_len > 0) + if (nla_len[cnt] > 0) memcpy(NLA_DATA(na), nla_data[cnt], nla_len[cnt]); - msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); - prv_len = na->nla_len; + prv_len = NLA_ALIGN(nla_len[cnt]) + NLA_HDRLEN; + msg.n.nlmsg_len += prv_len; } buf = (char *)&msg; @@ 
-146,11 +169,11 @@ static int send_get_nfc_family(int sd, __u32 pid) nla_get_family_data = family_name; return send_cmd_mt_nla(sd, GENL_ID_CTRL, pid, CTRL_CMD_GETFAMILY, - 1, &nla_get_family_type, - &nla_get_family_data, &nla_get_family_len); + 1, &nla_get_family_type, &nla_get_family_data, + &nla_get_family_len, NLM_F_REQUEST); } -static int get_family_id(int sd, __u32 pid) +static int get_family_id(int sd, __u32 pid, __u32 *event_group) { struct { struct nlmsghdr n; @@ -158,8 +181,9 @@ static int get_family_id(int sd, __u32 pid) char buf[512]; } ans; struct nlattr *na; - int rep_len; + int resp_len; __u16 id; + int len; int rc; rc = send_get_nfc_family(sd, pid); @@ -167,17 +191,49 @@ static int get_family_id(int sd, __u32 pid) if (rc < 0) return 0; - rep_len = recv(sd, &ans, sizeof(ans), 0); + resp_len = recv(sd, &ans, sizeof(ans), 0); - if (ans.n.nlmsg_type == NLMSG_ERROR || rep_len < 0 || - !NLMSG_OK(&ans.n, rep_len)) + if (ans.n.nlmsg_type == NLMSG_ERROR || resp_len < 0 || + !NLMSG_OK(&ans.n, resp_len)) return 0; + len = 0; + resp_len = GENLMSG_PAYLOAD(&ans.n); na = (struct nlattr *)GENLMSG_DATA(&ans); - na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len)); - if (na->nla_type == CTRL_ATTR_FAMILY_ID) - id = *(__u16 *)NLA_DATA(na); + while (len < resp_len) { + len += NLA_ALIGN(na->nla_len); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) { + id = *(__u16 *)NLA_DATA(na); + } else if (na->nla_type == CTRL_ATTR_MCAST_GROUPS) { + struct nlattr *nested_na; + struct nlattr *group_na; + int group_attr_len; + int group_attr; + + nested_na = (struct nlattr *)((char *)na + NLA_HDRLEN); + group_na = (struct nlattr *)((char *)nested_na + NLA_HDRLEN); + group_attr_len = 0; + + for (group_attr = CTRL_ATTR_MCAST_GRP_UNSPEC; + group_attr < CTRL_ATTR_MCAST_GRP_MAX; group_attr++) { + if (group_na->nla_type == CTRL_ATTR_MCAST_GRP_ID) { + *event_group = *(__u32 *)((char *)group_na + + NLA_HDRLEN); + break; + } + + group_attr_len += NLA_ALIGN(group_na->nla_len) + + NLA_HDRLEN; + if (group_attr_len >= nested_na->nla_len) + break; + + group_na = (struct nlattr *)((char *)group_na + + NLA_ALIGN(group_na->nla_len)); + } + } + na = (struct nlattr *)(GENLMSG_DATA(&ans) + len); + } return id; } @@ -189,12 +245,12 @@ static int send_cmd_with_idx(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, int nla_len = 4; return send_cmd_mt_nla(sd, nlmsg_type, nlmsg_pid, genl_cmd, 1, - &nla_type, &nla_data, &nla_len); + &nla_type, &nla_data, &nla_len, NLM_F_REQUEST); } static int get_nci_devid(int sd, __u16 fid, __u32 pid, int dev_id, struct msgtemplate *msg) { - int rc, rep_len; + int rc, resp_len; rc = send_cmd_with_idx(sd, fid, pid, NFC_CMD_GET_DEVICE, dev_id); if (rc < 0) { @@ -202,14 +258,14 @@ static int get_nci_devid(int sd, __u16 fid, __u32 pid, int dev_id, struct msgtem goto error; } - rep_len = recv(sd, msg, sizeof(*msg), 0); - if (rep_len < 0) { + resp_len = recv(sd, msg, sizeof(*msg), 0); + if (resp_len < 0) { rc = -2; goto error; } if (msg->n.nlmsg_type == NLMSG_ERROR || - !NLMSG_OK(&msg->n, rep_len)) { + !NLMSG_OK(&msg->n, resp_len)) { rc = -3; goto error; } @@ -222,21 +278,21 @@ error: static __u8 get_dev_enable_state(struct msgtemplate *msg) { struct nlattr *na; - int rep_len; + int resp_len; int len; - rep_len = GENLMSG_PAYLOAD(&msg->n); + resp_len = GENLMSG_PAYLOAD(&msg->n); na = (struct nlattr *)GENLMSG_DATA(msg); len = 0; - while (len < rep_len) { + while (len < resp_len) { len += NLA_ALIGN(na->nla_len); if (na->nla_type == NFC_ATTR_DEVICE_POWERED) return *(char *)NLA_DATA(na); na = (struct nlattr 
*)(GENLMSG_DATA(msg) + len); } - return rep_len; + return resp_len; } FIXTURE(NCI) { @@ -270,8 +326,7 @@ static void *virtual_dev_open(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -280,8 +335,7 @@ static void *virtual_dev_open(void *data) goto error; write(dev_fd, nci_reset_rsp, sizeof(nci_reset_rsp)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_init_cmd)) @@ -290,8 +344,7 @@ static void *virtual_dev_open(void *data) goto error; write(dev_fd, nci_init_rsp, sizeof(nci_init_rsp)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_disc_map_cmd)) @@ -313,8 +366,7 @@ static void *virtual_dev_open_v2(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -324,8 +376,7 @@ static void *virtual_dev_open_v2(void *data) write(dev_fd, nci_reset_rsp_v2, sizeof(nci_reset_rsp_v2)); write(dev_fd, nci_reset_ntf, sizeof(nci_reset_ntf)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_init_cmd_v2)) @@ -334,8 +385,7 @@ static void *virtual_dev_open_v2(void *data) goto error; write(dev_fd, nci_init_rsp_v2, sizeof(nci_init_rsp_v2)); - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_disc_map_cmd)) @@ -353,6 +403,7 @@ FIXTURE_SETUP(NCI) { struct msgtemplate msg; pthread_t thread_t; + __u32 event_group; int status; int rc; @@ -364,12 +415,16 @@ FIXTURE_SETUP(NCI) ASSERT_NE(self->sd, -1); self->pid = getpid(); - self->fid = get_family_id(self->sd, self->pid); + self->fid = get_family_id(self->sd, self->pid, &event_group); ASSERT_NE(self->fid, -1); self->virtual_nci_fd = open("/dev/virtual_nci", O_RDWR); ASSERT_GT(self->virtual_nci_fd, -1); + rc = setsockopt(self->sd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &event_group, + sizeof(event_group)); + ASSERT_NE(rc, -1); + rc = ioctl(self->virtual_nci_fd, IOCTL_GET_NCIDEV_IDX, &self->dev_idex); ASSERT_EQ(rc, 0); @@ -402,8 +457,7 @@ static void *virtual_deinit(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -425,8 +479,7 @@ static void *virtual_deinit_v2(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_reset_cmd)) @@ -489,16 +542,14 @@ static void *virtual_poll_start(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_discovery_cmd)) goto error; if (memcmp(nci_rf_discovery_cmd, buf, len)) goto error; - write(dev_fd, nci_rf_disc_rsp, sizeof(nci_rf_disc_rsp)) - ; + write(dev_fd, nci_rf_disc_rsp, sizeof(nci_rf_disc_rsp)); return (void *)0; error: @@ -513,8 +564,7 @@ static void *virtual_poll_stop(void *data) dev_fd = *(int *)data; - while ((len = read(dev_fd, buf, 258)) == 0) - ; + len = read(dev_fd, buf, 258); if (len <= 0) goto error; if (len != sizeof(nci_rf_deact_cmd)) @@ -528,38 +578,282 @@ error: return (void *)-1; } -TEST_F(NCI, 
start_poll) +int start_polling(int dev_idx, int proto, int virtual_fd, int sd, int fid, int pid) { __u16 nla_start_poll_type[2] = {NFC_ATTR_DEVICE_INDEX, NFC_ATTR_PROTOCOLS}; - void *nla_start_poll_data[2] = {&self->dev_idex, &self->proto}; + void *nla_start_poll_data[2] = {&dev_idx, &proto}; int nla_start_poll_len[2] = {4, 4}; pthread_t thread_t; int status; int rc; rc = pthread_create(&thread_t, NULL, virtual_poll_start, - (void *)&self->virtual_nci_fd); - ASSERT_GT(rc, -1); + (void *)&virtual_fd); + if (rc < 0) + return rc; - rc = send_cmd_mt_nla(self->sd, self->fid, self->pid, - NFC_CMD_START_POLL, 2, nla_start_poll_type, - nla_start_poll_data, nla_start_poll_len); - EXPECT_EQ(rc, 0); + rc = send_cmd_mt_nla(sd, fid, pid, NFC_CMD_START_POLL, 2, nla_start_poll_type, + nla_start_poll_data, nla_start_poll_len, NLM_F_REQUEST); + if (rc != 0) + return rc; pthread_join(thread_t, (void **)&status); - ASSERT_EQ(status, 0); + return status; +} + +int stop_polling(int dev_idx, int virtual_fd, int sd, int fid, int pid) +{ + pthread_t thread_t; + int status; + int rc; rc = pthread_create(&thread_t, NULL, virtual_poll_stop, - (void *)&self->virtual_nci_fd); - ASSERT_GT(rc, -1); + (void *)&virtual_fd); + if (rc < 0) + return rc; - rc = send_cmd_with_idx(self->sd, self->fid, self->pid, - NFC_CMD_STOP_POLL, self->dev_idex); - EXPECT_EQ(rc, 0); + rc = send_cmd_with_idx(sd, fid, pid, + NFC_CMD_STOP_POLL, dev_idx); + if (rc != 0) + return rc; pthread_join(thread_t, (void **)&status); + return status; +} + +TEST_F(NCI, start_poll) +{ + int status; + + status = start_polling(self->dev_idex, self->proto, self->virtual_nci_fd, + self->sd, self->fid, self->pid); + EXPECT_EQ(status, 0); + + status = stop_polling(self->dev_idex, self->virtual_nci_fd, self->sd, + self->fid, self->pid); + EXPECT_EQ(status, 0); +} + +int get_taginfo(int dev_idx, int sd, int fid, int pid) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[512]; + } ans; + + struct nlattr *na; + __u32 protocol; + int targetidx; + __u8 sel_res; + int resp_len; + int len; + + __u16 tagid_type; + void *tagid_type_data; + int tagid_len; + + tagid_type = NFC_ATTR_DEVICE_INDEX; + tagid_type_data = &dev_idx; + tagid_len = 4; + + send_cmd_mt_nla(sd, fid, pid, NFC_CMD_GET_TARGET, 1, &tagid_type, + &tagid_type_data, &tagid_len, NLM_F_REQUEST | NLM_F_DUMP); + resp_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || resp_len < 0 || + !NLMSG_OK(&ans.n, resp_len)) + return -1; + + resp_len = GENLMSG_PAYLOAD(&ans.n); + na = (struct nlattr *)GENLMSG_DATA(&ans); + + len = 0; + targetidx = -1; + protocol = -1; + sel_res = -1; + + while (len < resp_len) { + len += NLA_ALIGN(na->nla_len); + + if (na->nla_type == NFC_ATTR_TARGET_INDEX) + targetidx = *(int *)((char *)na + NLA_HDRLEN); + else if (na->nla_type == NFC_ATTR_TARGET_SEL_RES) + sel_res = *(__u8 *)((char *)na + NLA_HDRLEN); + else if (na->nla_type == NFC_ATTR_PROTOCOLS) + protocol = *(__u32 *)((char *)na + NLA_HDRLEN); + + na = (struct nlattr *)(GENLMSG_DATA(&ans) + len); + } + + if (targetidx == -1 || sel_res != 0x20 || protocol != NFC_PROTO_ISO14443_MASK) + return -1; + + return targetidx; +} + +int connect_socket(int dev_idx, int target_idx) +{ + struct sockaddr_nfc addr; + int sock; + int err = 0; + + sock = socket(AF_NFC, SOCK_SEQPACKET, NFC_SOCKPROTO_RAW); + if (sock == -1) + return -1; + + addr.sa_family = AF_NFC; + addr.dev_idx = dev_idx; + addr.target_idx = target_idx; + addr.nfc_protocol = NFC_PROTO_ISO14443; + + err = connect(sock, (struct sockaddr *)&addr, 
sizeof(addr)); + if (err) { + close(sock); + return -1; + } + + return sock; +} + +int connect_tag(int dev_idx, int virtual_fd, int sd, int fid, int pid) +{ + struct genlmsghdr *genlhdr; + struct nlattr *na; + char evt_data[255]; + int target_idx; + int resp_len; + int evt_dev; + + write(virtual_fd, nci_rf_activate_ntf, sizeof(nci_rf_activate_ntf)); + resp_len = recv(sd, evt_data, sizeof(evt_data), 0); + if (resp_len < 0) + return -1; + + genlhdr = (struct genlmsghdr *)((struct nlmsghdr *)evt_data + 1); + na = (struct nlattr *)(genlhdr + 1); + evt_dev = *(int *)((char *)na + NLA_HDRLEN); + if (dev_idx != evt_dev) + return -1; + + target_idx = get_taginfo(dev_idx, sd, fid, pid); + if (target_idx == -1) + return -1; + return connect_socket(dev_idx, target_idx); +} + +int read_write_nci_cmd(int nfc_sock, int virtual_fd, const __u8 *cmd, __u32 cmd_len, + const __u8 *rsp, __u32 rsp_len) +{ + char buf[256]; + int len; + + send(nfc_sock, &cmd[3], cmd_len - 3, 0); + len = read(virtual_fd, buf, cmd_len); + if (len < 0 || memcmp(buf, cmd, cmd_len)) + return -1; + + write(virtual_fd, rsp, rsp_len); + len = recv(nfc_sock, buf, rsp_len - 2, 0); + if (len < 0 || memcmp(&buf[1], &rsp[3], rsp_len - 3)) + return -1; + + return 0; +} + +int read_tag(int nfc_sock, int virtual_fd) +{ + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_select_cmd, + sizeof(nci_t4t_select_cmd), nci_t4t_rsp_ok, + sizeof(nci_t4t_rsp_ok))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_select_cmd2, + sizeof(nci_t4t_select_cmd2), nci_t4t_rsp_ok, + sizeof(nci_t4t_rsp_ok))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_read_cmd, + sizeof(nci_t4t_read_cmd), nci_t4t_read_rsp, + sizeof(nci_t4t_read_rsp))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_select_cmd3, + sizeof(nci_t4t_select_cmd3), nci_t4t_rsp_ok, + sizeof(nci_t4t_rsp_ok))) + return -1; + + if (read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_read_cmd2, + sizeof(nci_t4t_read_cmd2), nci_t4t_read_rsp2, + sizeof(nci_t4t_read_rsp2))) + return -1; + + return read_write_nci_cmd(nfc_sock, virtual_fd, nci_t4t_read_cmd3, + sizeof(nci_t4t_read_cmd3), nci_t4t_read_rsp3, + sizeof(nci_t4t_read_rsp3)); +} + +static void *virtual_deactivate_proc(void *data) +{ + int virtual_fd; + char buf[256]; + int deactcmd_len; + int len; + + virtual_fd = *(int *)data; + deactcmd_len = sizeof(nci_rf_deact_cmd); + len = read(virtual_fd, buf, deactcmd_len); + if (len != deactcmd_len || memcmp(buf, nci_rf_deact_cmd, deactcmd_len)) + return (void *)-1; + + write(virtual_fd, nci_rf_deact_rsp, sizeof(nci_rf_deact_rsp)); + write(virtual_fd, nci_rf_deact_ntf, sizeof(nci_rf_deact_ntf)); + + return (void *)0; +} + +int disconnect_tag(int nfc_sock, int virtual_fd) +{ + pthread_t thread_t; + char buf[256]; + int status; + int len; + + send(nfc_sock, &nci_t4t_select_cmd3[3], sizeof(nci_t4t_select_cmd3) - 3, 0); + len = read(virtual_fd, buf, sizeof(nci_t4t_select_cmd3)); + if (len < 0 || memcmp(buf, nci_t4t_select_cmd3, sizeof(nci_t4t_select_cmd3))) + return -1; + + len = recv(nfc_sock, buf, sizeof(nci_t4t_rsp_ok), 0); + if (len != -1) + return -1; + + status = pthread_create(&thread_t, NULL, virtual_deactivate_proc, + (void *)&virtual_fd); + + close(nfc_sock); + pthread_join(thread_t, (void **)&status); + return status; +} + +TEST_F(NCI, t4t_tag_read) +{ + int nfc_sock; + int status; + + status = start_polling(self->dev_idex, self->proto, self->virtual_nci_fd, + self->sd, self->fid, self->pid); + EXPECT_EQ(status, 0); + + nfc_sock =
connect_tag(self->dev_idex, self->virtual_nci_fd, self->sd, + self->fid, self->pid); + ASSERT_GT(nfc_sock, -1); + + status = read_tag(nfc_sock, self->virtual_nci_fd); ASSERT_EQ(status, 0); + + status = disconnect_tag(nfc_sock, self->virtual_nci_fd); + EXPECT_EQ(status, 0); } TEST_F(NCI, deinit) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 79c9eb0034d5..4f9f73e7a299 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -25,6 +25,7 @@ TEST_PROGS += bareudp.sh TEST_PROGS += unicast_extensions.sh TEST_PROGS += udpgro_fwd.sh TEST_PROGS += veth.sh +TEST_PROGS += ioam6.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any @@ -36,8 +37,11 @@ TEST_GEN_FILES += fin_ack_lat TEST_GEN_FILES += reuseaddr_ports_exhausted TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp TEST_GEN_FILES += ipsec +TEST_GEN_FILES += ioam6_parser +TEST_GEN_FILES += gro TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls +TEST_GEN_FILES += toeplitz TEST_FILES := settings diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile new file mode 100644 index 000000000000..cfc7f4f97fd1 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -0,0 +1,5 @@ +##TEST_GEN_FILES := test_unix_oob +TEST_PROGS := test_unix_oob +include ../../lib.mk + +all: $(TEST_PROGS) diff --git a/tools/testing/selftests/net/af_unix/test_unix_oob.c b/tools/testing/selftests/net/af_unix/test_unix_oob.c new file mode 100644 index 000000000000..0f3e3763f4f8 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/test_unix_oob.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <unistd.h> +#include <string.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <errno.h> +#include <netinet/tcp.h> +#include <sys/un.h> +#include <sys/signal.h> +#include <sys/poll.h> + +static int pipefd[2]; +static int signal_recvd; +static pid_t producer_id; +static char sock_name[32]; + +static void sig_hand(int sn, siginfo_t *si, void *p) +{ + signal_recvd = sn; +} + +static int set_sig_handler(int signal) +{ + struct sigaction sa; + + sa.sa_sigaction = sig_hand; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO | SA_RESTART; + + return sigaction(signal, &sa, NULL); +} + +static void set_filemode(int fd, int set) +{ + int flags = fcntl(fd, F_GETFL, 0); + + if (set) + flags &= ~O_NONBLOCK; + else + flags |= O_NONBLOCK; + fcntl(fd, F_SETFL, flags); +} + +static void signal_producer(int fd) +{ + char cmd; + + cmd = 'S'; + write(fd, &cmd, sizeof(cmd)); +} + +static void wait_for_signal(int fd) +{ + char buf[5]; + + read(fd, buf, 5); +} + +static void die(int status) +{ + fflush(NULL); + unlink(sock_name); + kill(producer_id, SIGTERM); + exit(status); +} + +int is_sioctatmark(int fd) +{ + int ans = -1; + + if (ioctl(fd, SIOCATMARK, &ans, sizeof(ans)) < 0) { +#ifdef DEBUG + perror("SIOCATMARK Failed"); +#endif + } + return ans; +} + +void read_oob(int fd, char *c) +{ + + *c = ' '; + if (recv(fd, c, sizeof(*c), MSG_OOB) < 0) { +#ifdef DEBUG + perror("Reading MSG_OOB Failed"); +#endif + } +} + +int read_data(int pfd, char *buf, int size) +{ + int len = 0; + + memset(buf, '0', size); + len = read(pfd, buf, size); +#ifdef DEBUG + if
(len < 0) + perror("read failed"); +#endif + return len; +} + +static void wait_for_data(int pfd, int event) +{ + struct pollfd pfds[1]; + + pfds[0].fd = pfd; + pfds[0].events = event; + poll(pfds, 1, -1); +} + +void producer(struct sockaddr_un *consumer_addr) +{ + int cfd; + char buf[64]; + int i; + + memset(buf, 'x', sizeof(buf)); + cfd = socket(AF_UNIX, SOCK_STREAM, 0); + + wait_for_signal(pipefd[0]); + if (connect(cfd, (struct sockaddr *)consumer_addr, + sizeof(struct sockaddr)) != 0) { + perror("Connect failed"); + kill(0, SIGTERM); + exit(1); + } + + for (i = 0; i < 2; i++) { + /* Test 1: Test for SIGURG and OOB */ + wait_for_signal(pipefd[0]); + memset(buf, 'x', sizeof(buf)); + buf[63] = '@'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + wait_for_signal(pipefd[0]); + + /* Test 2: Test for OOB being overwritten */ + memset(buf, 'x', sizeof(buf)); + buf[63] = '%'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + memset(buf, 'x', sizeof(buf)); + buf[63] = '#'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + wait_for_signal(pipefd[0]); + + /* Test 3: Test for SIOCATMARK */ + memset(buf, 'x', sizeof(buf)); + buf[63] = '@'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + memset(buf, 'x', sizeof(buf)); + buf[63] = '%'; + send(cfd, buf, sizeof(buf), MSG_OOB); + + memset(buf, 'x', sizeof(buf)); + send(cfd, buf, sizeof(buf), 0); + + wait_for_signal(pipefd[0]); + + /* Test 4: Test for 1byte OOB msg */ + memset(buf, 'x', sizeof(buf)); + buf[0] = '@'; + send(cfd, buf, 1, MSG_OOB); + } +} + +int +main(int argc, char **argv) +{ + int lfd, pfd; + struct sockaddr_un consumer_addr, paddr; + socklen_t len = sizeof(consumer_addr); + char buf[1024]; + int on = 0; + char oob; + int flags; + int atmark; + char *tmp_file; + + lfd = socket(AF_UNIX, SOCK_STREAM, 0); + memset(&consumer_addr, 0, sizeof(consumer_addr)); + consumer_addr.sun_family = AF_UNIX; + sprintf(sock_name, "unix_oob_%d", getpid()); + unlink(sock_name); + strcpy(consumer_addr.sun_path, sock_name); + + if ((bind(lfd, (struct sockaddr *)&consumer_addr, + sizeof(consumer_addr))) != 0) { + perror("socket bind failed"); + exit(1); + } + + pipe(pipefd); + + listen(lfd, 1); + + producer_id = fork(); + if (producer_id == 0) { + producer(&consumer_addr); + exit(0); + } + + set_sig_handler(SIGURG); + signal_producer(pipefd[1]); + + pfd = accept(lfd, (struct sockaddr *) &paddr, &len); + fcntl(pfd, F_SETOWN, getpid()); + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 1: + * verify that SIGURG is + * delivered and 63 bytes are + * read and oob is '@' + */ + wait_for_data(pfd, POLLIN | POLLPRI); + read_oob(pfd, &oob); + len = read_data(pfd, buf, 1024); + if (!signal_recvd || len != 63 || oob != '@') { + fprintf(stderr, "Test 1 failed, sigurg %d len %d %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 2: + * Verify that the first OOB is overwritten by + * the 2nd one and the first OOB is returned as + * part of the read, and sigurg is received. + */ + wait_for_data(pfd, POLLIN | POLLPRI); + len = 0; + while (len < 70) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + read_oob(pfd, &oob); + if (!signal_recvd || len != 127 || oob != '#') { + fprintf(stderr, "Test 2 failed, sigurg %d len %d OOB %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 3: + * verify that 2nd oob overwrites + * the first one and read breaks at + * oob boundary returning 127 bytes + * and sigurg is received and atmark + * is set.
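+ * (Only one OOB byte can be pending at a time: a later MSG_OOB + * send moves the urgent mark, and the earlier OOB byte is then + * returned as ordinary in-band data.)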
+ * oob is '%' and second read returns + * 64 bytes. + */ + len = 0; + wait_for_data(pfd, POLLIN | POLLPRI); + while (len < 150) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + read_oob(pfd, &oob); + + if (!signal_recvd || len != 127 || oob != '%' || atmark != 1) { + fprintf(stderr, "Test 3 failed, sigurg %d len %d OOB %c atmark %d\n", + signal_recvd, len, oob, atmark); + die(1); + } + + signal_recvd = 0; + + len = read_data(pfd, buf, 1024); + if (len != 64) { + fprintf(stderr, "Test 3.1 failed, sigurg %d len %d OOB %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 4: + * verify that a single byte + * oob message is delivered. + * set non-blocking mode and + * check proper error is + * returned and sigurg is + * received and correct + * oob is read. + */ + + set_filemode(pfd, 0); + + wait_for_data(pfd, POLLIN | POLLPRI); + len = read_data(pfd, buf, 1024); + if ((len == -1) && (errno == EAGAIN)) + len = 0; + + read_oob(pfd, &oob); + + if (!signal_recvd || len != 0 || oob != '@') { + fprintf(stderr, "Test 4 failed, sigurg %d len %d OOB %c\n", + signal_recvd, len, oob); + die(1); + } + + set_filemode(pfd, 1); + + /* Inline Testing */ + + on = 1; + if (setsockopt(pfd, SOL_SOCKET, SO_OOBINLINE, &on, sizeof(on))) { + perror("SO_OOBINLINE"); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 1 -- Inline: + * Check that SIGURG is + * delivered and 63 bytes are + * read and oob is '@' + */ + + wait_for_data(pfd, POLLIN | POLLPRI); + len = read_data(pfd, buf, 1024); + + if (!signal_recvd || len != 63) { + fprintf(stderr, "Test 1 Inline failed, sigurg %d len %d\n", + signal_recvd, len); + die(1); + } + + len = read_data(pfd, buf, 1024); + + if (len != 1) { + fprintf(stderr, + "Test 1.1 Inline failed, sigurg %d len %d oob %c\n", + signal_recvd, len, oob); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 2 -- Inline: + * Verify that the first OOB is overwritten by + * the 2nd one and read breaks correctly on + * 2nd OOB boundary with the first OOB returned as + * part of the read, and sigurg is delivered and + * siocatmark returns true. + * next read returns one byte, the oob byte + * and siocatmark returns false. + */ + len = 0; + wait_for_data(pfd, POLLIN | POLLPRI); + while (len < 70) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (len != 127 || atmark != 1 || !signal_recvd) { + fprintf(stderr, "Test 2 Inline failed, len %d atmark %d\n", + len, atmark); + die(1); + } + + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (len != 1 || buf[0] != '#' || atmark == 1) { + fprintf(stderr, "Test 2.1 Inline failed, len %d data %c atmark %d\n", + len, buf[0], atmark); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 3 -- Inline: + * verify that 2nd oob overwrites + * the first one and read breaks at + * oob boundary returning 127 bytes + * and sigurg is received and siocatmark + * is true after the read. + * subsequent read returns 65 bytes + * because of oob which should be '%'.
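+ * (With SO_OOBINLINE, the OOB byte is left in the normal data + * stream rather than being consumed by a MSG_OOB read, which is + * why the second read includes it.)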
+ */ + len = 0; + wait_for_data(pfd, POLLIN | POLLPRI); + while (len < 126) + len = recv(pfd, buf, 1024, MSG_PEEK); + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (!signal_recvd || len != 127 || !atmark) { + fprintf(stderr, + "Test 3 Inline failed, sigurg %d len %d data %c\n", + signal_recvd, len, buf[0]); + die(1); + } + + len = read_data(pfd, buf, 1024); + atmark = is_sioctatmark(pfd); + if (len != 65 || buf[0] != '%' || atmark != 0) { + fprintf(stderr, + "Test 3.1 Inline failed, len %d oob %c atmark %d\n", + len, buf[0], atmark); + die(1); + } + + signal_recvd = 0; + signal_producer(pipefd[1]); + + /* Test 4 -- Inline: + * verify that a single + * byte oob message is delivered + * and read returns one byte, the oob + * byte and sigurg is received + */ + wait_for_data(pfd, POLLIN | POLLPRI); + len = read_data(pfd, buf, 1024); + if (!signal_recvd || len != 1 || buf[0] != '@') { + fprintf(stderr, + "Test 4 Inline failed, signal %d len %d data %c\n", + signal_recvd, len, buf[0]); + die(1); + } + die(0); +} diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 6f905b53904f..21b646d10b88 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -42,3 +42,4 @@ CONFIG_NET_CLS_FLOWER=m CONFIG_NET_ACT_TUNNEL_KEY=m CONFIG_NET_ACT_MIRRED=m CONFIG_BAREUDP=m +CONFIG_IPV6_IOAM6_LWTUNNEL=y diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index a8ad92850e63..162e5f1ac36b 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -3879,6 +3879,32 @@ use_case_ping_lla_multi() log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-C" } +# Perform IPv{4,6} SNAT on ns-A, and verify TCP connection is successfully +# established with ns-B. +use_case_snat_on_vrf() +{ + setup "yes" + + local port="12345" + + run_cmd iptables -t nat -A POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP} -o ${VRF} + run_cmd ip6tables -t nat -A POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP6} -o ${VRF} + + run_cmd_nsb nettest -s -l ${NSB_IP} -p ${port} & + sleep 1 + run_cmd nettest -d ${VRF} -r ${NSB_IP} -p ${port} + log_test $? 0 "IPv4 TCP connection over VRF with SNAT" + + run_cmd_nsb nettest -6 -s -l ${NSB_IP6} -p ${port} & + sleep 1 + run_cmd nettest -6 -d ${VRF} -r ${NSB_IP6} -p ${port} + log_test $? 0 "IPv6 TCP connection over VRF with SNAT" + + # Cleanup + run_cmd iptables -t nat -D POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP} -o ${VRF} + run_cmd ip6tables -t nat -D POSTROUTING -p tcp -m tcp --dport ${port} -j SNAT --to-source ${NSA_LO_IP6} -o ${VRF} +} + use_cases() { log_section "Use cases" @@ -3886,6 +3912,8 @@ use_cases() use_case_br log_subsection "Ping LLA with multiple interfaces" use_case_ping_lla_multi + log_subsection "SNAT on VRF" + use_case_snat_on_vrf } ################################################################################ diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c new file mode 100644 index 000000000000..cf37ce86b0fd --- /dev/null +++ b/tools/testing/selftests/net/gro.c @@ -0,0 +1,1095 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This testsuite provides conformance testing for GRO coalescing. + * + * Test cases: + * 1.data + * Data packets of the same size and same header setup with correct + * sequence numbers coalesce. 
The one exception is the last data
+ *	packet of a flow: it can be smaller than the rest and will still
+ *	be coalesced as long as it is in the same flow.
+ * 2.ack
+ *	Pure ACK does not coalesce.
+ * 3.flags
+ *	Specific test cases: no packets with PSH, SYN, URG, RST set will
+ *	be coalesced.
+ * 4.tcp
+ *	Packets with incorrect checksum, non-consecutive seqno and
+ *	different TCP header options shouldn't coalesce. Note that since
+ *	some TCP options (such as the timestamp) include padding, headers
+ *	that are padded differently will not be coalesced either.
+ * 5.ip:
+ *	Packets with different (ECN, TTL, TOS) header, ip options or
+ *	ip fragments (ipv6) shouldn't coalesce.
+ * 6.large:
+ *	Packets larger than GRO_MAX_SIZE shouldn't coalesce.
+ *
+ * MSS is defined as 4096 - header because if it is too small
+ * (i.e. 1500 MTU - header), it will result in many packets,
+ * increasing the "large" test case's flakiness. This is because,
+ * due to the time sensitivity of the coalescing window, the receiver
+ * may not coalesce all of the packets.
+ *
+ * Note that the timing issue applies to all of the test cases, so some
+ * flakiness is to be expected.
+ *
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <getopt.h>
+#include <linux/filter.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+
+#define DPORT 8000
+#define SPORT 1500
+#define PAYLOAD_LEN 100
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#define NUM_PACKETS 4
+#define START_SEQ 100
+#define START_ACK 100
+#define SIP6 "fdaa::2"
+#define DIP6 "fdaa::1"
+#define SIP4 "192.168.1.200"
+#define DIP4 "192.168.1.100"
+#define ETH_P_NONE 0
+#define TOTAL_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
+#define MSS (4096 - sizeof(struct tcphdr) - sizeof(struct ipv6hdr))
+#define MAX_PAYLOAD (IP_MAXPACKET - sizeof(struct tcphdr) - sizeof(struct ipv6hdr))
+#define NUM_LARGE_PKT (MAX_PAYLOAD / MSS)
+#define MAX_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
+
+static int proto = -1;
+static uint8_t src_mac[ETH_ALEN], dst_mac[ETH_ALEN];
+static char *testname = "data";
+static char *ifname = "eth0";
+static char *smac = "aa:00:00:00:00:02";
+static char *dmac = "aa:00:00:00:00:01";
+static bool verbose;
+static bool tx_socket = true;
+static int tcp_offset = -1;
+static int total_hdr_len = -1;
+static int ethhdr_proto = -1;
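+/*
+ * Usage note (illustrative, see gro.sh): one instance runs as the receiver
+ * with --rx in one namespace while the sender transmits from another, e.g.:
+ *
+ *	./gro --ipv6 --rx --iface server --test large
+ *	./gro --ipv6 --iface client --test large
+ *
+ * plus --smac/--dmac matching the macvlan addresses set up by the script.
+ */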
+static void vlog(const char *fmt, ...)
+{
+	va_list args;
+
+	if (verbose) {
+		va_start(args, fmt);
+		vfprintf(stderr, fmt, args);
+		va_end(args);
+	}
+}
+
+static void setup_sock_filter(int fd)
+{
+	const int dport_off = tcp_offset + offsetof(struct tcphdr, dest);
+	const int ethproto_off = offsetof(struct ethhdr, h_proto);
+	int optlen = 0;
+	int ipproto_off;
+	int next_off;
+
+	if (proto == PF_INET)
+		next_off = offsetof(struct iphdr, protocol);
+	else
+		next_off = offsetof(struct ipv6hdr, nexthdr);
+	ipproto_off = ETH_HLEN + next_off;
+
+	/* The "ip" test inserts an extension (IPv4 timestamp option or
+	 * IPv6 fragment header), which shifts the TCP header by optlen;
+	 * the filter below therefore also checks the shifted port offset.
+	 */
+	if (strcmp(testname, "ip") == 0) {
+		if (proto == PF_INET)
+			optlen = sizeof(struct ip_timestamp);
+		else
+			optlen = sizeof(struct ip6_frag);
+	}
+
+	/* Accept only TCP packets of our ethertype whose destination port
+	 * is DPORT, at either the normal or the option-shifted offset.
+	 */
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ethproto_off),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 7),
+		BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ipproto_off),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 0, 5),
+		BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 2, 0),
+		BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off + optlen),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 0, 1),
+		BPF_STMT(BPF_RET + BPF_K, 0xFFFFFFFF),
+		BPF_STMT(BPF_RET + BPF_K, 0),
+	};
+
+	struct sock_fprog bpf = {
+		.len = ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)) < 0)
+		error(1, errno, "error setting filter");
+}
+
+static uint32_t checksum_nofold(void *data, size_t len, uint32_t sum)
+{
+	uint16_t *words = data;
+	int i;
+
+	for (i = 0; i < len / 2; i++)
+		sum += words[i];
+	if (len & 1)
+		sum += ((char *)data)[len - 1];
+	return sum;
+}
+
+static uint16_t checksum_fold(void *data, size_t len, uint32_t sum)
+{
+	sum = checksum_nofold(data, len, sum);
+	while (sum > 0xFFFF)
+		sum = (sum & 0xFFFF) + (sum >> 16);
+	return ~sum;
+}
+
+static uint16_t tcp_checksum(void *buf, int payload_len)
+{
+	struct pseudo_header6 {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		uint16_t protocol;
+		uint16_t payload_len;
+	} ph6;
+	struct pseudo_header4 {
+		struct in_addr saddr;
+		struct in_addr daddr;
+		uint16_t protocol;
+		uint16_t payload_len;
+	} ph4;
+	uint32_t sum = 0;
+
+	if (proto == PF_INET6) {
+		if (inet_pton(AF_INET6, SIP6, &ph6.saddr) != 1)
+			error(1, errno, "inet_pton6 source ip pseudo");
+		if (inet_pton(AF_INET6, DIP6, &ph6.daddr) != 1)
+			error(1, errno, "inet_pton6 dest ip pseudo");
+		ph6.protocol = htons(IPPROTO_TCP);
+		ph6.payload_len = htons(sizeof(struct tcphdr) + payload_len);
+
+		sum = checksum_nofold(&ph6, sizeof(ph6), 0);
+	} else if (proto == PF_INET) {
+		if (inet_pton(AF_INET, SIP4, &ph4.saddr) != 1)
+			error(1, errno, "inet_pton source ip pseudo");
+		if (inet_pton(AF_INET, DIP4, &ph4.daddr) != 1)
+			error(1, errno, "inet_pton dest ip pseudo");
+		ph4.protocol = htons(IPPROTO_TCP);
+		ph4.payload_len = htons(sizeof(struct tcphdr) + payload_len);
+
+		sum = checksum_nofold(&ph4, sizeof(ph4), 0);
+	}
+
+	return checksum_fold(buf, sizeof(struct tcphdr) + payload_len, sum);
+}
+
+static void read_MAC(uint8_t *mac_addr, char *mac)
+{
+	if (sscanf(mac, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+		   &mac_addr[0], &mac_addr[1], &mac_addr[2],
+		   &mac_addr[3], &mac_addr[4], &mac_addr[5]) != 6)
+		error(1, 0, "sscanf");
+}
+
+static void fill_datalinklayer(void *buf)
+{
+	struct ethhdr *eth = buf;
+
+	memcpy(eth->h_dest, dst_mac, ETH_ALEN);
+	memcpy(eth->h_source, src_mac, ETH_ALEN);
+	eth->h_proto = ethhdr_proto;
+}
+
+static void fill_networklayer(void *buf, int payload_len)
+{
+	struct ipv6hdr *ip6h = buf;
+	struct iphdr *iph = buf;
+
+	if (proto == PF_INET6) {
+		memset(ip6h, 0, sizeof(*ip6h));
+
+		ip6h->version = 6;
+		ip6h->payload_len = htons(sizeof(struct tcphdr) + payload_len);
+		ip6h->nexthdr = IPPROTO_TCP;
+		ip6h->hop_limit = 8;
+		if (inet_pton(AF_INET6, SIP6, &ip6h->saddr) != 1)
+			error(1, errno, "inet_pton source ip6");
+		if (inet_pton(AF_INET6, DIP6, &ip6h->daddr) != 1)
+			error(1, errno, "inet_pton dest ip6");
+	} else if (proto == PF_INET) {
+		memset(iph, 0, sizeof(*iph));
+
+		iph->version = 4;
+		iph->ihl = 5;
+		iph->ttl = 8;
+		iph->protocol = IPPROTO_TCP;
+		iph->tot_len = htons(sizeof(struct tcphdr) +
+				payload_len + sizeof(struct iphdr));
+		iph->frag_off = htons(0x4000); /* DF = 1, MF = 0 */
+		if (inet_pton(AF_INET, SIP4, &iph->saddr) != 1)
+			error(1, errno, "inet_pton source ip");
+		if (inet_pton(AF_INET, DIP4, &iph->daddr) != 1)
+			error(1, errno, "inet_pton dest ip");
+		iph->check = checksum_fold(buf, sizeof(struct iphdr), 0);
+	}
+}
+
+static void fill_transportlayer(void *buf, int seq_offset, int ack_offset,
+				int payload_len, int fin)
+{
+	struct tcphdr *tcph = buf;
+
+	memset(tcph, 0, sizeof(*tcph));
+
+	tcph->source = htons(SPORT);
+	tcph->dest = htons(DPORT);
+	tcph->seq = htonl(START_SEQ + seq_offset);
+	tcph->ack_seq = htonl(START_ACK + ack_offset);
+	tcph->ack = 1;
+	tcph->fin = fin;
+	tcph->doff = 5;
+	tcph->window = htons(TCP_MAXWIN);
+	tcph->urg_ptr = 0;
+	tcph->check = tcp_checksum(tcph, payload_len);
+}
+
+static void write_packet(int fd, char *buf, int len, struct sockaddr_ll *daddr)
+{
+	int ret = -1;
+
+	ret = sendto(fd, buf, len, 0, (struct sockaddr *)daddr, sizeof(*daddr));
+	if (ret == -1)
+		error(1, errno, "sendto failure");
+	if (ret != len)
+		error(1, errno, "sendto wrong length");
+}
+
+static void create_packet(void *buf, int seq_offset, int ack_offset,
+			  int payload_len, int fin)
+{
+	memset(buf, 0, total_hdr_len);
+	memset(buf + total_hdr_len, 'a', payload_len);
+	fill_transportlayer(buf + tcp_offset, seq_offset, ack_offset,
+			    payload_len, fin);
+	fill_networklayer(buf + ETH_HLEN, payload_len);
+	fill_datalinklayer(buf);
+}
+
+/* Send one packet with an extra TCP flag set; it is neither the first
+ * nor the last packet of the batch.
+ */
+static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn,
+		       int rst, int urg)
+{
+	static char flag_buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	int payload_len, pkt_size, flag, i;
+	struct tcphdr *tcph;
+
+	payload_len = PAYLOAD_LEN * psh;
+	pkt_size = total_hdr_len + payload_len;
+	flag = NUM_PACKETS / 2;
+
+	create_packet(flag_buf, flag * payload_len, 0, payload_len, 0);
+
+	tcph = (struct tcphdr *)(flag_buf + tcp_offset);
+	tcph->psh = psh;
+	tcph->syn = syn;
+	tcph->rst = rst;
+	tcph->urg = urg;
+	tcph->check = 0;
+	tcph->check = tcp_checksum(tcph, payload_len);
+
+	for (i = 0; i < NUM_PACKETS + 1; i++) {
+		if (i == flag) {
+			write_packet(fd, flag_buf, pkt_size, daddr);
+			continue;
+		}
+		create_packet(buf, i * PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+		write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr);
+	}
+}
+
+/* Test data packets where the second packet is the same size as,
+ * smaller than, or larger than the first.
+ */
+static void send_data_pkts(int fd, struct sockaddr_ll *daddr,
+			   int payload_len1, int payload_len2)
+{
+	static char buf[ETH_HLEN + IP_MAXPACKET];
+
+	create_packet(buf, 0, 0, payload_len1, 0);
+	write_packet(fd, buf, total_hdr_len + payload_len1, daddr);
+	create_packet(buf, payload_len1, 0, payload_len2, 0);
+	write_packet(fd, buf, total_hdr_len + payload_len2, daddr);
+}
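+/*
+ * Expected receiver geometry for the three "data" cases (cf. gro_receiver):
+ * 100+100 coalesces into one 200-byte payload, 100+50 into one 150-byte
+ * payload (a flow's last packet may be smaller), while 50+100 arrives as
+ * two packets, since a larger segment is not merged after a smaller one.
+ */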
+/* If incoming segments make the tracked segment length exceed the
+ * legal IP datagram length, do not coalesce.
+ */
+static void send_large(int fd, struct sockaddr_ll *daddr, int remainder)
+{
+	static char pkts[NUM_LARGE_PKT][TOTAL_HDR_LEN + MSS];
+	static char last[TOTAL_HDR_LEN + MSS];
+	static char new_seg[TOTAL_HDR_LEN + MSS];
+	int i;
+
+	for (i = 0; i < NUM_LARGE_PKT; i++)
+		create_packet(pkts[i], i * MSS, 0, MSS, 0);
+	create_packet(last, NUM_LARGE_PKT * MSS, 0, remainder, 0);
+	create_packet(new_seg, (NUM_LARGE_PKT + 1) * MSS, 0, remainder, 0);
+
+	for (i = 0; i < NUM_LARGE_PKT; i++)
+		write_packet(fd, pkts[i], total_hdr_len + MSS, daddr);
+	write_packet(fd, last, total_hdr_len + remainder, daddr);
+	write_packet(fd, new_seg, total_hdr_len + remainder, daddr);
+}
+
+/* Pure acks and dup acks don't coalesce */
+static void send_ack(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[MAX_HDR_LEN];
+
+	create_packet(buf, 0, 0, 0, 0);
+	write_packet(fd, buf, total_hdr_len, daddr);
+	write_packet(fd, buf, total_hdr_len, daddr);
+	create_packet(buf, 0, 1, 0, 0);
+	write_packet(fd, buf, total_hdr_len, daddr);
+}
+
+/* Rebuild a packet from no_ext after inserting extlen bytes of TCP
+ * options, fixing up doff, lengths and checksums accordingly.
+ */
+static void recompute_packet(char *buf, char *no_ext, int extlen)
+{
+	struct tcphdr *tcphdr = (struct tcphdr *)(buf + tcp_offset);
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN);
+	struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN);
+
+	memmove(buf, no_ext, total_hdr_len);
+	memmove(buf + total_hdr_len + extlen,
+		no_ext + total_hdr_len, PAYLOAD_LEN);
+
+	tcphdr->doff = tcphdr->doff + (extlen / 4);
+	tcphdr->check = 0;
+	tcphdr->check = tcp_checksum(tcphdr, PAYLOAD_LEN + extlen);
+	if (proto == PF_INET) {
+		iph->tot_len = htons(ntohs(iph->tot_len) + extlen);
+		iph->check = 0;
+		iph->check = checksum_fold(iph, sizeof(struct iphdr), 0);
+	} else {
+		ip6h->payload_len = htons(ntohs(ip6h->payload_len) + extlen);
+	}
+}
+
+static void tcp_write_options(char *buf, int kind, int ts)
+{
+	struct tcp_option_ts {
+		uint8_t kind;
+		uint8_t len;
+		uint32_t tsval;
+		uint32_t tsecr;
+	} *opt_ts = (void *)buf;
+	struct tcp_option_window {
+		uint8_t kind;
+		uint8_t len;
+		uint8_t shift;
+	} *opt_window = (void *)buf;
+
+	switch (kind) {
+	case TCPOPT_NOP:
+		buf[0] = TCPOPT_NOP;
+		break;
+	case TCPOPT_WINDOW:
+		memset(opt_window, 0, sizeof(struct tcp_option_window));
+		opt_window->kind = TCPOPT_WINDOW;
+		opt_window->len = TCPOLEN_WINDOW;
+		opt_window->shift = 0;
+		break;
+	case TCPOPT_TIMESTAMP:
+		memset(opt_ts, 0, sizeof(struct tcp_option_ts));
+		opt_ts->kind = TCPOPT_TIMESTAMP;
+		opt_ts->len = TCPOLEN_TIMESTAMP;
+		opt_ts->tsval = ts;
+		opt_ts->tsecr = 0;
+		break;
+	default:
+		error(1, 0, "unimplemented TCP option");
+		break;
+	}
+}
+
+/* TCP options here are always a permutation of {TS, NOP, NOP}.
+ * Implement the different orders to verify that coalescing stops.
+ */
+static void add_standard_tcp_options(char *buf, char *no_ext, int ts, int order)
+{
+	switch (order) {
+	case 0:
+		tcp_write_options(buf + total_hdr_len, TCPOPT_NOP, 0);
+		tcp_write_options(buf + total_hdr_len + 1, TCPOPT_NOP, 0);
+		tcp_write_options(buf + total_hdr_len + 2 /* two NOP opts */,
+				  TCPOPT_TIMESTAMP, ts);
+		break;
+	case 1:
+		tcp_write_options(buf + total_hdr_len, TCPOPT_NOP, 0);
+		tcp_write_options(buf + total_hdr_len + 1,
+				  TCPOPT_TIMESTAMP, ts);
+		tcp_write_options(buf + total_hdr_len + 1 + TCPOLEN_TIMESTAMP,
+				  TCPOPT_NOP, 0);
+		break;
+	case 2:
+		tcp_write_options(buf + total_hdr_len, TCPOPT_TIMESTAMP, ts);
+		tcp_write_options(buf + total_hdr_len + TCPOLEN_TIMESTAMP + 1,
+				  TCPOPT_NOP, 0);
+		tcp_write_options(buf + total_hdr_len + TCPOLEN_TIMESTAMP + 2,
+				  TCPOPT_NOP, 0);
+		break;
+	default:
+		error(1, 0, "unknown order");
+		break;
+	}
+	recompute_packet(buf, no_ext, TCPOLEN_TSTAMP_APPA);
+}
+
+/* Packets with an invalid checksum don't coalesce. */
+static void send_changed_checksum(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	struct tcphdr *tcph = (struct tcphdr *)(buf + tcp_offset);
+	int pkt_size = total_hdr_len + PAYLOAD_LEN;
+
+	create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+	write_packet(fd, buf, pkt_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+	tcph->check = tcph->check - 1;
+	write_packet(fd, buf, pkt_size, daddr);
+}
+
+/* Packets with non-consecutive sequence numbers don't coalesce. */
+static void send_changed_seq(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	struct tcphdr *tcph = (struct tcphdr *)(buf + tcp_offset);
+	int pkt_size = total_hdr_len + PAYLOAD_LEN;
+
+	create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+	write_packet(fd, buf, pkt_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+	tcph->seq = htonl(ntohl(tcph->seq) + 1);
+	tcph->check = 0;
+	tcph->check = tcp_checksum(tcph, PAYLOAD_LEN);
+	write_packet(fd, buf, pkt_size, daddr);
+}
+
+/* Packets with a different timestamp option or different timestamps
+ * don't coalesce.
+ */
+static void send_changed_ts(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char extpkt[sizeof(buf) + TCPOLEN_TSTAMP_APPA];
+	int pkt_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_TSTAMP_APPA;
+
+	create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+	add_standard_tcp_options(extpkt, buf, 0, 0);
+	write_packet(fd, extpkt, pkt_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+	add_standard_tcp_options(extpkt, buf, 0, 0);
+	write_packet(fd, extpkt, pkt_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
+	add_standard_tcp_options(extpkt, buf, 100, 0);
+	write_packet(fd, extpkt, pkt_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN * 3, 0, PAYLOAD_LEN, 0);
+	add_standard_tcp_options(extpkt, buf, 100, 1);
+	write_packet(fd, extpkt, pkt_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN * 4, 0, PAYLOAD_LEN, 0);
+	add_standard_tcp_options(extpkt, buf, 100, 2);
+	write_packet(fd, extpkt, pkt_size, daddr);
+}
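+/*
+ * Layout note (illustrative): the 12 bytes appended above match
+ * TCPOLEN_TSTAMP_APPA; for order 0 the raw option bytes are
+ *
+ *	01 01 08 0a <tsval:4> <tsecr:4>	(NOP, NOP, TS kind=8 len=10)
+ *
+ * GRO compares the raw option bytes, so reordering the same options
+ * defeats coalescing even though they are semantically identical.
+ */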
+/* Packets with different TCP options don't coalesce. */
+static void send_diff_opt(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char extpkt1[sizeof(buf) + TCPOLEN_TSTAMP_APPA];
+	static char extpkt2[sizeof(buf) + TCPOLEN_MAXSEG];
+	int extpkt1_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_TSTAMP_APPA;
+	int extpkt2_size = total_hdr_len + PAYLOAD_LEN + TCPOLEN_MAXSEG;
+
+	create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+	add_standard_tcp_options(extpkt1, buf, 0, 0);
+	write_packet(fd, extpkt1, extpkt1_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+	add_standard_tcp_options(extpkt1, buf, 0, 0);
+	write_packet(fd, extpkt1, extpkt1_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
+	tcp_write_options(extpkt2 + MAX_HDR_LEN, TCPOPT_NOP, 0);
+	tcp_write_options(extpkt2 + MAX_HDR_LEN + 1, TCPOPT_WINDOW, 0);
+	recompute_packet(extpkt2, buf, TCPOLEN_WINDOW + 1);
+	write_packet(fd, extpkt2, extpkt2_size, daddr);
+}
+
+static void add_ipv4_ts_option(void *buf, void *optpkt)
+{
+	struct ip_timestamp *ts = (struct ip_timestamp *)(optpkt + tcp_offset);
+	int optlen = sizeof(struct ip_timestamp);
+	struct iphdr *iph;
+
+	if (optlen % 4)
+		error(1, 0, "ipv4 timestamp length is not a multiple of 4B");
+
+	ts->ipt_code = IPOPT_TS;
+	ts->ipt_len = optlen;
+	ts->ipt_ptr = 5;
+	ts->ipt_flg = IPOPT_TS_TSONLY;
+
+	memcpy(optpkt, buf, tcp_offset);
+	memcpy(optpkt + tcp_offset + optlen, buf + tcp_offset,
+	       sizeof(struct tcphdr) + PAYLOAD_LEN);
+
+	iph = (struct iphdr *)(optpkt + ETH_HLEN);
+	iph->ihl = 5 + (optlen / 4);
+	iph->tot_len = htons(ntohs(iph->tot_len) + optlen);
+	iph->check = 0;
+	iph->check = checksum_fold(iph, sizeof(struct iphdr) + optlen, 0);
+}
+
+/* IPv4 options shouldn't coalesce */
+static void send_ip_options(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char optpkt[sizeof(buf) + sizeof(struct ip_timestamp)];
+	int optlen = sizeof(struct ip_timestamp);
+	int pkt_size = total_hdr_len + PAYLOAD_LEN + optlen;
+
+	create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+	write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr);
+
+	create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0);
+	add_ipv4_ts_option(buf, optpkt);
+	write_packet(fd, optpkt, pkt_size, daddr);
+
+	create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
+	write_packet(fd, buf, total_hdr_len + PAYLOAD_LEN, daddr);
+}
+
+/* IPv4 fragments shouldn't coalesce */
+static void send_fragment4(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[IP_MAXPACKET];
+	struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN);
+	int pkt_size = total_hdr_len + PAYLOAD_LEN;
+
+	create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+	write_packet(fd, buf, pkt_size, daddr);
+
+	/* Once fragmented, the packet retains its total_len. The TCP
+	 * header is prepared as if the rest of the data were in follow-up
+	 * fragments, but those fragments are never actually sent.
+ */ + memset(buf + total_hdr_len, 'a', PAYLOAD_LEN * 2); + fill_transportlayer(buf + tcp_offset, PAYLOAD_LEN, 0, PAYLOAD_LEN * 2, 0); + fill_networklayer(buf + ETH_HLEN, PAYLOAD_LEN); + fill_datalinklayer(buf); + + iph->frag_off = htons(0x6000); // DF = 1, MF = 1 + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + write_packet(fd, buf, pkt_size, daddr); +} + +/* IPv4 packets with different ttl don't coalesce.*/ +static void send_changed_ttl(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + iph->ttl = 7; + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + write_packet(fd, buf, pkt_size, daddr); +} + +/* Packets with different tos don't coalesce.*/ +static void send_changed_tos(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + if (proto == PF_INET) { + iph->tos = 1; + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } else if (proto == PF_INET6) { + ip6h->priority = 0xf; + } + write_packet(fd, buf, pkt_size, daddr); +} + +/* Packets with different ECN don't coalesce.*/ +static void send_changed_ECN(int fd, struct sockaddr_ll *daddr) +{ + int pkt_size = total_hdr_len + PAYLOAD_LEN; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + struct iphdr *iph = (struct iphdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + if (proto == PF_INET) { + buf[ETH_HLEN + 1] ^= 0x2; // ECN set to 10 + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); + } else { + buf[ETH_HLEN + 1] ^= 0x20; // ECN set to 10 + } + write_packet(fd, buf, pkt_size, daddr); +} + +/* IPv6 fragments and packets with extensions don't coalesce.*/ +static void send_fragment6(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + static char extpkt[MAX_HDR_LEN + PAYLOAD_LEN + + sizeof(struct ip6_frag)]; + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buf + ETH_HLEN); + struct ip6_frag *frag = (void *)(extpkt + tcp_offset); + int extlen = sizeof(struct ip6_frag); + int bufpkt_len = total_hdr_len + PAYLOAD_LEN; + int extpkt_len = bufpkt_len + extlen; + int i; + + for (i = 0; i < 2; i++) { + create_packet(buf, PAYLOAD_LEN * i, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, bufpkt_len, daddr); + } + + create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + memset(extpkt, 0, extpkt_len); + + ip6h->nexthdr = IPPROTO_FRAGMENT; + ip6h->payload_len = htons(ntohs(ip6h->payload_len) + extlen); + frag->ip6f_nxt = IPPROTO_TCP; + + memcpy(extpkt, buf, tcp_offset); + memcpy(extpkt + tcp_offset + extlen, buf + tcp_offset, + sizeof(struct tcphdr) + PAYLOAD_LEN); + write_packet(fd, extpkt, extpkt_len, daddr); + + create_packet(buf, PAYLOAD_LEN * 3, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, bufpkt_len, daddr); +} + +static void bind_packetsocket(int fd) +{ + struct sockaddr_ll daddr = {}; + + 
daddr.sll_family = AF_PACKET;
+	daddr.sll_protocol = ethhdr_proto;
+	daddr.sll_ifindex = if_nametoindex(ifname);
+	if (daddr.sll_ifindex == 0)
+		error(1, errno, "if_nametoindex");
+
+	if (bind(fd, (void *)&daddr, sizeof(daddr)) < 0)
+		error(1, errno, "could not bind socket");
+}
+
+static void set_timeout(int fd)
+{
+	struct timeval timeout;
+
+	timeout.tv_sec = 120;
+	timeout.tv_usec = 0;
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
+		       sizeof(timeout)) < 0)
+		error(1, errno, "cannot set timeout, setsockopt failed");
+}
+
+static void check_recv_pkts(int fd, int *correct_payload,
+			    int correct_num_pkts)
+{
+	static char buffer[IP_MAXPACKET + ETH_HLEN + 1];
+	struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN);
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN);
+	struct tcphdr *tcph;
+	bool bad_packet = false;
+	int tcp_ext_len = 0;
+	int ip_ext_len = 0;
+	int pkt_size = -1;
+	int data_len = 0;
+	int num_pkt = 0;
+	int i;
+
+	vlog("Expected {");
+	for (i = 0; i < correct_num_pkts; i++)
+		vlog("%d ", correct_payload[i]);
+	vlog("}, Total %d packets\nReceived {", correct_num_pkts);
+
+	while (1) {
+		pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0);
+		if (pkt_size < 0)
+			error(1, errno, "could not receive");
+
+		if (iph->version == 4)
+			ip_ext_len = (iph->ihl - 5) * 4;
+		else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP)
+			ip_ext_len = sizeof(struct ip6_frag);
+
+		tcph = (struct tcphdr *)(buffer + tcp_offset + ip_ext_len);
+
+		if (tcph->fin)
+			break;
+
+		tcp_ext_len = (tcph->doff - 5) * 4;
+		data_len = pkt_size - total_hdr_len - tcp_ext_len - ip_ext_len;
+		/* The minimum Ethernet frame payload is 46 (ETH_ZLEN - ETH_HLEN)
+		 * bytes per IEEE 802.3. IPv4/TCP packets with less than 6 bytes
+		 * of data will therefore be padded. Packet sockets are protocol
+		 * agnostic, and will not trim the padding.
+		 */
+		if (pkt_size == ETH_ZLEN && iph->version == 4) {
+			data_len = ntohs(iph->tot_len)
+				- sizeof(struct tcphdr) - sizeof(struct iphdr);
+		}
+		vlog("%d ", data_len);
+		if (data_len != correct_payload[num_pkt]) {
+			vlog("[!=%d]", correct_payload[num_pkt]);
+			bad_packet = true;
+		}
+		num_pkt++;
+	}
+	vlog("}, Total %d packets.\n", num_pkt);
+	if (num_pkt != correct_num_pkts)
+		error(1, 0, "incorrect number of packets");
+	if (bad_packet)
+		error(1, 0, "incorrect packet geometry");
+
+	printf("Test succeeded\n\n");
+}
+
+static void gro_sender(void)
+{
+	static char fin_pkt[MAX_HDR_LEN];
+	struct sockaddr_ll daddr = {};
+	int txfd = -1;
+
+	txfd = socket(PF_PACKET, SOCK_RAW, IPPROTO_RAW);
+	if (txfd < 0)
+		error(1, errno, "socket creation");
+
+	memset(&daddr, 0, sizeof(daddr));
+	daddr.sll_ifindex = if_nametoindex(ifname);
+	if (daddr.sll_ifindex == 0)
+		error(1, errno, "if_nametoindex");
+	daddr.sll_family = AF_PACKET;
+	memcpy(daddr.sll_addr, dst_mac, ETH_ALEN);
+	daddr.sll_halen = ETH_ALEN;
+	create_packet(fin_pkt, PAYLOAD_LEN * 2, 0, 0, 1);
+
+	if (strcmp(testname, "data") == 0) {
+		send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN / 2);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_data_pkts(txfd, &daddr, PAYLOAD_LEN / 2, PAYLOAD_LEN);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ack") == 0) {
+		send_ack(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "flags") == 0) {
+		send_flags(txfd, &daddr, 1, 0, 0, 0);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_flags(txfd, &daddr, 0, 1, 0, 0);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_flags(txfd, &daddr, 0, 0, 1, 0);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_flags(txfd, &daddr, 0, 0, 0, 1);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "tcp") == 0) {
+		send_changed_checksum(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_changed_seq(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_changed_ts(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_diff_opt(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip") == 0) {
+		send_changed_ECN(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_changed_tos(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+		if (proto == PF_INET) {
+			/* Modified packets may be received out of order.
+			 * Sleep calls are added to enforce test boundaries
+			 * so that fin pkts are not received prior to other pkts.
+			 */
+			sleep(1);
+			send_changed_ttl(txfd, &daddr);
+			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+			sleep(1);
+			send_ip_options(txfd, &daddr);
+			sleep(1);
+			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+			sleep(1);
+			send_fragment4(txfd, &daddr);
+			sleep(1);
+			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+		} else if (proto == PF_INET6) {
+			send_fragment6(txfd, &daddr);
+			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+		}
+	} else if (strcmp(testname, "large") == 0) {
+		/* 20 is the difference between the minimum iphdr size
+		 * and the minimum ipv6hdr size. Like MAX_HDR_LEN,
+		 * MAX_PAYLOAD is defined with the larger header of the two.
+		 */
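+		/* Worked numbers (assuming 20-byte TCP and 40-byte IPv6
+		 * headers): MAX_PAYLOAD = 65535 - 60 = 65475 and
+		 * MSS = 4096 - 60 = 4036, so the IPv6 remainder is
+		 * 65475 % 4036 = 899, while the 20 extra bytes of IPv4
+		 * payload shift it to 65495 % 4036 = 919.
+		 */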
+		int offset = proto == PF_INET ? 20 : 0;
+		int remainder = (MAX_PAYLOAD + offset) % MSS;
+
+		send_large(txfd, &daddr, remainder);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+		send_large(txfd, &daddr, remainder + 1);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else {
+		error(1, 0, "Unknown testcase");
+	}
+
+	if (close(txfd))
+		error(1, errno, "socket close");
+}
+
+static void gro_receiver(void)
+{
+	static int correct_payload[NUM_PACKETS];
+	int rxfd = -1;
+
+	rxfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_NONE));
+	if (rxfd < 0)
+		error(1, errno, "socket creation");
+	setup_sock_filter(rxfd);
+	set_timeout(rxfd);
+	bind_packetsocket(rxfd);
+
+	memset(correct_payload, 0, sizeof(correct_payload));
+
+	if (strcmp(testname, "data") == 0) {
+		printf("pure data packet of same size: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 1);
+
+		printf("large data packets followed by a smaller one: ");
+		correct_payload[0] = PAYLOAD_LEN * 1.5;
+		check_recv_pkts(rxfd, correct_payload, 1);
+
+		printf("small data packets followed by a larger one: ");
+		correct_payload[0] = PAYLOAD_LEN / 2;
+		correct_payload[1] = PAYLOAD_LEN;
+		check_recv_pkts(rxfd, correct_payload, 2);
+	} else if (strcmp(testname, "ack") == 0) {
+		printf("duplicate ack and pure ack: ");
+		check_recv_pkts(rxfd, correct_payload, 3);
+	} else if (strcmp(testname, "flags") == 0) {
+		correct_payload[0] = PAYLOAD_LEN * 3;
+		correct_payload[1] = PAYLOAD_LEN * 2;
+
+		printf("psh flag ends coalescing: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = 0;
+		correct_payload[2] = PAYLOAD_LEN * 2;
+		printf("syn flag ends coalescing: ");
+		check_recv_pkts(rxfd, correct_payload, 3);
+
+		printf("rst flag ends coalescing: ");
+		check_recv_pkts(rxfd, correct_payload, 3);
+
+		printf("urg flag ends coalescing: ");
+		check_recv_pkts(rxfd, correct_payload, 3);
+	} else if (strcmp(testname, "tcp") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
+		correct_payload[2] = PAYLOAD_LEN;
+		correct_payload[3] = PAYLOAD_LEN;
+
+		printf("changed checksum does not coalesce: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+		printf("Wrong Seq number doesn't coalesce: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+		printf("Different timestamp doesn't coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 4);
+
+		printf("Different options don't coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 2);
+	} else if (strcmp(testname, "ip") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
+
+		printf("different ECN doesn't coalesce: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+		printf("different tos doesn't coalesce: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+		if (proto == PF_INET) {
+			printf("different ttl doesn't coalesce: ");
+			check_recv_pkts(rxfd, correct_payload, 2);
+
+			printf("ip options don't coalesce: ");
+			correct_payload[2] = PAYLOAD_LEN;
+			check_recv_pkts(rxfd, correct_payload, 3);
+
+			printf("fragmented ip4 doesn't coalesce: ");
+			check_recv_pkts(rxfd, correct_payload, 2);
+		} else if (proto == PF_INET6) {
+			/* GRO doesn't check the ipv6 hop limit when flushing.
+			 * Hence there is no test corresponding to the ipv4 case.
+			 */
+			printf("fragmented ip6 doesn't coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			check_recv_pkts(rxfd, correct_payload, 2);
+		}
+	} else if (strcmp(testname, "large") == 0) {
+		int offset = proto == PF_INET ? 20 : 0;
+		int remainder = (MAX_PAYLOAD + offset) % MSS;
+
+		correct_payload[0] = (MAX_PAYLOAD + offset);
+		correct_payload[1] = remainder;
+		printf("Shouldn't coalesce if exceeding IP max pkt size: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+		/* last segment sent individually, doesn't start a new segment */
+		correct_payload[0] = correct_payload[0] - remainder;
+		correct_payload[1] = remainder + 1;
+		correct_payload[2] = remainder + 1;
+		check_recv_pkts(rxfd, correct_payload, 3);
+	} else {
+		error(1, 0, "Test case error, should never trigger");
+	}
+
+	if (close(rxfd))
+		error(1, errno, "socket close");
+}
+
+static void parse_args(int argc, char **argv)
+{
+	static const struct option opts[] = {
+		{ "dmac", required_argument, NULL, 'D' },
+		{ "iface", required_argument, NULL, 'i' },
+		{ "ipv4", no_argument, NULL, '4' },
+		{ "ipv6", no_argument, NULL, '6' },
+		{ "rx", no_argument, NULL, 'r' },
+		{ "smac", required_argument, NULL, 'S' },
+		{ "test", required_argument, NULL, 't' },
+		{ "verbose", no_argument, NULL, 'v' },
+		{ 0, 0, 0, 0 }
+	};
+	int c;
+
+	while ((c = getopt_long(argc, argv, "46D:i:rS:t:v", opts, NULL)) != -1) {
+		switch (c) {
+		case '4':
+			proto = PF_INET;
+			ethhdr_proto = htons(ETH_P_IP);
+			break;
+		case '6':
+			proto = PF_INET6;
+			ethhdr_proto = htons(ETH_P_IPV6);
+			break;
+		case 'D':
+			dmac = optarg;
+			break;
+		case 'i':
+			ifname = optarg;
+			break;
+		case 'r':
+			tx_socket = false;
+			break;
+		case 'S':
+			smac = optarg;
+			break;
+		case 't':
+			testname = optarg;
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			error(1, 0, "%s invalid option %c\n", __func__, c);
+			break;
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	parse_args(argc, argv);
+
+	if (proto == PF_INET) {
+		tcp_offset = ETH_HLEN + sizeof(struct iphdr);
+		total_hdr_len = tcp_offset + sizeof(struct tcphdr);
+	} else if (proto == PF_INET6) {
+		tcp_offset = ETH_HLEN + sizeof(struct ipv6hdr);
+		total_hdr_len = MAX_HDR_LEN;
+	} else {
+		error(1, 0, "Protocol family is not ipv4 or ipv6");
+	}
+
+	read_MAC(src_mac, smac);
+	read_MAC(dst_mac, dmac);
+
+	if (tx_socket)
+		gro_sender();
+	else
+		gro_receiver();
+	return 0;
+}
diff --git a/tools/testing/selftests/net/gro.sh b/tools/testing/selftests/net/gro.sh
new file mode 100755
index 000000000000..794d2bf36dd7
--- /dev/null
+++ b/tools/testing/selftests/net/gro.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source setup_loopback.sh
+readonly SERVER_MAC="aa:00:00:00:00:02"
+readonly CLIENT_MAC="aa:00:00:00:00:01"
+readonly TESTS=("data" "ack" "flags" "tcp" "ip" "large")
+readonly PROTOS=("ipv4" "ipv6")
+dev="eth0"
+test="all"
+proto="ipv4"
+
+setup_interrupt() {
+  # Use a timer on the host to trigger the network stack, and defer
+  # device interrupts so the run does not depend on NIC interrupts;
+  # this reduces test flakiness caused by unexpected interrupts.
+  echo 100000 >"${FLUSH_PATH}"
+  echo 50 >"${IRQ_PATH}"
+}
+
+setup_ns() {
+  # Set up server_ns namespace and client_ns namespace
+  setup_macvlan_ns "${dev}" server_ns server "${SERVER_MAC}"
+  setup_macvlan_ns "${dev}" client_ns client "${CLIENT_MAC}"
+}
+
+cleanup_ns() {
+  cleanup_macvlan_ns server_ns server client_ns client
+}
+
+setup() {
+  setup_loopback_environment "${dev}"
+  setup_interrupt
+}
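+# Note on the two sysfs knobs used above (values saved and restored by
+# cleanup below): gro_flush_timeout=100000 (ns) holds packets in the GRO
+# queue long enough to coalesce, while napi_defer_hard_irqs=50 defers hard
+# NIC IRQs so that delivery timing is driven by the flush timer instead.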
+cleanup() {
+  cleanup_loopback "${dev}"
+
+  echo "${FLUSH_TIMEOUT}" >"${FLUSH_PATH}"
+  echo "${HARD_IRQS}" >"${IRQ_PATH}"
+}
+
+run_test() {
+  local server_pid=0
+  local exit_code=0
+  local protocol=$1
+  local test=$2
+  local ARGS=( "--${protocol}" "--dmac" "${SERVER_MAC}" \
+  "--smac" "${CLIENT_MAC}" "--test" "${test}" "--verbose" )
+
+  setup_ns
+  # Each test is run 3 times to deflake, because given the receive timing,
+  # not all packets that should coalesce will be considered in the same flow
+  # on every try.
+  for tries in {1..3}; do
+    # Actual test starts here
+    ip netns exec server_ns ./gro "${ARGS[@]}" "--rx" "--iface" "server" \
+      1>>log.txt &
+    server_pid=$!
+    sleep 0.5  # to allow for socket init
+    ip netns exec client_ns ./gro "${ARGS[@]}" "--iface" "client" \
+      1>>log.txt
+    wait "${server_pid}"
+    exit_code=$?
+    if [[ "${exit_code}" -eq 0 ]]; then
+      break;
+    fi
+  done
+  cleanup_ns
+  echo ${exit_code}
+}
+
+run_all_tests() {
+  local failed_tests=()
+  for proto in "${PROTOS[@]}"; do
+    for test in "${TESTS[@]}"; do
+      echo "running test ${proto} ${test}" >&2
+      exit_code=$(run_test $proto $test)
+      if [[ "${exit_code}" -ne 0 ]]; then
+        failed_tests+=("${proto}_${test}")
+      fi;
+    done;
+  done
+  if [[ ${#failed_tests[@]} -ne 0 ]]; then
+    echo "failed tests: ${failed_tests[*]}. \
+    Please see log.txt for more logs"
+    exit 1
+  else
+    echo "All Tests Succeeded!"
+  fi;
+}
+
+usage() {
+  echo "Usage: $0 \
+    [-i <DEV>] \
+    [-t data|ack|flags|tcp|ip|large] \
+    [-p <ipv4|ipv6>]" 1>&2;
+  exit 1;
+}
+
+while getopts "i:t:p:" opt; do
+  case "${opt}" in
+    i)
+      dev="${OPTARG}"
+      ;;
+    t)
+      test="${OPTARG}"
+      ;;
+    p)
+      proto="${OPTARG}"
+      ;;
+    *)
+      usage
+      ;;
+  esac
+done
+
+readonly FLUSH_PATH="/sys/class/net/${dev}/gro_flush_timeout"
+readonly IRQ_PATH="/sys/class/net/${dev}/napi_defer_hard_irqs"
+readonly FLUSH_TIMEOUT="$(< ${FLUSH_PATH})"
+readonly HARD_IRQS="$(< ${IRQ_PATH})"
+setup
+trap cleanup EXIT
+if [[ "${test}" == "all" ]]; then
+  run_all_tests
+else
+  run_test "${proto}" "${test}"
+fi;
diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh
new file mode 100755
index 000000000000..3caf72bb9c6a
--- /dev/null
+++ b/tools/testing/selftests/net/ioam6.sh
@@ -0,0 +1,652 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Author: Justin Iurman <justin.iurman@uliege.be>
+#
+# This script evaluates the IOAM insertion for IPv6 by checking the IOAM data
+# consistency directly inside packets on the receiver side. Tests are divided
+# into three categories: OUTPUT (evaluates the IOAM processing by the sender),
+# INPUT (evaluates the IOAM processing by the receiver) and GLOBAL (evaluates
+# wider use cases that do not fall into the other two categories). Both OUTPUT
+# and INPUT tests only use a two-node topology (alpha and beta), while GLOBAL
+# tests use the entire three-node topology (alpha, beta, gamma). Each test is
+# documented inside its own handler in the code below.
+#
+# An IOAM domain is configured from Alpha to Gamma but not on the reverse path.
+# When either Beta or Gamma is the destination (depending on the test category),
+# Alpha adds an IOAM option (Pre-allocated Trace) inside a Hop-by-Hop Options
+# header.
+#
+#
+# +-------------------+            +-------------------+
+# |                   |            |                   |
+# |    Alpha netns    |            |    Gamma netns    |
+# |                   |            |                   |
+# |  +-------------+  |            |  +-------------+  |
+# |  |    veth0    |  |            |  |    veth0    |  |
+# |  |  db01::2/64 |  |            |  |  db02::2/64 |  |
+# |  +-------------+  |            |  +-------------+  |
+# |         .         |            |         .         |
+# +-------------------+            +-------------------+
+#           .                                .
+#           .                                .
+#           .                                .
+# +----------------------------------------------------+
+# |         .                                .
| +# | +-------------+ +-------------+ | +# | | veth0 | | veth1 | | +# | | db01::1/64 | ................ | db02::1/64 | | +# | +-------------+ +-------------+ | +# | | +# | Beta netns | +# | | +# +----------------------------------------------------+ +# +# +# +# ============================================================= +# | Alpha - IOAM configuration | +# +===========================================================+ +# | Node ID | 1 | +# +-----------------------------------------------------------+ +# | Node Wide ID | 11111111 | +# +-----------------------------------------------------------+ +# | Ingress ID | 0xffff (default value) | +# +-----------------------------------------------------------+ +# | Ingress Wide ID | 0xffffffff (default value) | +# +-----------------------------------------------------------+ +# | Egress ID | 101 | +# +-----------------------------------------------------------+ +# | Egress Wide ID | 101101 | +# +-----------------------------------------------------------+ +# | Namespace Data | 0xdeadbee0 | +# +-----------------------------------------------------------+ +# | Namespace Wide Data | 0xcafec0caf00dc0de | +# +-----------------------------------------------------------+ +# | Schema ID | 777 | +# +-----------------------------------------------------------+ +# | Schema Data | something that will be 4n-aligned | +# +-----------------------------------------------------------+ +# +# +# ============================================================= +# | Beta - IOAM configuration | +# +===========================================================+ +# | Node ID | 2 | +# +-----------------------------------------------------------+ +# | Node Wide ID | 22222222 | +# +-----------------------------------------------------------+ +# | Ingress ID | 201 | +# +-----------------------------------------------------------+ +# | Ingress Wide ID | 201201 | +# +-----------------------------------------------------------+ +# | Egress ID | 202 | +# +-----------------------------------------------------------+ +# | Egress Wide ID | 202202 | +# +-----------------------------------------------------------+ +# | Namespace Data | 0xdeadbee1 | +# +-----------------------------------------------------------+ +# | Namespace Wide Data | 0xcafec0caf11dc0de | +# +-----------------------------------------------------------+ +# | Schema ID | 666 | +# +-----------------------------------------------------------+ +# | Schema Data | Hello there -Obi | +# +-----------------------------------------------------------+ +# +# +# ============================================================= +# | Gamma - IOAM configuration | +# +===========================================================+ +# | Node ID | 3 | +# +-----------------------------------------------------------+ +# | Node Wide ID | 33333333 | +# +-----------------------------------------------------------+ +# | Ingress ID | 301 | +# +-----------------------------------------------------------+ +# | Ingress Wide ID | 301301 | +# +-----------------------------------------------------------+ +# | Egress ID | 0xffff (default value) | +# +-----------------------------------------------------------+ +# | Egress Wide ID | 0xffffffff (default value) | +# +-----------------------------------------------------------+ +# | Namespace Data | 0xdeadbee2 | +# +-----------------------------------------------------------+ +# | Namespace Wide Data | 0xcafec0caf22dc0de | +# +-----------------------------------------------------------+ +# | Schema ID | 0xffffff 
(= None) | +# +-----------------------------------------------------------+ +# | Schema Data | | +# +-----------------------------------------------------------+ + + +################################################################################ +# # +# WARNING: Be careful if you modify the block below - it MUST be kept # +# synchronized with configurations inside ioam6_parser.c and always # +# reflect the same. # +# # +################################################################################ + +ALPHA=( + 1 # ID + 11111111 # Wide ID + 0xffff # Ingress ID + 0xffffffff # Ingress Wide ID + 101 # Egress ID + 101101 # Egress Wide ID + 0xdeadbee0 # Namespace Data + 0xcafec0caf00dc0de # Namespace Wide Data + 777 # Schema ID (0xffffff = None) + "something that will be 4n-aligned" # Schema Data +) + +BETA=( + 2 + 22222222 + 201 + 201201 + 202 + 202202 + 0xdeadbee1 + 0xcafec0caf11dc0de + 666 + "Hello there -Obi" +) + +GAMMA=( + 3 + 33333333 + 301 + 301301 + 0xffff + 0xffffffff + 0xdeadbee2 + 0xcafec0caf22dc0de + 0xffffff + "" +) + +TESTS_OUTPUT=" + out_undef_ns + out_no_room + out_bits + out_full_supp_trace +" + +TESTS_INPUT=" + in_undef_ns + in_no_room + in_oflag + in_bits + in_full_supp_trace +" + +TESTS_GLOBAL=" + fwd_full_supp_trace +" + + +################################################################################ +# # +# LIBRARY # +# # +################################################################################ + +check_kernel_compatibility() +{ + ip netns add ioam-tmp-node + ip link add name veth0 netns ioam-tmp-node type veth \ + peer name veth1 netns ioam-tmp-node + + ip -netns ioam-tmp-node link set veth0 up + ip -netns ioam-tmp-node link set veth1 up + + ip -netns ioam-tmp-node ioam namespace add 0 &>/dev/null + ns_ad=$? + + ip -netns ioam-tmp-node ioam namespace show | grep -q "namespace 0" + ns_sh=$? + + if [[ $ns_ad != 0 || $ns_sh != 0 ]] + then + echo "SKIP: kernel version probably too old, missing ioam support" + ip link del veth0 2>/dev/null || true + ip netns del ioam-tmp-node || true + exit 1 + fi + + ip -netns ioam-tmp-node route add db02::/64 encap ioam6 trace prealloc \ + type 0x800000 ns 0 size 4 dev veth0 &>/dev/null + tr_ad=$? + + ip -netns ioam-tmp-node -6 route | grep -q "encap ioam6 trace" + tr_sh=$? + + if [[ $tr_ad != 0 || $tr_sh != 0 ]] + then + echo "SKIP: cannot attach an ioam trace to a route, did you compile" \ + "without CONFIG_IPV6_IOAM6_LWTUNNEL?" 
+ ip link del veth0 2>/dev/null || true + ip netns del ioam-tmp-node || true + exit 1 + fi + + ip link del veth0 2>/dev/null || true + ip netns del ioam-tmp-node || true +} + +cleanup() +{ + ip link del ioam-veth-alpha 2>/dev/null || true + ip link del ioam-veth-gamma 2>/dev/null || true + + ip netns del ioam-node-alpha || true + ip netns del ioam-node-beta || true + ip netns del ioam-node-gamma || true +} + +setup() +{ + ip netns add ioam-node-alpha + ip netns add ioam-node-beta + ip netns add ioam-node-gamma + + ip link add name ioam-veth-alpha netns ioam-node-alpha type veth \ + peer name ioam-veth-betaL netns ioam-node-beta + ip link add name ioam-veth-betaR netns ioam-node-beta type veth \ + peer name ioam-veth-gamma netns ioam-node-gamma + + ip -netns ioam-node-alpha link set ioam-veth-alpha name veth0 + ip -netns ioam-node-beta link set ioam-veth-betaL name veth0 + ip -netns ioam-node-beta link set ioam-veth-betaR name veth1 + ip -netns ioam-node-gamma link set ioam-veth-gamma name veth0 + + ip -netns ioam-node-alpha addr add db01::2/64 dev veth0 + ip -netns ioam-node-alpha link set veth0 up + ip -netns ioam-node-alpha link set lo up + ip -netns ioam-node-alpha route add db02::/64 via db01::1 dev veth0 + ip -netns ioam-node-alpha route del db01::/64 + ip -netns ioam-node-alpha route add db01::/64 dev veth0 + + ip -netns ioam-node-beta addr add db01::1/64 dev veth0 + ip -netns ioam-node-beta addr add db02::1/64 dev veth1 + ip -netns ioam-node-beta link set veth0 up + ip -netns ioam-node-beta link set veth1 up + ip -netns ioam-node-beta link set lo up + + ip -netns ioam-node-gamma addr add db02::2/64 dev veth0 + ip -netns ioam-node-gamma link set veth0 up + ip -netns ioam-node-gamma link set lo up + ip -netns ioam-node-gamma route add db01::/64 via db02::1 dev veth0 + + # - IOAM config - + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.ioam6_id=${ALPHA[0]} + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.ioam6_id_wide=${ALPHA[1]} + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id=${ALPHA[4]} + ip netns exec ioam-node-alpha sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${ALPHA[5]} + ip -netns ioam-node-alpha ioam namespace add 123 data ${ALPHA[6]} wide ${ALPHA[7]} + ip -netns ioam-node-alpha ioam schema add ${ALPHA[8]} "${ALPHA[9]}" + ip -netns ioam-node-alpha ioam namespace set 123 schema ${ALPHA[8]} + + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.all.forwarding=1 + ip netns exec ioam-node-beta sysctl -wq net.ipv6.ioam6_id=${BETA[0]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.ioam6_id_wide=${BETA[1]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_id=${BETA[2]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${BETA[3]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth1.ioam6_id=${BETA[4]} + ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth1.ioam6_id_wide=${BETA[5]} + ip -netns ioam-node-beta ioam namespace add 123 data ${BETA[6]} wide ${BETA[7]} + ip -netns ioam-node-beta ioam schema add ${BETA[8]} "${BETA[9]}" + ip -netns ioam-node-beta ioam namespace set 123 schema ${BETA[8]} + + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.ioam6_id=${GAMMA[0]} + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.ioam6_id_wide=${GAMMA[1]} + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1 + ip netns exec ioam-node-gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id=${GAMMA[2]} + ip netns 
exec ioam-node-gamma sysctl -wq net.ipv6.conf.veth0.ioam6_id_wide=${GAMMA[3]}
+  ip -netns ioam-node-gamma ioam namespace add 123 data ${GAMMA[6]} wide ${GAMMA[7]}
+
+  sleep 1
+
+  ip netns exec ioam-node-alpha ping6 -c 5 -W 1 db02::2 &>/dev/null
+  if [ $? != 0 ]
+  then
+    echo "Setup FAILED"
+    cleanup &>/dev/null
+    exit 0
+  fi
+}
+
+log_test_passed()
+{
+  local desc=$1
+  printf "TEST: %-60s  [ OK ]\n" "${desc}"
+}
+
+log_test_failed()
+{
+  local desc=$1
+  printf "TEST: %-60s  [FAIL]\n" "${desc}"
+}
+
+run_test()
+{
+  local name=$1
+  local desc=$2
+  local node_src=$3
+  local node_dst=$4
+  local ip6_src=$5
+  local ip6_dst=$6
+  local if_dst=$7
+  local trace_type=$8
+  local ioam_ns=$9
+
+  ip netns exec $node_dst ./ioam6_parser $if_dst $name $ip6_src $ip6_dst \
+    $trace_type $ioam_ns &
+  local spid=$!
+  sleep 0.1
+
+  ip netns exec $node_src ping6 -t 64 -c 1 -W 1 $ip6_dst &>/dev/null
+  if [ $? != 0 ]
+  then
+    log_test_failed "${desc}"
+    kill -2 $spid &>/dev/null
+  else
+    wait $spid
+    [ $? = 0 ] && log_test_passed "${desc}" || log_test_failed "${desc}"
+  fi
+}
+
+run()
+{
+  echo
+  echo "OUTPUT tests"
+  printf "%0.s-" {1..74}
+  echo
+
+  # set OUTPUT settings
+  ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=0
+
+  for t in $TESTS_OUTPUT
+  do
+    $t
+  done
+
+  # clean OUTPUT settings
+  ip netns exec ioam-node-beta sysctl -wq net.ipv6.conf.veth0.ioam6_enabled=1
+  ip -netns ioam-node-alpha route change db01::/64 dev veth0
+
+
+  echo
+  echo "INPUT tests"
+  printf "%0.s-" {1..74}
+  echo
+
+  # set INPUT settings
+  ip -netns ioam-node-alpha ioam namespace del 123
+
+  for t in $TESTS_INPUT
+  do
+    $t
+  done
+
+  # clean INPUT settings
+  ip -netns ioam-node-alpha ioam namespace add 123 \
+    data ${ALPHA[6]} wide ${ALPHA[7]}
+  ip -netns ioam-node-alpha ioam namespace set 123 schema ${ALPHA[8]}
+  ip -netns ioam-node-alpha route change db01::/64 dev veth0
+
+
+  echo
+  echo "GLOBAL tests"
+  printf "%0.s-" {1..74}
+  echo
+
+  for t in $TESTS_GLOBAL
+  do
+    $t
+  done
+}
+
+bit2type=(
+  0x800000 0x400000 0x200000 0x100000 0x080000 0x040000 0x020000 0x010000
+  0x008000 0x004000 0x002000 0x001000 0x000800 0x000400 0x000200 0x000100
+  0x000080 0x000040 0x000020 0x000010 0x000008 0x000004 0x000002
+)
+bit2size=( 4 4 4 4 4 4 4 4 8 8 8 4 4 4 4 4 4 4 4 4 4 4 4 )
+
+
+################################################################################
+#                                                                              #
+#                                OUTPUT tests                                  #
+#                                                                              #
+#   Two nodes (sender/receiver), IOAM disabled on ingress for the receiver.    #
+################################################################################
+
+out_undef_ns()
+{
+  ##############################################################################
+  # Make sure that the encap node won't fill the trace if the chosen IOAM      #
+  # namespace is not configured locally.                                       #
+  ##############################################################################
+  local desc="Unknown IOAM namespace"
+
+  ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \
+    type 0x800000 ns 0 size 4 dev veth0
+
+  run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \
+    db01::1 veth0 0x800000 0
+}
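+################################################################################
+# A note on the trace sizes used below (derived from bit2type/bit2size):      #
+# trace type 0xc00000 selects the first two data fields, 4 bytes each, so a   #
+# node needs 8 bytes of room, while "size 4" pre-allocates only 4 bytes --    #
+# which deliberately forces the no-room/Overflow case.                        #
+################################################################################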
+out_no_room()
+{
+  ##############################################################################
+  # Make sure that the encap node won't fill the trace and will set the        #
+  # Overflow flag since there isn't enough room for its data.                  #
+  ##############################################################################
+  local desc="Missing trace room"
+
+  ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \
+    type 0xc00000 ns 123 size 4 dev veth0
+
+  run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \
+    db01::1 veth0 0xc00000 123
+}
+
+out_bits()
+{
+  ##############################################################################
+  # Make sure that, for each trace type bit, the encap node will either:       #
+  #   (i) fill the trace with its data when it is a supported bit              #
+  #  (ii) not fill the trace with its data when it is an unsupported bit       #
+  ##############################################################################
+  local desc="Trace type with bit <n> only"
+
+  local tmp=${bit2size[22]}
+  bit2size[22]=$(( $tmp + ${#ALPHA[9]} + ((4 - (${#ALPHA[9]} % 4)) % 4) ))
+
+  for i in {0..22}
+  do
+    ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \
+      prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} dev veth0
+
+    run_test "out_bit$i" "${desc/<n>/$i}" ioam-node-alpha ioam-node-beta \
+      db01::2 db01::1 veth0 ${bit2type[$i]} 123
+  done
+
+  bit2size[22]=$tmp
+}
+
+out_full_supp_trace()
+{
+  ##############################################################################
+  # Make sure that the encap node will correctly fill a full trace. Be careful:#
+  # "full trace" here does NOT mean all bits (only the supported ones).        #
+  ##############################################################################
+  local desc="Full supported trace"
+
+  ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \
+    type 0xfff002 ns 123 size 100 dev veth0
+
+  run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \
+    db01::1 veth0 0xfff002 123
+}
+
+
+################################################################################
+#                                                                              #
+#                                 INPUT tests                                  #
+#                                                                              #
+#     Two nodes (sender/receiver), the sender MUST NOT fill the trace upon     #
+#     insertion -> the IOAM namespace configured on the sender is removed      #
+#     and is used in the inserted trace to force the sender not to fill it.    #
+################################################################################
+
+in_undef_ns()
+{
+  ##############################################################################
+  # Make sure that the receiving node won't fill the trace if the related IOAM #
+  # namespace is not configured locally.                                       #
+  ##############################################################################
+  local desc="Unknown IOAM namespace"
+
+  ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \
+    type 0x800000 ns 0 size 4 dev veth0
+
+  run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \
+    db01::1 veth0 0x800000 0
+}
+
+in_no_room()
+{
+  ##############################################################################
+  # Make sure that the receiving node won't fill the trace and will set the    #
+  # Overflow flag if there isn't enough room for its data.
# + ############################################################################## + local desc="Missing trace room" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xc00000 ns 123 size 4 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xc00000 123 +} + +in_bits() +{ + ############################################################################## + # Make sure that, for each trace type bit, the receiving node will either: # + # (i) fill the trace with its data when it is a supported bit # + # (ii) not fill the trace with its data when it is an unsupported bit # + ############################################################################## + local desc="Trace type with bit <n> only" + + local tmp=${bit2size[22]} + bit2size[22]=$(( $tmp + ${#BETA[9]} + ((4 - (${#BETA[9]} % 4)) % 4) )) + + for i in {0..22} + do + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace \ + prealloc type ${bit2type[$i]} ns 123 size ${bit2size[$i]} dev veth0 + + run_test "in_bit$i" "${desc/<n>/$i}" ioam-node-alpha ioam-node-beta \ + db01::2 db01::1 veth0 ${bit2type[$i]} 123 + done + + bit2size[22]=$tmp +} + +in_oflag() +{ + ############################################################################## + # Make sure that the receiving node won't fill the trace since the Overflow # + # flag is set. # + ############################################################################## + local desc="Overflow flag is set" + + # Exception: + # Here, we need the sender to set the Overflow flag. For that, we will add + # back the IOAM namespace that was previously configured on the sender. + ip -netns ioam-node-alpha ioam namespace add 123 + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xc00000 ns 123 size 4 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xc00000 123 + + # And we clean the exception for this test to get things back to normal for + # other INPUT tests + ip -netns ioam-node-alpha ioam namespace del 123 +} + +in_full_supp_trace() +{ + ############################################################################## + # Make sure that the receiving node will correctly fill a full trace. Be # + # careful, "full trace" here does NOT mean all bits (only supported ones). # + ############################################################################## + local desc="Full supported trace" + + ip -netns ioam-node-alpha route change db01::/64 encap ioam6 trace prealloc \ + type 0xfff002 ns 123 size 80 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-beta db01::2 \ + db01::1 veth0 0xfff002 123 +} + + +################################################################################ +# # +# GLOBAL tests # +# # +# Three nodes (sender/router/receiver), IOAM fully enabled on every node. # +################################################################################ + +fwd_full_supp_trace() +{ + ############################################################################## + # Make sure that all three nodes correctly filled the full supported trace # + # by checking that the trace data is consistent with the predefined config. 
# + ############################################################################## + local desc="Forward - Full supported trace" + + ip -netns ioam-node-alpha route change db02::/64 encap ioam6 trace prealloc \ + type 0xfff002 ns 123 size 244 via db01::1 dev veth0 + + run_test ${FUNCNAME[0]} "${desc}" ioam-node-alpha ioam-node-gamma db01::2 \ + db02::2 veth0 0xfff002 123 +} + + +################################################################################ +# # +# MAIN # +# # +################################################################################ + +if [ "$(id -u)" -ne 0 ] +then + echo "SKIP: Need root privileges" + exit 1 +fi + +if [ ! -x "$(command -v ip)" ] +then + echo "SKIP: Could not run test without ip tool" + exit 1 +fi + +ip ioam &>/dev/null +if [ $? = 1 ] +then + echo "SKIP: iproute2 too old, missing ioam command" + exit 1 +fi + +check_kernel_compatibility + +cleanup &>/dev/null +setup +run +cleanup &>/dev/null diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c new file mode 100644 index 000000000000..d376cb2c383c --- /dev/null +++ b/tools/testing/selftests/net/ioam6_parser.c @@ -0,0 +1,720 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Author: Justin Iurman (justin.iurman@uliege.be) + * + * IOAM tester for IPv6, see ioam6.sh for details on each test case. + */ +#include <arpa/inet.h> +#include <errno.h> +#include <limits.h> +#include <linux/const.h> +#include <linux/if_ether.h> +#include <linux/ioam6.h> +#include <linux/ipv6.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +struct ioam_config { + __u32 id; + __u64 wide; + __u16 ingr_id; + __u16 egr_id; + __u32 ingr_wide; + __u32 egr_wide; + __u32 ns_data; + __u64 ns_wide; + __u32 sc_id; + __u8 hlim; + char *sc_data; +}; + +/* + * Be careful if you modify structs below - everything MUST be kept synchronized + * with configurations inside ioam6.sh and always reflect the same. 
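+ *
+ * In practice, the ids, wide ids, namespace data and schema below are
+ * expected to match the ALPHA/BETA/GAMMA arrays of ioam6.sh: a mismatch
+ * does not abort the parser, it simply makes the corresponding check fail.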
+ */ + +static struct ioam_config node1 = { + .id = 1, + .wide = 11111111, + .ingr_id = 0xffff, /* default value */ + .egr_id = 101, + .ingr_wide = 0xffffffff, /* default value */ + .egr_wide = 101101, + .ns_data = 0xdeadbee0, + .ns_wide = 0xcafec0caf00dc0de, + .sc_id = 777, + .sc_data = "something that will be 4n-aligned", + .hlim = 64, +}; + +static struct ioam_config node2 = { + .id = 2, + .wide = 22222222, + .ingr_id = 201, + .egr_id = 202, + .ingr_wide = 201201, + .egr_wide = 202202, + .ns_data = 0xdeadbee1, + .ns_wide = 0xcafec0caf11dc0de, + .sc_id = 666, + .sc_data = "Hello there -Obi", + .hlim = 63, +}; + +static struct ioam_config node3 = { + .id = 3, + .wide = 33333333, + .ingr_id = 301, + .egr_id = 0xffff, /* default value */ + .ingr_wide = 301301, + .egr_wide = 0xffffffff, /* default value */ + .ns_data = 0xdeadbee2, + .ns_wide = 0xcafec0caf22dc0de, + .sc_id = 0xffffff, /* default value */ + .sc_data = NULL, + .hlim = 62, +}; + +enum { + /********** + * OUTPUT * + **********/ + TEST_OUT_UNDEF_NS, + TEST_OUT_NO_ROOM, + TEST_OUT_BIT0, + TEST_OUT_BIT1, + TEST_OUT_BIT2, + TEST_OUT_BIT3, + TEST_OUT_BIT4, + TEST_OUT_BIT5, + TEST_OUT_BIT6, + TEST_OUT_BIT7, + TEST_OUT_BIT8, + TEST_OUT_BIT9, + TEST_OUT_BIT10, + TEST_OUT_BIT11, + TEST_OUT_BIT12, + TEST_OUT_BIT13, + TEST_OUT_BIT14, + TEST_OUT_BIT15, + TEST_OUT_BIT16, + TEST_OUT_BIT17, + TEST_OUT_BIT18, + TEST_OUT_BIT19, + TEST_OUT_BIT20, + TEST_OUT_BIT21, + TEST_OUT_BIT22, + TEST_OUT_FULL_SUPP_TRACE, + + /********* + * INPUT * + *********/ + TEST_IN_UNDEF_NS, + TEST_IN_NO_ROOM, + TEST_IN_OFLAG, + TEST_IN_BIT0, + TEST_IN_BIT1, + TEST_IN_BIT2, + TEST_IN_BIT3, + TEST_IN_BIT4, + TEST_IN_BIT5, + TEST_IN_BIT6, + TEST_IN_BIT7, + TEST_IN_BIT8, + TEST_IN_BIT9, + TEST_IN_BIT10, + TEST_IN_BIT11, + TEST_IN_BIT12, + TEST_IN_BIT13, + TEST_IN_BIT14, + TEST_IN_BIT15, + TEST_IN_BIT16, + TEST_IN_BIT17, + TEST_IN_BIT18, + TEST_IN_BIT19, + TEST_IN_BIT20, + TEST_IN_BIT21, + TEST_IN_BIT22, + TEST_IN_FULL_SUPP_TRACE, + + /********** + * GLOBAL * + **********/ + TEST_FWD_FULL_SUPP_TRACE, + + __TEST_MAX, +}; + +static int check_ioam_header(int tid, struct ioam6_trace_hdr *ioam6h, + __u32 trace_type, __u16 ioam_ns) +{ + if (__be16_to_cpu(ioam6h->namespace_id) != ioam_ns || + __be32_to_cpu(ioam6h->type_be32) != (trace_type << 8)) + return 1; + + switch (tid) { + case TEST_OUT_UNDEF_NS: + case TEST_IN_UNDEF_NS: + return ioam6h->overflow || + ioam6h->nodelen != 1 || + ioam6h->remlen != 1; + + case TEST_OUT_NO_ROOM: + case TEST_IN_NO_ROOM: + case TEST_IN_OFLAG: + return !ioam6h->overflow || + ioam6h->nodelen != 2 || + ioam6h->remlen != 1; + + case TEST_OUT_BIT0: + case TEST_IN_BIT0: + case TEST_OUT_BIT1: + case TEST_IN_BIT1: + case TEST_OUT_BIT2: + case TEST_IN_BIT2: + case TEST_OUT_BIT3: + case TEST_IN_BIT3: + case TEST_OUT_BIT4: + case TEST_IN_BIT4: + case TEST_OUT_BIT5: + case TEST_IN_BIT5: + case TEST_OUT_BIT6: + case TEST_IN_BIT6: + case TEST_OUT_BIT7: + case TEST_IN_BIT7: + case TEST_OUT_BIT11: + case TEST_IN_BIT11: + return ioam6h->overflow || + ioam6h->nodelen != 1 || + ioam6h->remlen; + + case TEST_OUT_BIT8: + case TEST_IN_BIT8: + case TEST_OUT_BIT9: + case TEST_IN_BIT9: + case TEST_OUT_BIT10: + case TEST_IN_BIT10: + return ioam6h->overflow || + ioam6h->nodelen != 2 || + ioam6h->remlen; + + case TEST_OUT_BIT12: + case TEST_IN_BIT12: + case TEST_OUT_BIT13: + case TEST_IN_BIT13: + case TEST_OUT_BIT14: + case TEST_IN_BIT14: + case TEST_OUT_BIT15: + case TEST_IN_BIT15: + case TEST_OUT_BIT16: + case TEST_IN_BIT16: + case TEST_OUT_BIT17: + case TEST_IN_BIT17: + 
case TEST_OUT_BIT18: + case TEST_IN_BIT18: + case TEST_OUT_BIT19: + case TEST_IN_BIT19: + case TEST_OUT_BIT20: + case TEST_IN_BIT20: + case TEST_OUT_BIT21: + case TEST_IN_BIT21: + return ioam6h->overflow || + ioam6h->nodelen || + ioam6h->remlen != 1; + + case TEST_OUT_BIT22: + case TEST_IN_BIT22: + return ioam6h->overflow || + ioam6h->nodelen || + ioam6h->remlen; + + case TEST_OUT_FULL_SUPP_TRACE: + case TEST_IN_FULL_SUPP_TRACE: + case TEST_FWD_FULL_SUPP_TRACE: + return ioam6h->overflow || + ioam6h->nodelen != 15 || + ioam6h->remlen; + + default: + break; + } + + return 1; +} + +static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h, + const struct ioam_config cnf) +{ + unsigned int len; + __u8 aligned; + __u64 raw64; + __u32 raw32; + + if (ioam6h->type.bit0) { + raw32 = __be32_to_cpu(*((__u32 *)*p)); + if (cnf.hlim != (raw32 >> 24) || cnf.id != (raw32 & 0xffffff)) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit1) { + raw32 = __be32_to_cpu(*((__u32 *)*p)); + if (cnf.ingr_id != (raw32 >> 16) || + cnf.egr_id != (raw32 & 0xffff)) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit2) + *p += sizeof(__u32); + + if (ioam6h->type.bit3) + *p += sizeof(__u32); + + if (ioam6h->type.bit4) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit5) { + if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ns_data) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit6) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit7) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit8) { + raw64 = __be64_to_cpu(*((__u64 *)*p)); + if (cnf.hlim != (raw64 >> 56) || + cnf.wide != (raw64 & 0xffffffffffffff)) + return 1; + *p += sizeof(__u64); + } + + if (ioam6h->type.bit9) { + if (__be32_to_cpu(*((__u32 *)*p)) != cnf.ingr_wide) + return 1; + *p += sizeof(__u32); + + if (__be32_to_cpu(*((__u32 *)*p)) != cnf.egr_wide) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit10) { + if (__be64_to_cpu(*((__u64 *)*p)) != cnf.ns_wide) + return 1; + *p += sizeof(__u64); + } + + if (ioam6h->type.bit11) { + if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff) + return 1; + *p += sizeof(__u32); + } + + if (ioam6h->type.bit22) { + len = cnf.sc_data ? strlen(cnf.sc_data) : 0; + aligned = cnf.sc_data ? 
__ALIGN_KERNEL(len, 4) : 0; + + raw32 = __be32_to_cpu(*((__u32 *)*p)); + if (aligned != (raw32 >> 24) * 4 || + cnf.sc_id != (raw32 & 0xffffff)) + return 1; + *p += sizeof(__u32); + + if (cnf.sc_data) { + if (strncmp((char *)*p, cnf.sc_data, len)) + return 1; + + *p += len; + aligned -= len; + + while (aligned--) { + if (**p != '\0') + return 1; + *p += sizeof(__u8); + } + } + } + + return 0; +} + +static int check_ioam_header_and_data(int tid, struct ioam6_trace_hdr *ioam6h, + __u32 trace_type, __u16 ioam_ns) +{ + __u8 *p; + + if (check_ioam_header(tid, ioam6h, trace_type, ioam_ns)) + return 1; + + p = ioam6h->data + ioam6h->remlen * 4; + + switch (tid) { + case TEST_OUT_BIT0: + case TEST_OUT_BIT1: + case TEST_OUT_BIT2: + case TEST_OUT_BIT3: + case TEST_OUT_BIT4: + case TEST_OUT_BIT5: + case TEST_OUT_BIT6: + case TEST_OUT_BIT7: + case TEST_OUT_BIT8: + case TEST_OUT_BIT9: + case TEST_OUT_BIT10: + case TEST_OUT_BIT11: + case TEST_OUT_BIT22: + case TEST_OUT_FULL_SUPP_TRACE: + return check_ioam6_data(&p, ioam6h, node1); + + case TEST_IN_BIT0: + case TEST_IN_BIT1: + case TEST_IN_BIT2: + case TEST_IN_BIT3: + case TEST_IN_BIT4: + case TEST_IN_BIT5: + case TEST_IN_BIT6: + case TEST_IN_BIT7: + case TEST_IN_BIT8: + case TEST_IN_BIT9: + case TEST_IN_BIT10: + case TEST_IN_BIT11: + case TEST_IN_BIT22: + case TEST_IN_FULL_SUPP_TRACE: + { + __u32 tmp32 = node2.egr_wide; + __u16 tmp16 = node2.egr_id; + int res; + + node2.egr_id = 0xffff; + node2.egr_wide = 0xffffffff; + + res = check_ioam6_data(&p, ioam6h, node2); + + node2.egr_id = tmp16; + node2.egr_wide = tmp32; + + return res; + } + + case TEST_FWD_FULL_SUPP_TRACE: + if (check_ioam6_data(&p, ioam6h, node3)) + return 1; + if (check_ioam6_data(&p, ioam6h, node2)) + return 1; + return check_ioam6_data(&p, ioam6h, node1); + + default: + break; + } + + return 1; +} + +static int str2id(const char *tname) +{ + if (!strcmp("out_undef_ns", tname)) + return TEST_OUT_UNDEF_NS; + if (!strcmp("out_no_room", tname)) + return TEST_OUT_NO_ROOM; + if (!strcmp("out_bit0", tname)) + return TEST_OUT_BIT0; + if (!strcmp("out_bit1", tname)) + return TEST_OUT_BIT1; + if (!strcmp("out_bit2", tname)) + return TEST_OUT_BIT2; + if (!strcmp("out_bit3", tname)) + return TEST_OUT_BIT3; + if (!strcmp("out_bit4", tname)) + return TEST_OUT_BIT4; + if (!strcmp("out_bit5", tname)) + return TEST_OUT_BIT5; + if (!strcmp("out_bit6", tname)) + return TEST_OUT_BIT6; + if (!strcmp("out_bit7", tname)) + return TEST_OUT_BIT7; + if (!strcmp("out_bit8", tname)) + return TEST_OUT_BIT8; + if (!strcmp("out_bit9", tname)) + return TEST_OUT_BIT9; + if (!strcmp("out_bit10", tname)) + return TEST_OUT_BIT10; + if (!strcmp("out_bit11", tname)) + return TEST_OUT_BIT11; + if (!strcmp("out_bit12", tname)) + return TEST_OUT_BIT12; + if (!strcmp("out_bit13", tname)) + return TEST_OUT_BIT13; + if (!strcmp("out_bit14", tname)) + return TEST_OUT_BIT14; + if (!strcmp("out_bit15", tname)) + return TEST_OUT_BIT15; + if (!strcmp("out_bit16", tname)) + return TEST_OUT_BIT16; + if (!strcmp("out_bit17", tname)) + return TEST_OUT_BIT17; + if (!strcmp("out_bit18", tname)) + return TEST_OUT_BIT18; + if (!strcmp("out_bit19", tname)) + return TEST_OUT_BIT19; + if (!strcmp("out_bit20", tname)) + return TEST_OUT_BIT20; + if (!strcmp("out_bit21", tname)) + return TEST_OUT_BIT21; + if (!strcmp("out_bit22", tname)) + return TEST_OUT_BIT22; + if (!strcmp("out_full_supp_trace", tname)) + return TEST_OUT_FULL_SUPP_TRACE; + if (!strcmp("in_undef_ns", tname)) + return TEST_IN_UNDEF_NS; + if (!strcmp("in_no_room", tname)) + return 
TEST_IN_NO_ROOM; + if (!strcmp("in_oflag", tname)) + return TEST_IN_OFLAG; + if (!strcmp("in_bit0", tname)) + return TEST_IN_BIT0; + if (!strcmp("in_bit1", tname)) + return TEST_IN_BIT1; + if (!strcmp("in_bit2", tname)) + return TEST_IN_BIT2; + if (!strcmp("in_bit3", tname)) + return TEST_IN_BIT3; + if (!strcmp("in_bit4", tname)) + return TEST_IN_BIT4; + if (!strcmp("in_bit5", tname)) + return TEST_IN_BIT5; + if (!strcmp("in_bit6", tname)) + return TEST_IN_BIT6; + if (!strcmp("in_bit7", tname)) + return TEST_IN_BIT7; + if (!strcmp("in_bit8", tname)) + return TEST_IN_BIT8; + if (!strcmp("in_bit9", tname)) + return TEST_IN_BIT9; + if (!strcmp("in_bit10", tname)) + return TEST_IN_BIT10; + if (!strcmp("in_bit11", tname)) + return TEST_IN_BIT11; + if (!strcmp("in_bit12", tname)) + return TEST_IN_BIT12; + if (!strcmp("in_bit13", tname)) + return TEST_IN_BIT13; + if (!strcmp("in_bit14", tname)) + return TEST_IN_BIT14; + if (!strcmp("in_bit15", tname)) + return TEST_IN_BIT15; + if (!strcmp("in_bit16", tname)) + return TEST_IN_BIT16; + if (!strcmp("in_bit17", tname)) + return TEST_IN_BIT17; + if (!strcmp("in_bit18", tname)) + return TEST_IN_BIT18; + if (!strcmp("in_bit19", tname)) + return TEST_IN_BIT19; + if (!strcmp("in_bit20", tname)) + return TEST_IN_BIT20; + if (!strcmp("in_bit21", tname)) + return TEST_IN_BIT21; + if (!strcmp("in_bit22", tname)) + return TEST_IN_BIT22; + if (!strcmp("in_full_supp_trace", tname)) + return TEST_IN_FULL_SUPP_TRACE; + if (!strcmp("fwd_full_supp_trace", tname)) + return TEST_FWD_FULL_SUPP_TRACE; + + return -1; +} + +static int ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) +{ + return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | + (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | + (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | + (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; +} + +static int get_u32(__u32 *val, const char *arg, int base) +{ + unsigned long res; + char *ptr; + + if (!arg || !*arg) + return -1; + res = strtoul(arg, &ptr, base); + + if (!ptr || ptr == arg || *ptr) + return -1; + + if (res == ULONG_MAX && errno == ERANGE) + return -1; + + if (res > 0xFFFFFFFFUL) + return -1; + + *val = res; + return 0; +} + +static int get_u16(__u16 *val, const char *arg, int base) +{ + unsigned long res; + char *ptr; + + if (!arg || !*arg) + return -1; + res = strtoul(arg, &ptr, base); + + if (!ptr || ptr == arg || *ptr) + return -1; + + if (res == ULONG_MAX && errno == ERANGE) + return -1; + + if (res > 0xFFFFUL) + return -1; + + *val = res; + return 0; +} + +static int (*func[__TEST_MAX])(int, struct ioam6_trace_hdr *, __u32, __u16) = { + [TEST_OUT_UNDEF_NS] = check_ioam_header, + [TEST_OUT_NO_ROOM] = check_ioam_header, + [TEST_OUT_BIT0] = check_ioam_header_and_data, + [TEST_OUT_BIT1] = check_ioam_header_and_data, + [TEST_OUT_BIT2] = check_ioam_header_and_data, + [TEST_OUT_BIT3] = check_ioam_header_and_data, + [TEST_OUT_BIT4] = check_ioam_header_and_data, + [TEST_OUT_BIT5] = check_ioam_header_and_data, + [TEST_OUT_BIT6] = check_ioam_header_and_data, + [TEST_OUT_BIT7] = check_ioam_header_and_data, + [TEST_OUT_BIT8] = check_ioam_header_and_data, + [TEST_OUT_BIT9] = check_ioam_header_and_data, + [TEST_OUT_BIT10] = check_ioam_header_and_data, + [TEST_OUT_BIT11] = check_ioam_header_and_data, + [TEST_OUT_BIT12] = check_ioam_header, + [TEST_OUT_BIT13] = check_ioam_header, + [TEST_OUT_BIT14] = check_ioam_header, + [TEST_OUT_BIT15] = check_ioam_header, + [TEST_OUT_BIT16] = check_ioam_header, + [TEST_OUT_BIT17] = check_ioam_header, + [TEST_OUT_BIT18] = check_ioam_header, + 
[TEST_OUT_BIT19] = check_ioam_header, + [TEST_OUT_BIT20] = check_ioam_header, + [TEST_OUT_BIT21] = check_ioam_header, + [TEST_OUT_BIT22] = check_ioam_header_and_data, + [TEST_OUT_FULL_SUPP_TRACE] = check_ioam_header_and_data, + [TEST_IN_UNDEF_NS] = check_ioam_header, + [TEST_IN_NO_ROOM] = check_ioam_header, + [TEST_IN_OFLAG] = check_ioam_header, + [TEST_IN_BIT0] = check_ioam_header_and_data, + [TEST_IN_BIT1] = check_ioam_header_and_data, + [TEST_IN_BIT2] = check_ioam_header_and_data, + [TEST_IN_BIT3] = check_ioam_header_and_data, + [TEST_IN_BIT4] = check_ioam_header_and_data, + [TEST_IN_BIT5] = check_ioam_header_and_data, + [TEST_IN_BIT6] = check_ioam_header_and_data, + [TEST_IN_BIT7] = check_ioam_header_and_data, + [TEST_IN_BIT8] = check_ioam_header_and_data, + [TEST_IN_BIT9] = check_ioam_header_and_data, + [TEST_IN_BIT10] = check_ioam_header_and_data, + [TEST_IN_BIT11] = check_ioam_header_and_data, + [TEST_IN_BIT12] = check_ioam_header, + [TEST_IN_BIT13] = check_ioam_header, + [TEST_IN_BIT14] = check_ioam_header, + [TEST_IN_BIT15] = check_ioam_header, + [TEST_IN_BIT16] = check_ioam_header, + [TEST_IN_BIT17] = check_ioam_header, + [TEST_IN_BIT18] = check_ioam_header, + [TEST_IN_BIT19] = check_ioam_header, + [TEST_IN_BIT20] = check_ioam_header, + [TEST_IN_BIT21] = check_ioam_header, + [TEST_IN_BIT22] = check_ioam_header_and_data, + [TEST_IN_FULL_SUPP_TRACE] = check_ioam_header_and_data, + [TEST_FWD_FULL_SUPP_TRACE] = check_ioam_header_and_data, +}; + +int main(int argc, char **argv) +{ + int fd, size, hoplen, tid, ret = 1; + struct in6_addr src, dst; + struct ioam6_hdr *opt; + struct ipv6hdr *ip6h; + __u8 buffer[400], *p; + __u16 ioam_ns; + __u32 tr_type; + + if (argc != 7) + goto out; + + tid = str2id(argv[2]); + if (tid < 0 || !func[tid]) + goto out; + + if (inet_pton(AF_INET6, argv[3], &src) != 1 || + inet_pton(AF_INET6, argv[4], &dst) != 1) + goto out; + + if (get_u32(&tr_type, argv[5], 16) || + get_u16(&ioam_ns, argv[6], 0)) + goto out; + + fd = socket(AF_PACKET, SOCK_DGRAM, __cpu_to_be16(ETH_P_IPV6)); + if (!fd) + goto out; + + if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, + argv[1], strlen(argv[1]))) + goto close; + +recv: + size = recv(fd, buffer, sizeof(buffer), 0); + if (size <= 0) + goto close; + + ip6h = (struct ipv6hdr *)buffer; + + if (!ipv6_addr_equal(&ip6h->saddr, &src) || + !ipv6_addr_equal(&ip6h->daddr, &dst)) + goto recv; + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + goto close; + + p = buffer + sizeof(*ip6h); + hoplen = (p[1] + 1) << 3; + p += sizeof(struct ipv6_hopopt_hdr); + + while (hoplen > 0) { + opt = (struct ioam6_hdr *)p; + + if (opt->opt_type == IPV6_TLV_IOAM && + opt->type == IOAM6_TYPE_PREALLOC) { + p += sizeof(*opt); + ret = func[tid](tid, (struct ioam6_trace_hdr *)p, + tr_type, ioam_ns); + break; + } + + p += opt->opt_len + 2; + hoplen -= opt->opt_len + 2; + } +close: + close(fd); +out: + return ret; +} diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index f02f4de2f3a0..8c7117e2c337 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3,8 +3,10 @@ ret=0 sin="" +sinfail="" sout="" cin="" +cinfail="" cinsent="" cout="" ksft_skip=4 @@ -76,6 +78,14 @@ init() done } +init_shapers() +{ + for i in `seq 1 4`; do + tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1 + tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1 + done +} + cleanup_partial() { rm -f "$capout" @@ -88,8 +98,8 @@ cleanup_partial() cleanup() { 
- rm -f "$cin" "$cout" - rm -f "$sin" "$sout" "$cinsent" + rm -f "$cin" "$cout" "$sinfail" + rm -f "$sin" "$sout" "$cinsent" "$cinfail" cleanup_partial } @@ -211,11 +221,15 @@ link_failure() { ns="$1" - l=$((RANDOM%4)) - l=$((l+1)) + if [ -z "$FAILING_LINKS" ]; then + l=$((RANDOM%4)) + FAILING_LINKS=$((l+1)) + fi - veth="ns1eth$l" - ip -net "$ns" link set "$veth" down + for l in $FAILING_LINKS; do + veth="ns1eth$l" + ip -net "$ns" link set "$veth" down + done } # $1: IP address @@ -280,10 +294,17 @@ do_transfer() local_addr="0.0.0.0" fi - timeout ${timeout_test} \ - ip netns exec ${listener_ns} \ - $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ - ${local_addr} < "$sin" > "$sout" & + if [ "$test_link_fail" -eq 2 ];then + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -p $port -s ${cl_proto} \ + ${local_addr} < "$sinfail" > "$sout" & + else + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & + fi spid=$! sleep 1 @@ -294,7 +315,7 @@ do_transfer() $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ $connect_addr < "$cin" > "$cout" & else - ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | \ + ( cat "$cinfail" ; sleep 2; link_failure $listener_ns ; cat "$cinfail" ) | \ tee "$cinsent" | \ timeout ${timeout_test} \ ip netns exec ${connector_ns} \ @@ -323,17 +344,18 @@ do_transfer() let rm_nr_ns1=-addr_nr_ns1 if [ $rm_nr_ns1 -lt 8 ]; then counter=1 + pos=1 dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`) if [ ${#dump[@]} -gt 0 ]; then - id=${dump[1]} sleep 1 while [ $counter -le $rm_nr_ns1 ] do + id=${dump[$pos]} ip netns exec ${listener_ns} ./pm_nl_ctl del $id sleep 1 let counter+=1 - let id+=1 + let pos+=5 done fi elif [ $rm_nr_ns1 -eq 8 ]; then @@ -345,6 +367,12 @@ do_transfer() fi fi + flags="subflow" + if [[ "${addr_nr_ns2}" = "fullmesh_"* ]]; then + flags="${flags},fullmesh" + addr_nr_ns2=${addr_nr_ns2:9} + fi + if [ $addr_nr_ns2 -gt 0 ]; then let add_nr_ns2=addr_nr_ns2 counter=3 @@ -356,7 +384,7 @@ do_transfer() else addr="10.0.$counter.2" fi - ip netns exec $ns2 ./pm_nl_ctl add $addr flags subflow + ip netns exec $ns2 ./pm_nl_ctl add $addr flags $flags let counter+=1 let add_nr_ns2-=1 done @@ -365,17 +393,18 @@ do_transfer() let rm_nr_ns2=-addr_nr_ns2 if [ $rm_nr_ns2 -lt 8 ]; then counter=1 + pos=1 dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`) if [ ${#dump[@]} -gt 0 ]; then - id=${dump[1]} sleep 1 while [ $counter -le $rm_nr_ns2 ] do + id=${dump[$pos]} ip netns exec ${connector_ns} ./pm_nl_ctl del $id sleep 1 let counter+=1 - let id+=1 + let pos+=5 done fi elif [ $rm_nr_ns2 -eq 8 ]; then @@ -434,7 +463,11 @@ do_transfer() return 1 fi - check_transfer $sin $cout "file received by client" + if [ "$test_link_fail" -eq 2 ];then + check_transfer $sinfail $cout "file received by client" + else + check_transfer $sin $cout "file received by client" + fi retc=$? 
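+	# test_link_fail recap: 0 - plain transfer, 1 - unidirectional
+	# transfer with a mid-flow link failure ($cinfail as client input),
+	# 2 - bidirectional transfer ($sinfail as server input, see above)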
 	if [ "$test_link_fail" -eq 0 ];then
 		check_transfer $cin $sout "file received by server"
@@ -477,29 +510,33 @@ run_tests()
 	lret=0
 	oldin=""
 
-	if [ "$test_linkfail" -eq 1 ];then
-		size=$((RANDOM%1024))
+	# create the input file for the failure test when
+	# the first failure test runs
+	if [ "$test_linkfail" -ne 0 -a -z "$cinfail" ]; then
+		# the client file must be considerably larger
+		# than the maximum expected cwin value, or the
+		# link utilization will not be predictable
+		size=$((RANDOM%2))
 		size=$((size+1))
-		size=$((size*128))
+		size=$((size*8192))
+		size=$((size + ( $RANDOM % 8192) ))
 
-		oldin=$(mktemp)
-		cp "$cin" "$oldin"
-		make_file "$cin" "client" $size
+		cinfail=$(mktemp)
+		make_file "$cinfail" "client" $size
 	fi
 
-	do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \
-		${test_linkfail} ${addr_nr_ns1} ${addr_nr_ns2} ${speed} ${bkup}
-	lret=$?
+	if [ "$test_linkfail" -eq 2 -a -z "$sinfail" ]; then
+		size=$((RANDOM%16))
+		size=$((size+1))
+		size=$((size*2048))
 
-	if [ "$test_linkfail" -eq 1 ];then
-		cp "$oldin" "$cin"
-		rm -f "$oldin"
+		sinfail=$(mktemp)
+		make_file "$sinfail" "server" $size
 	fi
 
-	if [ $lret -ne 0 ]; then
-		ret=$lret
-		return
-	fi
+	do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} \
+		${test_linkfail} ${addr_nr_ns1} ${addr_nr_ns2} ${speed} ${bkup}
+	lret=$?
 }
 
 chk_csum_nr()
@@ -593,6 +630,46 @@ chk_join_nr()
 	fi
 }
 
+# a negative value for 'stale_max' means no upper bound:
+# for bidirectional transfer, if one peer sleeps for a while
+# - as these tests do - we can have quite a high number of
+# stale/recover conversions, proportional to
+# sleep duration / MPTCP-level RTX interval.
+chk_stale_nr()
+{
+	local ns=$1
+	local stale_min=$2
+	local stale_max=$3
+	local stale_delta=$4
+	local dump_stats
+	local stale_nr
+	local recover_nr
+
+	printf "%-39s %-18s" " " "stale"
+	stale_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowStale | awk '{print $2}'`
+	[ -z "$stale_nr" ] && stale_nr=0
+	recover_nr=`ip netns exec $ns nstat -as | grep MPTcpExtSubflowRecover | awk '{print $2}'`
+	[ -z "$recover_nr" ] && recover_nr=0
+
+	if [ $stale_nr -lt $stale_min ] ||
+	   [ $stale_max -gt 0 -a $stale_nr -gt $stale_max ] ||
+	   [ $((stale_nr - $recover_nr)) -ne $stale_delta ]; then
+		echo "[fail] got $stale_nr stale[s] $recover_nr recover[s], " \
+		     " expected stale in range [$stale_min..$stale_max]," \
+		     " stale-recover delta $stale_delta "
+		ret=1
+		dump_stats=1
+	else
+		echo "[ ok ]"
+	fi
+
+	if [ "${dump_stats}" = 1 ]; then
+		echo $ns stats
+		ip netns exec $ns ip -s link show
+		ip netns exec $ns nstat -as | grep MPTcp
+	fi
+}
+
 chk_add_nr()
 {
 	local add_nr=$1
@@ -801,6 +878,27 @@ chk_prio_nr()
 	fi
 }
 
+chk_link_usage()
+{
+	local ns=$1
+	local link=$2
+	local out=$3
+	local expected_rate=$4
+	local tx_link=`ip netns exec $ns cat /sys/class/net/$link/statistics/tx_bytes`
+	local tx_total=`ls -l $out | awk '{print $5}'`
+	local tx_rate=$((tx_link * 100 / $tx_total))
+	local tolerance=5
+
+	printf "%-39s %-18s" " " "link usage"
+	if [ $tx_rate -lt $((expected_rate - $tolerance)) -o \
+	     $tx_rate -gt $((expected_rate + $tolerance)) ]; then
+		echo "[fail] got $tx_rate% usage, expected $expected_rate%"
+		ret=1
+	else
+		echo "[ ok ]"
+	fi
+}
+
 subflows_tests()
 {
 	reset
@@ -924,14 +1022,80 @@ link_failure_tests()
 {
 	# accept and use add_addr with additional subflows and link loss
 	reset
+
+	# without any b/w limit each veth could spool the packets and get
+	# them acked at xmit time, so that the corresponding subflow will
+	# almost always have no outstanding pkts, the scheduler will always
+	# pick the first subflow and we will have a hard time testing
+	# active backup and link switch-over.
+	# Let's set some arbitrary (low) virtual link limits.
+	init_shapers
 	ip netns exec $ns1 ./pm_nl_ctl limits 0 3
-	ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+	ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
 	ip netns exec $ns2 ./pm_nl_ctl limits 1 3
-	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
-	ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow
 	run_tests $ns1 $ns2 10.0.1.1 1
 	chk_join_nr "multiple flows, signal, link failure" 3 3 3
 	chk_add_nr 1 1
+	chk_stale_nr $ns2 1 5 1
+
+	# accept and use add_addr with additional subflows and link loss
+	# for bidirectional transfer
+	reset
+	init_shapers
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+	ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
+	ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 dev ns2eth4 flags subflow
+	run_tests $ns1 $ns2 10.0.1.1 2
+	chk_join_nr "multi flows, signal, bidi, link fail" 3 3 3
+	chk_add_nr 1 1
+	chk_stale_nr $ns2 1 -1 1
+
+	# 2 subflows plus 1 backup subflow with a lossy link, backup
+	# will never be used
+	reset
+	init_shapers
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+	ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
+	ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+	export FAILING_LINKS="1"
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
+	run_tests $ns1 $ns2 10.0.1.1 1
+	chk_join_nr "backup subflow unused, link failure" 2 2 2
+	chk_add_nr 1 1
+	chk_link_usage $ns2 ns2eth3 $cinsent 0
+
+	# 2 lossy links after half transfer, backup will get half of
+	# the traffic
+	reset
+	init_shapers
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+	ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
+	ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
+	export FAILING_LINKS="1 2"
+	run_tests $ns1 $ns2 10.0.1.1 1
+	chk_join_nr "backup flow used, multi links fail" 2 2 2
+	chk_add_nr 1 1
+	chk_stale_nr $ns2 2 4 2
+	chk_link_usage $ns2 ns2eth3 $cinsent 50
+
+	# use a backup subflow with the first subflow on a lossy link
+	# for bidirectional transfer
+	reset
+	init_shapers
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+	ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags signal
+	ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 dev ns2eth3 flags subflow,backup
+	run_tests $ns1 $ns2 10.0.1.1 2
+	chk_join_nr "backup flow used, bidi, link failure" 2 2 2
+	chk_add_nr 1 1
+	chk_stale_nr $ns2 1 -1 2
+	chk_link_usage $ns2 ns2eth3 $cinsent 50
 }
 
 add_addr_timeout_tests()
@@ -1530,6 +1694,55 @@ deny_join_id0_tests()
 	chk_join_nr "subflow and address allow join id0 2" 1 1 1
 }
 
+fullmesh_tests()
+{
+	# fullmesh 1
+	# 2 fullmesh addrs in ns2, added before the connection,
+	# 1 non-fullmesh addr in ns1, added during the connection.
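+	# 2 fullmesh endpoints on the client side and 2 addresses on the
+	# server side (the initial one plus the signalled one) should give
+	# a 2x2 mesh, i.e. 4 join attempts on top of the first subflow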
+ reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 4 + ip netns exec $ns2 ./pm_nl_ctl limits 1 4 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow,fullmesh + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow,fullmesh + run_tests $ns1 $ns2 10.0.1.1 0 1 0 slow + chk_join_nr "fullmesh test 2x1" 4 4 4 + chk_add_nr 1 1 + + # fullmesh 2 + # 1 non-fullmesh addr in ns1, added before the connection, + # 1 fullmesh addr in ns2, added during the connection. + reset + ip netns exec $ns1 ./pm_nl_ctl limits 1 3 + ip netns exec $ns2 ./pm_nl_ctl limits 1 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_1 slow + chk_join_nr "fullmesh test 1x1" 3 3 3 + chk_add_nr 1 1 + + # fullmesh 3 + # 1 non-fullmesh addr in ns1, added before the connection, + # 2 fullmesh addrs in ns2, added during the connection. + reset + ip netns exec $ns1 ./pm_nl_ctl limits 2 5 + ip netns exec $ns2 ./pm_nl_ctl limits 1 5 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_2 slow + chk_join_nr "fullmesh test 1x2" 5 5 5 + chk_add_nr 1 1 + + # fullmesh 4 + # 1 non-fullmesh addr in ns1, added before the connection, + # 2 fullmesh addrs in ns2, added during the connection, + # limit max_subflows to 4. + reset + ip netns exec $ns1 ./pm_nl_ctl limits 2 4 + ip netns exec $ns2 ./pm_nl_ctl limits 1 4 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + run_tests $ns1 $ns2 10.0.1.1 0 0 fullmesh_2 slow + chk_join_nr "fullmesh test 1x2, limited" 4 4 4 + chk_add_nr 1 1 +} + all_tests() { subflows_tests @@ -1545,6 +1758,7 @@ all_tests() syncookies_tests checksum_tests deny_join_id0_tests + fullmesh_tests } usage() @@ -1563,6 +1777,7 @@ usage() echo " -k syncookies_tests" echo " -S checksum_tests" echo " -d deny_join_id0_tests" + echo " -m fullmesh_tests" echo " -c capture pcap files" echo " -C enable data checksum" echo " -h help" @@ -1598,7 +1813,7 @@ if [ $do_all_tests -eq 1 ]; then exit $ret fi -while getopts 'fsltra64bpkdchCS' opt; do +while getopts 'fsltra64bpkdmchCS' opt; do case $opt in f) subflows_tests @@ -1639,6 +1854,9 @@ while getopts 'fsltra64bpkdchCS' opt; do d) deny_join_id0_tests ;; + m) + fullmesh_tests + ;; c) ;; C) diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 115decfdc1ef..354784512748 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -25,7 +25,7 @@ static void syntax(char *argv[]) { fprintf(stderr, "%s add|get|set|del|flush|dump|accept [<args>]\n", argv[0]); - fprintf(stderr, "\tadd [flags signal|subflow|backup] [id <nr>] [dev <name>] <ip>\n"); + fprintf(stderr, "\tadd [flags signal|subflow|backup|fullmesh] [id <nr>] [dev <name>] <ip>\n"); fprintf(stderr, "\tdel <id> [<ip>]\n"); fprintf(stderr, "\tget <id>\n"); fprintf(stderr, "\tset <ip> [flags backup|nobackup]\n"); @@ -236,11 +236,18 @@ int add_addr(int fd, int pm_family, int argc, char *argv[]) flags |= MPTCP_PM_ADDR_FLAG_SIGNAL; else if (!strcmp(tok, "backup")) flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else if (!strcmp(tok, "fullmesh")) + flags |= MPTCP_PM_ADDR_FLAG_FULLMESH; else error(1, errno, "unknown flag %s", argv[arg]); } + if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + error(1, errno, "error flag fullmesh"); + } + rta = (void *)(data + off); rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS; rta->rta_len = RTA_LENGTH(4); @@ -422,6 +429,13 @@ static void print_addr(struct rtattr 
*attrs, int len) printf(","); } + if (flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + printf("fullmesh"); + flags &= ~MPTCP_PM_ADDR_FLAG_FULLMESH; + if (flags) + printf(","); + } + /* bump unknown flags, if any */ if (flags) printf("0x%x", flags); diff --git a/tools/testing/selftests/net/psock_snd.sh b/tools/testing/selftests/net/psock_snd.sh index 170be65e0816..1cbfeb5052ec 100755 --- a/tools/testing/selftests/net/psock_snd.sh +++ b/tools/testing/selftests/net/psock_snd.sh @@ -86,9 +86,6 @@ echo "raw truncate hlen - 1 (expected to fail: EINVAL)" echo "raw gso min size" ./in_netns.sh ./psock_snd -v -c -g -l "${mss_exceeds}" -echo "raw gso min size - 1 (expected to fail)" -(! ./in_netns.sh ./psock_snd -v -c -g -l "${mss}") - echo "raw gso max size" ./in_netns.sh ./psock_snd -v -c -g -l "${max_mss}" diff --git a/tools/testing/selftests/net/setup_loopback.sh b/tools/testing/selftests/net/setup_loopback.sh new file mode 100755 index 000000000000..0a8ad97b07ea --- /dev/null +++ b/tools/testing/selftests/net/setup_loopback.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +netdev_check_for_carrier() { + local -r dev="$1" + + for i in {1..5}; do + carrier="$(cat /sys/class/net/${dev}/carrier)" + if [[ "${carrier}" -ne 1 ]] ; then + echo "carrier not ready yet..." >&2 + sleep 1 + else + echo "carrier ready" >&2 + break + fi + done + echo "${carrier}" +} + +# Assumes that there is no existing ipvlan device on the physical device +setup_loopback_environment() { + local dev="$1" + + # Fail hard if cannot turn on loopback mode for current NIC + ethtool -K "${dev}" loopback on || exit 1 + sleep 1 + + # Check for the carrier + carrier=$(netdev_check_for_carrier ${dev}) + if [[ "${carrier}" -ne 1 ]] ; then + echo "setup_loopback_environment failed" + exit 1 + fi +} + +setup_macvlan_ns(){ + local -r link_dev="$1" + local -r ns_name="$2" + local -r ns_dev="$3" + local -r ns_mac="$4" + local -r addr="$5" + + ip link add link "${link_dev}" dev "${ns_dev}" \ + address "${ns_mac}" type macvlan + exit_code=$? + if [[ "${exit_code}" -ne 0 ]]; then + echo "setup_macvlan_ns failed" + exit $exit_code + fi + + [[ -e /var/run/netns/"${ns_name}" ]] || ip netns add "${ns_name}" + ip link set dev "${ns_dev}" netns "${ns_name}" + ip -netns "${ns_name}" link set dev "${ns_dev}" up + if [[ -n "${addr}" ]]; then + ip -netns "${ns_name}" addr add dev "${ns_dev}" "${addr}" + fi + + sleep 1 +} + +cleanup_macvlan_ns(){ + while (( $# >= 2 )); do + ns_name="$1" + ns_dev="$2" + ip -netns "${ns_name}" link del dev "${ns_dev}" + ip netns del "${ns_name}" + shift 2 + done +} + +cleanup_loopback(){ + local -r dev="$1" + + ethtool -K "${dev}" loopback off + sleep 1 + + # Check for the carrier + carrier=$(netdev_check_for_carrier ${dev}) + if [[ "${carrier}" -ne 1 ]] ; then + echo "setup_loopback_environment failed" + exit 1 + fi +} diff --git a/tools/testing/selftests/net/toeplitz.c b/tools/testing/selftests/net/toeplitz.c new file mode 100644 index 000000000000..710ac956bdb3 --- /dev/null +++ b/tools/testing/selftests/net/toeplitz.c @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Toeplitz test + * + * 1. Read packets and their rx_hash using PF_PACKET/TPACKET_V3 + * 2. Compute the rx_hash in software based on the packet contents + * 3. Compare the two + * + * Optionally, either '-C $rx_irq_cpu_list' or '-r $rps_bitmap' may be given. + * + * If '-C $rx_irq_cpu_list' is given, also + * + * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU + * 5. 
Compute the rxqueue that RSS would select based on this rx_hash + * 6. Using the $rx_irq_cpu_list map, identify the arriving cpu based on rxq irq + * 7. Compare the cpus from 4 and 6 + * + * Else if '-r $rps_bitmap' is given, also + * + * 4. Identify the cpu on which the packet arrived with PACKET_FANOUT_CPU + * 5. Compute the cpu that RPS should select based on rx_hash and $rps_bitmap + * 6. Compare the cpus from 4 and 5 + */ + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <getopt.h> +#include <linux/filter.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <net/if.h> +#include <netdb.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <poll.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/sysinfo.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +#define TOEPLITZ_KEY_MIN_LEN 40 +#define TOEPLITZ_KEY_MAX_LEN 60 + +#define TOEPLITZ_STR_LEN(K) (((K) * 3) - 1) /* hex encoded: AA:BB:CC:...:ZZ */ +#define TOEPLITZ_STR_MIN_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN) +#define TOEPLITZ_STR_MAX_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MAX_LEN) + +#define FOUR_TUPLE_MAX_LEN ((sizeof(struct in6_addr) * 2) + (sizeof(uint16_t) * 2)) + +#define RSS_MAX_CPUS (1 << 16) /* real constraint is PACKET_FANOUT_MAX */ + +#define RPS_MAX_CPUS 16UL /* must be a power of 2 */ + +/* configuration options (cmdline arguments) */ +static uint16_t cfg_dport = 8000; +static int cfg_family = AF_INET6; +static char *cfg_ifname = "eth0"; +static int cfg_num_queues; +static int cfg_num_rps_cpus; +static bool cfg_sink; +static int cfg_type = SOCK_STREAM; +static int cfg_timeout_msec = 1000; +static bool cfg_verbose; + +/* global vars */ +static int num_cpus; +static int ring_block_nr; +static int ring_block_sz; + +/* stats */ +static int frames_received; +static int frames_nohash; +static int frames_error; + +#define log_verbose(args...) do { if (cfg_verbose) fprintf(stderr, args); } while (0) + +/* tpacket ring */ +struct ring_state { + int fd; + char *mmap; + int idx; + int cpu; +}; + +static unsigned int rx_irq_cpus[RSS_MAX_CPUS]; /* map from rxq to cpu */ +static int rps_silo_to_cpu[RPS_MAX_CPUS]; +static unsigned char toeplitz_key[TOEPLITZ_KEY_MAX_LEN]; +static struct ring_state rings[RSS_MAX_CPUS]; + +static inline uint32_t toeplitz(const unsigned char *four_tuple, + const unsigned char *key) +{ + int i, bit, ret = 0; + uint32_t key32; + + key32 = ntohl(*((uint32_t *)key)); + key += 4; + + for (i = 0; i < FOUR_TUPLE_MAX_LEN; i++) { + for (bit = 7; bit >= 0; bit--) { + if (four_tuple[i] & (1 << bit)) + ret ^= key32; + + key32 <<= 1; + key32 |= !!(key[0] & (1 << bit)); + } + key++; + } + + return ret; +} + +/* Compare computed cpu with arrival cpu from packet_fanout_cpu */ +static void verify_rss(uint32_t rx_hash, int cpu) +{ + int queue = rx_hash % cfg_num_queues; + + log_verbose(" rxq %d (cpu %d)", queue, rx_irq_cpus[queue]); + if (rx_irq_cpus[queue] != cpu) { + log_verbose(". error: rss cpu mismatch (%d)", cpu); + frames_error++; + } +} + +static void verify_rps(uint64_t rx_hash, int cpu) +{ + int silo = (rx_hash * cfg_num_rps_cpus) >> 32; + + log_verbose(" silo %d (cpu %d)", silo, rps_silo_to_cpu[silo]); + if (rps_silo_to_cpu[silo] != cpu) { + log_verbose(". 
error: rps cpu mismatch (%d)", cpu); + frames_error++; + } +} + +static void log_rxhash(int cpu, uint32_t rx_hash, + const char *addrs, int addr_len) +{ + char saddr[INET6_ADDRSTRLEN], daddr[INET6_ADDRSTRLEN]; + uint16_t *ports; + + if (!inet_ntop(cfg_family, addrs, saddr, sizeof(saddr)) || + !inet_ntop(cfg_family, addrs + addr_len, daddr, sizeof(daddr))) + error(1, 0, "address parse error"); + + ports = (void *)addrs + (addr_len * 2); + log_verbose("cpu %d: rx_hash 0x%08x [saddr %s daddr %s sport %02hu dport %02hu]", + cpu, rx_hash, saddr, daddr, + ntohs(ports[0]), ntohs(ports[1])); +} + +/* Compare computed rxhash with rxhash received from tpacket_v3 */ +static void verify_rxhash(const char *pkt, uint32_t rx_hash, int cpu) +{ + unsigned char four_tuple[FOUR_TUPLE_MAX_LEN] = {0}; + uint32_t rx_hash_sw; + const char *addrs; + int addr_len; + + if (cfg_family == AF_INET) { + addr_len = sizeof(struct in_addr); + addrs = pkt + offsetof(struct iphdr, saddr); + } else { + addr_len = sizeof(struct in6_addr); + addrs = pkt + offsetof(struct ip6_hdr, ip6_src); + } + + memcpy(four_tuple, addrs, (addr_len * 2) + (sizeof(uint16_t) * 2)); + rx_hash_sw = toeplitz(four_tuple, toeplitz_key); + + if (cfg_verbose) + log_rxhash(cpu, rx_hash, addrs, addr_len); + + if (rx_hash != rx_hash_sw) { + log_verbose(" != expected 0x%x\n", rx_hash_sw); + frames_error++; + return; + } + + log_verbose(" OK"); + if (cfg_num_queues) + verify_rss(rx_hash, cpu); + else if (cfg_num_rps_cpus) + verify_rps(rx_hash, cpu); + log_verbose("\n"); +} + +static char *recv_frame(const struct ring_state *ring, char *frame) +{ + struct tpacket3_hdr *hdr = (void *)frame; + + if (hdr->hv1.tp_rxhash) + verify_rxhash(frame + hdr->tp_net, hdr->hv1.tp_rxhash, + ring->cpu); + else + frames_nohash++; + + return frame + hdr->tp_next_offset; +} + +/* A single TPACKET_V3 block can hold multiple frames */ +static void recv_block(struct ring_state *ring) +{ + struct tpacket_block_desc *block; + char *frame; + int i; + + block = (void *)(ring->mmap + ring->idx * ring_block_sz); + if (!(block->hdr.bh1.block_status & TP_STATUS_USER)) + return; + + frame = (char *)block; + frame += block->hdr.bh1.offset_to_first_pkt; + + for (i = 0; i < block->hdr.bh1.num_pkts; i++) { + frame = recv_frame(ring, frame); + frames_received++; + } + + block->hdr.bh1.block_status = TP_STATUS_KERNEL; + ring->idx = (ring->idx + 1) % ring_block_nr; +} + +/* simple test: sleep once unconditionally and then process all rings */ +static void process_rings(void) +{ + int i; + + usleep(1000 * cfg_timeout_msec); + + for (i = 0; i < num_cpus; i++) + recv_block(&rings[i]); + + fprintf(stderr, "count: pass=%u nohash=%u fail=%u\n", + frames_received - frames_nohash - frames_error, + frames_nohash, frames_error); +} + +static char *setup_ring(int fd) +{ + struct tpacket_req3 req3 = {0}; + void *ring; + + req3.tp_retire_blk_tov = cfg_timeout_msec; + req3.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; + + req3.tp_frame_size = 2048; + req3.tp_frame_nr = 1 << 10; + req3.tp_block_nr = 2; + + req3.tp_block_size = req3.tp_frame_size * req3.tp_frame_nr; + req3.tp_block_size /= req3.tp_block_nr; + + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3))) + error(1, errno, "setsockopt PACKET_RX_RING"); + + ring_block_sz = req3.tp_block_size; + ring_block_nr = req3.tp_block_nr; + + ring = mmap(0, req3.tp_block_size * req3.tp_block_nr, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED | MAP_POPULATE, fd, 0); + if (ring == MAP_FAILED) + error(1, 0, "mmap failed"); + + return ring; +} + 
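+/* A quick sanity check of the geometry chosen above: 2048-byte frames x
+ * 1024 frames over 2 blocks means two 1 MiB blocks per ring; a block is
+ * handed to user space when it fills up or when tp_retire_blk_tov expires.
+ */
+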
+static void __set_filter(int fd, int off_proto, uint8_t proto, int off_dport) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, off_proto), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, proto, 0, 2), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, off_dport), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_dport, 1, 0), + BPF_STMT(BPF_RET + BPF_K, 0), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = sizeof(filter) / sizeof(struct sock_filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +/* filter on transport protocol and destination port */ +static void set_filter(int fd) +{ + const int off_dport = offsetof(struct tcphdr, dest); /* same for udp */ + uint8_t proto; + + proto = cfg_type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; + if (cfg_family == AF_INET) + __set_filter(fd, offsetof(struct iphdr, protocol), proto, + sizeof(struct iphdr) + off_dport); + else + __set_filter(fd, offsetof(struct ip6_hdr, ip6_nxt), proto, + sizeof(struct ip6_hdr) + off_dport); +} + +/* drop everything: used temporarily during setup */ +static void set_filter_null(int fd) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET + BPF_K, 0), + }; + struct sock_fprog prog = {}; + + prog.filter = filter; + prog.len = sizeof(filter) / sizeof(struct sock_filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter"); +} + +static int create_ring(char **ring) +{ + struct fanout_args args = { + .id = 1, + .type_flags = PACKET_FANOUT_CPU, + .max_num_members = RSS_MAX_CPUS + }; + struct sockaddr_ll ll = { 0 }; + int fd, val; + + fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket creation failed"); + + val = TPACKET_V3; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt PACKET_VERSION"); + *ring = setup_ring(fd); + + /* block packets until all rings are added to the fanout group: + * else packets can arrive during setup and get misclassified + */ + set_filter_null(fd); + + ll.sll_family = AF_PACKET; + ll.sll_ifindex = if_nametoindex(cfg_ifname); + ll.sll_protocol = cfg_family == AF_INET ? htons(ETH_P_IP) : + htons(ETH_P_IPV6); + if (bind(fd, (void *)&ll, sizeof(ll))) + error(1, errno, "bind"); + + /* must come after bind: verifies all programs in group match */ + if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &args, sizeof(args))) { + /* on failure, retry using old API if that is sufficient: + * it has a hard limit of 256 sockets, so only try if + * (a) only testing rxhash, not RSS or (b) <= 256 cpus. + * in this API, the third argument is left implicit. 
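+		 * note: the first u32 of fanout_args matches the legacy layout,
+		 * with the group id in the low 16 bits and the fanout type in
+		 * the high 16 bits, so passing sizeof(uint32_t) selects it.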
+ */ + if (cfg_num_queues || num_cpus > 256 || + setsockopt(fd, SOL_PACKET, PACKET_FANOUT, + &args, sizeof(uint32_t))) + error(1, errno, "setsockopt PACKET_FANOUT cpu"); + } + + return fd; +} + +/* setup inet(6) socket to blackhole the test traffic, if arg '-s' */ +static int setup_sink(void) +{ + int fd, val; + + fd = socket(cfg_family, cfg_type, 0); + if (fd == -1) + error(1, errno, "socket %d.%d", cfg_family, cfg_type); + + val = 1 << 20; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &val, sizeof(val))) + error(1, errno, "setsockopt rcvbuf"); + + return fd; +} + +static void setup_rings(void) +{ + int i; + + for (i = 0; i < num_cpus; i++) { + rings[i].cpu = i; + rings[i].fd = create_ring(&rings[i].mmap); + } + + /* accept packets once all rings in the fanout group are up */ + for (i = 0; i < num_cpus; i++) + set_filter(rings[i].fd); +} + +static void cleanup_rings(void) +{ + int i; + + for (i = 0; i < num_cpus; i++) { + if (munmap(rings[i].mmap, ring_block_nr * ring_block_sz)) + error(1, errno, "munmap"); + if (close(rings[i].fd)) + error(1, errno, "close"); + } +} + +static void parse_cpulist(const char *arg) +{ + do { + rx_irq_cpus[cfg_num_queues++] = strtol(arg, NULL, 10); + + arg = strchr(arg, ','); + if (!arg) + break; + arg++; // skip ',' + } while (1); +} + +static void show_cpulist(void) +{ + int i; + + for (i = 0; i < cfg_num_queues; i++) + fprintf(stderr, "rxq %d: cpu %d\n", i, rx_irq_cpus[i]); +} + +static void show_silos(void) +{ + int i; + + for (i = 0; i < cfg_num_rps_cpus; i++) + fprintf(stderr, "silo %d: cpu %d\n", i, rps_silo_to_cpu[i]); +} + +static void parse_toeplitz_key(const char *str, int slen, unsigned char *key) +{ + int i, ret, off; + + if (slen < TOEPLITZ_STR_MIN_LEN || + slen > TOEPLITZ_STR_MAX_LEN + 1) + error(1, 0, "invalid toeplitz key"); + + for (i = 0, off = 0; off < slen; i++, off += 3) { + ret = sscanf(str + off, "%hhx", &key[i]); + if (ret != 1) + error(1, 0, "key parse error at %d off %d len %d", + i, off, slen); + } +} + +static void parse_rps_bitmap(const char *arg) +{ + unsigned long bitmap; + int i; + + bitmap = strtoul(arg, NULL, 0); + + if (bitmap & ~(RPS_MAX_CPUS - 1)) + error(1, 0, "rps bitmap 0x%lx out of bounds 0..%lu", + bitmap, RPS_MAX_CPUS - 1); + + for (i = 0; i < RPS_MAX_CPUS; i++) + if (bitmap & 1UL << i) + rps_silo_to_cpu[cfg_num_rps_cpus++] = i; +} + +static void parse_opts(int argc, char **argv) +{ + static struct option long_options[] = { + {"dport", required_argument, 0, 'd'}, + {"cpus", required_argument, 0, 'C'}, + {"key", required_argument, 0, 'k'}, + {"iface", required_argument, 0, 'i'}, + {"ipv4", no_argument, 0, '4'}, + {"ipv6", no_argument, 0, '6'}, + {"sink", no_argument, 0, 's'}, + {"tcp", no_argument, 0, 't'}, + {"timeout", required_argument, 0, 'T'}, + {"udp", no_argument, 0, 'u'}, + {"verbose", no_argument, 0, 'v'}, + {"rps", required_argument, 0, 'r'}, + {0, 0, 0, 0} + }; + bool have_toeplitz = false; + int index, c; + + while ((c = getopt_long(argc, argv, "46C:d:i:k:r:stT:u:v", long_options, &index)) != -1) { + switch (c) { + case '4': + cfg_family = AF_INET; + break; + case '6': + cfg_family = AF_INET6; + break; + case 'C': + parse_cpulist(optarg); + break; + case 'd': + cfg_dport = strtol(optarg, NULL, 0); + break; + case 'i': + cfg_ifname = optarg; + break; + case 'k': + parse_toeplitz_key(optarg, strlen(optarg), + toeplitz_key); + have_toeplitz = true; + break; + case 'r': + parse_rps_bitmap(optarg); + break; + case 's': + cfg_sink = true; + break; + case 't': + cfg_type = SOCK_STREAM; + break; + case 'T': + 
cfg_timeout_msec = strtol(optarg, NULL, 0); + break; + case 'u': + cfg_type = SOCK_DGRAM; + break; + case 'v': + cfg_verbose = true; + break; + + default: + error(1, 0, "unknown option %c", optopt); + break; + } + } + + if (!have_toeplitz) + error(1, 0, "Must supply rss key ('-k')"); + + num_cpus = get_nprocs(); + if (num_cpus > RSS_MAX_CPUS) + error(1, 0, "increase RSS_MAX_CPUS"); + + if (cfg_num_queues && cfg_num_rps_cpus) + error(1, 0, + "Can't supply both RSS cpus ('-C') and RPS map ('-r')"); + if (cfg_verbose) { + show_cpulist(); + show_silos(); + } +} + +int main(int argc, char **argv) +{ + const int min_tests = 10; + int fd_sink = -1; + + parse_opts(argc, argv); + + if (cfg_sink) + fd_sink = setup_sink(); + + setup_rings(); + process_rings(); + cleanup_rings(); + + if (cfg_sink && close(fd_sink)) + error(1, errno, "close sink"); + + if (frames_received - frames_nohash < min_tests) + error(1, 0, "too few frames for verification"); + + return frames_error; +} diff --git a/tools/testing/selftests/net/toeplitz.sh b/tools/testing/selftests/net/toeplitz.sh new file mode 100755 index 000000000000..0a49907cd4fe --- /dev/null +++ b/tools/testing/selftests/net/toeplitz.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# extended toeplitz test: test rxhash plus, optionally, either (1) rss mapping +# from rxhash to rx queue ('-rss') or (2) rps mapping from rxhash to cpu +# ('-rps <rps_map>') +# +# irq-pattern-prefix can be derived from /sys/kernel/irq/*/action, +# which is a driver-specific encoding. +# +# invoke as ./toeplitz.sh (-i <iface>) -u|-t -4|-6 \ +# [(-rss -irq_prefix <irq-pattern-prefix>)|(-rps <rps_map>)] + +source setup_loopback.sh +readonly SERVER_IP4="192.168.1.200/24" +readonly SERVER_IP6="fda8::1/64" +readonly SERVER_MAC="aa:00:00:00:00:02" + +readonly CLIENT_IP4="192.168.1.100/24" +readonly CLIENT_IP6="fda8::2/64" +readonly CLIENT_MAC="aa:00:00:00:00:01" + +PORT=8000 +KEY="$(</proc/sys/net/core/netdev_rss_key)" +TEST_RSS=false +RPS_MAP="" +PROTO_FLAG="" +IP_FLAG="" +DEV="eth0" + +# Return the number of rxqs among which RSS is configured to spread packets. +# This is determined by reading the RSS indirection table using ethtool. +get_rss_cfg_num_rxqs() { + echo $(ethtool -x "${DEV}" | + egrep [[:space:]]+[0-9]+:[[:space:]]+ | + cut -d: -f2- | + awk '{$1=$1};1' | + tr ' ' '\n' | + sort -u | + wc -l) +} + +# Return a list of the receive irq handler cpus. +# The list is ordered by the irqs, so first rxq-0 cpu, then rxq-1 cpu, etc. +# Reads /sys/kernel/irq/ in order, so algorithm depends on +# irq_{rxq-0} < irq_{rxq-1}, etc. +get_rx_irq_cpus() { + CPUS="" + # sort so that irq 2 is read before irq 10 + SORTED_IRQS=$(for i in /sys/kernel/irq/*; do echo $i; done | sort -V) + # Consider only as many queues as RSS actually uses. We assume that + # if RSS_CFG_NUM_RXQS=N, then RSS uses rxqs 0-(N-1). 
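+	# Each of these irqs is also assumed to be pinned to a single cpu,
+	# since /proc/irq/<nr>/smp_affinity_list is consumed as-is below.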
+ RSS_CFG_NUM_RXQS=$(get_rss_cfg_num_rxqs) + RXQ_COUNT=0 + + for i in ${SORTED_IRQS} + do + [[ "${RXQ_COUNT}" -lt "${RSS_CFG_NUM_RXQS}" ]] || break + # lookup relevant IRQs by action name + [[ -e "$i/actions" ]] || continue + cat "$i/actions" | grep -q "${IRQ_PATTERN}" || continue + irqname=$(<"$i/actions") + + # does the IRQ get called + irqcount=$(cat "$i/per_cpu_count" | tr -d '0,') + [[ -n "${irqcount}" ]] || continue + + # lookup CPU + irq=$(basename "$i") + cpu=$(cat "/proc/irq/$irq/smp_affinity_list") + + if [[ -z "${CPUS}" ]]; then + CPUS="${cpu}" + else + CPUS="${CPUS},${cpu}" + fi + RXQ_COUNT=$((RXQ_COUNT+1)) + done + + echo "${CPUS}" +} + +get_disable_rfs_cmd() { + echo "echo 0 > /proc/sys/net/core/rps_sock_flow_entries;" +} + +get_set_rps_bitmaps_cmd() { + CMD="" + for i in /sys/class/net/${DEV}/queues/rx-*/rps_cpus + do + CMD="${CMD} echo $1 > ${i};" + done + + echo "${CMD}" +} + +get_disable_rps_cmd() { + echo "$(get_set_rps_bitmaps_cmd 0)" +} + +die() { + echo "$1" + exit 1 +} + +check_nic_rxhash_enabled() { + local -r pattern="receive-hashing:\ on" + + ethtool -k "${DEV}" | grep -q "${pattern}" || die "rxhash must be enabled" +} + +parse_opts() { + local prog=$0 + shift 1 + + while [[ "$1" =~ "-" ]]; do + if [[ "$1" = "-irq_prefix" ]]; then + shift + IRQ_PATTERN="^$1-[0-9]*$" + elif [[ "$1" = "-u" || "$1" = "-t" ]]; then + PROTO_FLAG="$1" + elif [[ "$1" = "-4" ]]; then + IP_FLAG="$1" + SERVER_IP="${SERVER_IP4}" + CLIENT_IP="${CLIENT_IP4}" + elif [[ "$1" = "-6" ]]; then + IP_FLAG="$1" + SERVER_IP="${SERVER_IP6}" + CLIENT_IP="${CLIENT_IP6}" + elif [[ "$1" = "-rss" ]]; then + TEST_RSS=true + elif [[ "$1" = "-rps" ]]; then + shift + RPS_MAP="$1" + elif [[ "$1" = "-i" ]]; then + shift + DEV="$1" + else + die "Usage: ${prog} (-i <iface>) -u|-t -4|-6 \ + [(-rss -irq_prefix <irq-pattern-prefix>)|(-rps <rps_map>)]" + fi + shift + done +} + +setup() { + setup_loopback_environment "${DEV}" + + # Set up server_ns namespace and client_ns namespace + setup_macvlan_ns "${DEV}" server_ns server \ + "${SERVER_MAC}" "${SERVER_IP}" + setup_macvlan_ns "${DEV}" client_ns client \ + "${CLIENT_MAC}" "${CLIENT_IP}" +} + +cleanup() { + cleanup_macvlan_ns server_ns server client_ns client + cleanup_loopback "${DEV}" +} + +parse_opts $0 $@ + +setup +trap cleanup EXIT + +check_nic_rxhash_enabled + +# Actual test starts here +if [[ "${TEST_RSS}" = true ]]; then + # RPS/RFS must be disabled because they move packets between cpus, + # which breaks the PACKET_FANOUT_CPU identification of RSS decisions. + eval "$(get_disable_rfs_cmd) $(get_disable_rps_cmd)" \ + ip netns exec server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ + -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 \ + -C "$(get_rx_irq_cpus)" -s -v & +elif [[ ! -z "${RPS_MAP}" ]]; then + eval "$(get_disable_rfs_cmd) $(get_set_rps_bitmaps_cmd ${RPS_MAP})" \ + ip netns exec server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ + -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 \ + -r "0x${RPS_MAP}" -s -v & +else + ip netns exec server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ + -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 -s -v & +fi + +server_pid=$! + +ip netns exec client_ns ./toeplitz_client.sh "${PROTO_FLAG}" \ + "${IP_FLAG}" "${SERVER_IP%%/*}" "${PORT}" & + +client_pid=$! + +wait "${server_pid}" +exit_code=$? +kill -9 "${client_pid}" +if [[ "${exit_code}" -eq 0 ]]; then + echo "Test Succeeded!" 
+fi +exit "${exit_code}" diff --git a/tools/testing/selftests/net/toeplitz_client.sh b/tools/testing/selftests/net/toeplitz_client.sh new file mode 100755 index 000000000000..2fef34f4aba1 --- /dev/null +++ b/tools/testing/selftests/net/toeplitz_client.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# A simple program for generating traffic for the toeplitz test. +# +# This program sends packets periodically for, conservatively, 20 seconds. The +# intent is for the calling program to kill this program once it is no longer +# needed, rather than waiting for the 20 second expiration. + +send_traffic() { + expiration=$((SECONDS+20)) + while [[ "${SECONDS}" -lt "${expiration}" ]] + do + if [[ "${PROTO}" == "-u" ]]; then + echo "msg $i" | nc "${IPVER}" -u -w 0 "${ADDR}" "${PORT}" + else + echo "msg $i" | nc "${IPVER}" -w 0 "${ADDR}" "${PORT}" + fi + sleep 0.001 + done +} + +PROTO=$1 +IPVER=$2 +ADDR=$3 +PORT=$4 + +send_traffic diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 11d7cdb898c0..19eac3e44c06 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -13,7 +13,7 @@ readonly NS_DST=$BASE$DST readonly BM_NET_V4=192.168.1. readonly BM_NET_V6=2001:db8:: -readonly NPROCS=`nproc` +readonly CPUS=`nproc` ret=0 cleanup() { @@ -75,6 +75,29 @@ chk_tso_flag() { __chk_flag "$1" $2 $3 tcp-segmentation-offload } +chk_channels() { + local msg="$1" + local target=$2 + local rx=$3 + local tx=$4 + + local dev=veth$target + + local cur_rx=`ip netns exec $BASE$target ethtool -l $dev |\ + grep RX: | tail -n 1 | awk '{print $2}' ` + local cur_tx=`ip netns exec $BASE$target ethtool -l $dev |\ + grep TX: | tail -n 1 | awk '{print $2}'` + local cur_combined=`ip netns exec $BASE$target ethtool -l $dev |\ + grep Combined: | tail -n 1 | awk '{print $2}'` + + printf "%-60s" "$msg" + if [ "$cur_rx" = "$rx" -a "$cur_tx" = "$tx" -a "$cur_combined" = "n/a" ]; then + echo " ok " + else + echo " fail rx:$rx:$cur_rx tx:$tx:$cur_tx combined:n/a:$cur_combined" + fi +} + chk_gro() { local msg="$1" local expected=$2 @@ -107,11 +130,100 @@ chk_gro() { fi } +__change_channels() +{ + local cur_cpu + local end=$1 + local cur + local i + + while true; do + printf -v cur '%(%s)T' + [ $cur -le $end ] || break + + for i in `seq 1 $CPUS`; do + ip netns exec $NS_SRC ethtool -L veth$SRC rx $i tx $i + ip netns exec $NS_DST ethtool -L veth$DST rx $i tx $i + done + + for i in `seq 1 $((CPUS - 1))`; do + cur_cpu=$((CPUS - $i)) + ip netns exec $NS_SRC ethtool -L veth$SRC rx $cur_cpu tx $cur_cpu + ip netns exec $NS_DST ethtool -L veth$DST rx $cur_cpu tx $cur_cpu + done + done +} + +__send_data() { + local end=$1 + + while true; do + printf -v cur '%(%s)T' + [ $cur -le $end ] || break + + ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 1000 -M 300 -D $BM_NET_V4$DST + done +} + +do_stress() { + local end + printf -v end '%(%s)T' + end=$((end + $STRESS)) + + ip netns exec $NS_SRC ethtool -L veth$SRC rx 3 tx 3 + ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3 + + ip netns exec $NS_DST ./udpgso_bench_rx & + local rx_pid=$! + + echo "Running stress test for $STRESS seconds..." + __change_channels $end & + local ch_pid=$! + __send_data $end & + local data_pid_1=$! + __send_data $end & + local data_pid_2=$! + __send_data $end & + local data_pid_3=$! + __send_data $end & + local data_pid_4=$! 
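+ # one worker keeps resizing the rings while four senders keep the
+ # queues busy, stressing the veth channel setup/teardown paths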
+ + wait $ch_pid $data_pid_1 $data_pid_2 $data_pid_3 $data_pid_4 + kill -9 $rx_pid + echo "done" + + # restore previous setting + ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 tx 2 + ip netns exec $NS_DST ethtool -L veth$DST rx 2 tx 1 +} + +usage() { + echo "Usage: $0 [-h] [-s <seconds>]" + echo -e "\t-h: show this help" + echo -e "\t-s: run optional stress tests for the given amount of seconds" +} + +STRESS=0 +while getopts "hs:" option; do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "s") + STRESS=$OPTARG + ;; + esac +done + if [ ! -f ../bpf/xdp_dummy.o ]; then echo "Missing xdp_dummy helper. Build bpf selftest first" exit 1 fi +[ $CPUS -lt 2 ] && echo "Only one CPU available, some tests will be skipped" +[ $STRESS -gt 0 -a $CPUS -lt 3 ] && echo " stress test will be skipped, too" + create_ns chk_gro_flag "default - gro flag" $SRC off chk_gro_flag " - peer gro flag" $DST off @@ -134,6 +246,8 @@ chk_gro " - aggregation with TSO off" 1 cleanup create_ns +chk_channels "default channels" $DST 1 1 + ip -n $NS_DST link set dev veth$DST down ip netns exec $NS_DST ethtool -K veth$DST gro on chk_gro_flag "with gro enabled on link down - gro flag" $DST on @@ -147,6 +261,56 @@ chk_gro " - aggregation with TSO off" 1 cleanup create_ns + +CUR_TX=1 +CUR_RX=1 +if [ $CPUS -gt 1 ]; then + ip netns exec $NS_DST ethtool -L veth$DST tx 2 + chk_channels "setting tx channels" $DST 1 2 + CUR_TX=2 +fi + +if [ $CPUS -gt 2 ]; then + ip netns exec $NS_DST ethtool -L veth$DST rx 3 tx 3 + chk_channels "setting both rx and tx channels" $DST 3 3 + CUR_RX=3 + CUR_TX=3 +fi + +ip netns exec $NS_DST ethtool -L veth$DST combined 2 2>/dev/null +chk_channels "bad setting: combined channels" $DST $CUR_RX $CUR_TX + +ip netns exec $NS_DST ethtool -L veth$DST tx $((CPUS + 1)) 2>/dev/null +chk_channels "setting invalid channels nr" $DST $CUR_RX $CUR_TX + +if [ $CPUS -gt 1 ]; then + # this also tests queues nr reduction + ip netns exec $NS_DST ethtool -L veth$DST rx 1 tx 2 2>/dev/null + ip netns exec $NS_SRC ethtool -L veth$SRC rx 1 tx 2 2>/dev/null + printf "%-60s" "bad setting: XDP with RX nr less than TX" + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + section xdp_dummy 2>/dev/null &&\ + echo "fail - set operation successful ?!?" || echo " ok " + + # the following tests will run with multiple channels active + ip netns exec $NS_SRC ethtool -L veth$SRC rx 2 + ip netns exec $NS_DST ethtool -L veth$DST rx 2 + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o \ + section xdp_dummy 2>/dev/null + printf "%-60s" "bad setting: reducing RX nr below peer TX with XDP set" + ip netns exec $NS_DST ethtool -L veth$DST rx 1 2>/dev/null &&\ + echo "fail - set operation successful ?!?" || echo " ok " + CUR_RX=2 + CUR_TX=2 +fi + +if [ $CPUS -gt 2 ]; then + printf "%-60s" "bad setting: increasing peer TX nr above RX with XDP set" + ip netns exec $NS_SRC ethtool -L veth$SRC tx 3 2>/dev/null &&\ + echo "fail - set operation successful ?!?" 
|| echo " ok " + chk_channels "setting invalid channels nr" $DST 2 2 +fi + ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null chk_gro_flag "with xdp attached - gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off @@ -167,10 +331,27 @@ chk_gro_flag " - after gro on xdp off, gro flag" $DST on chk_gro_flag " - peer gro flag" $SRC off chk_tso_flag " - tso flag" $SRC on chk_tso_flag " - peer tso flag" $DST on + +if [ $CPUS -gt 1 ]; then + ip netns exec $NS_DST ethtool -L veth$DST tx 1 + chk_channels "decreasing tx channels with device down" $DST 2 1 +fi + ip -n $NS_DST link set dev veth$DST up ip -n $NS_SRC link set dev veth$SRC up chk_gro " - aggregation" 1 +if [ $CPUS -gt 1 ]; then + [ $STRESS -gt 0 -a $CPUS -gt 2 ] && do_stress + + ip -n $NS_DST link set dev veth$DST down + ip -n $NS_SRC link set dev veth$SRC down + ip netns exec $NS_DST ethtool -L veth$DST tx 2 + chk_channels "increasing tx channels with device down" $DST 2 2 + ip -n $NS_DST link set dev veth$DST up + ip -n $NS_SRC link set dev veth$SRC up +fi + ip netns exec $NS_DST ethtool -K veth$DST gro off ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off chk_gro "aggregation again with default and TSO off" 10 diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json index 6eb4c4f97060..742f2290973e 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json @@ -417,5 +417,29 @@ "teardown": [ "$TC actions flush action skbmod" ] + }, + { + "id": "fe09", + "name": "Add skbmod action to mark ECN bits", + "category": [ + "actions", + "skbmod" + ], + "setup": [ + [ + "$TC actions flush action skbmod", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action skbmod ecn", + "expExitCode": "0", + "verifyCmd": "$TC actions get action skbmod index 1", + "matchPattern": "action order [0-9]*: skbmod pipe ecn", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbmod" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json new file mode 100644 index 000000000000..88a20c781e49 --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json @@ -0,0 +1,137 @@ +[ + { + "id": "ce7d", + "name": "Add mq Qdisc to multi-queue device (4 queues)", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "4", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2f82", + "name": "Add mq Qdisc to multi-queue device (256 queues)", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 256\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-9,a-f][0-9,a-f]{0,2} bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "256", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "c525", 
+ "name": "Add duplicate mq Qdisc", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH root handle 1: mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "4", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "128a", + "name": "Delete nonexistent mq Qdisc", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc del dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "03a9", + "name": "Delete mq Qdisc twice", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH root handle 1: mq", + "$TC qdisc del dev $ETH root handle 1: mq" + ], + "cmdUnderTest": "$TC qdisc del dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "be0f", + "name": "Add mq Qdisc to single-queue device", + "category": [ + "qdisc", + "mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + } +] diff --git a/tools/testing/selftests/tc-testing/tdc_config.py b/tools/testing/selftests/tc-testing/tdc_config.py index cd4a27ee1466..ea04f04c173e 100644 --- a/tools/testing/selftests/tc-testing/tdc_config.py +++ b/tools/testing/selftests/tc-testing/tdc_config.py @@ -17,6 +17,7 @@ NAMES = { 'DEV1': 'v0p1', 'DEV2': '', 'DUMMY': 'dummy1', + 'ETH': 'eth0', 'BATCH_FILE': './batch.txt', 'BATCH_DIR': 'tmp', # Length of time in seconds to wait before terminating a command |