From b66c9b8de22b666718c2fcb0ae84ce620f9b81c0 Mon Sep 17 00:00:00 2001 From: Lourdes Pedrajas Date: Sun, 19 Apr 2020 11:16:51 +0200 Subject: selftests: pmtu: implement IPIP, SIT and ip6tnl PMTU discovery tests Add PMTU discovery tests for these encapsulations: - IPIP - SIT, mode ip6ip - ip6tnl, modes ip6ip6 and ipip6 Signed-off-by: Lourdes Pedrajas Reviewed-by: Stefano Brivio Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 122 ++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 71a62e7e35b1..77c09cd339c3 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -67,6 +67,10 @@ # Same as pmtu_ipv4_vxlan4, but using a generic UDP IPv4/IPv6 # encapsulation (GUE) over IPv4/IPv6, instead of VXLAN # +# - pmtu_ipv{4,6}_ipv{4,6}_exception +# Same as pmtu_ipv4_vxlan4, but using a IPv4/IPv6 tunnel over IPv4/IPv6, +# instead of VXLAN +# # - pmtu_vti4_exception # Set up vti tunnel on top of veth, with xfrm states and policies, in two # namespaces with matching endpoints. 
Check that route exception is not @@ -151,6 +155,10 @@ tests=" pmtu_ipv6_gue4_exception IPv6 over gue4: PMTU exceptions 1 pmtu_ipv4_gue6_exception IPv4 over gue6: PMTU exceptions 1 pmtu_ipv6_gue6_exception IPv6 over gue6: PMTU exceptions 1 + pmtu_ipv4_ipv4_exception IPv4 over IPv4: PMTU exceptions 1 + pmtu_ipv6_ipv4_exception IPv6 over IPv4: PMTU exceptions 1 + pmtu_ipv4_ipv6_exception IPv4 over IPv6: PMTU exceptions 1 + pmtu_ipv6_ipv6_exception IPv6 over IPv6: PMTU exceptions 1 pmtu_vti6_exception vti6: PMTU exceptions 0 pmtu_vti4_exception vti4: PMTU exceptions 0 pmtu_vti4_default_mtu vti4: default MTU assignment 0 @@ -363,6 +371,62 @@ setup_gue66() { setup_fou_or_gue 6 6 gue } +setup_ipvX_over_ipvY() { + inner=${1} + outer=${2} + + if [ "${outer}" -eq 4 ]; then + a_addr="${prefix4}.${a_r1}.1" + b_addr="${prefix4}.${b_r1}.1" + if [ "${inner}" -eq 4 ]; then + type="ipip" + mode="ipip" + else + type="sit" + mode="ip6ip" + fi + else + a_addr="${prefix6}:${a_r1}::1" + b_addr="${prefix6}:${b_r1}::1" + type="ip6tnl" + if [ "${inner}" -eq 4 ]; then + mode="ipip6" + else + mode="ip6ip6" + fi + fi + + run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return 2 + run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode} + + run_cmd ${ns_a} ip link set ip_a up + run_cmd ${ns_b} ip link set ip_b up + + if [ "${inner}" = "4" ]; then + run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ip_b + else + run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ip_b + fi +} + +setup_ip4ip4() { + setup_ipvX_over_ipvY 4 4 +} + +setup_ip6ip4() { + setup_ipvX_over_ipvY 6 4 +} + +setup_ip4ip6() { + setup_ipvX_over_ipvY 4 6 +} + +setup_ip6ip6() { + setup_ipvX_over_ipvY 6 6 +} + setup_namespaces() { for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do 
ip netns add ${n} || return 1 @@ -908,6 +972,64 @@ test_pmtu_ipv6_gue6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue } +test_pmtu_ipvX_over_ipvY_exception() { + inner=${1} + outer=${2} + ll_mtu=4000 + + setup namespaces routing ip${inner}ip${outer} || return 2 + + trace "${ns_a}" ip_a "${ns_b}" ip_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + if [ ${inner} -eq 4 ]; then + ping=ping + dst=${tunnel4_b_addr} + else + ping=${ping6} + dst=${tunnel6_b_addr} + fi + + if [ ${outer} -eq 4 ]; then + # IPv4 header + exp_mtu=$((${ll_mtu} - 20)) + else + # IPv6 header Option 4 + exp_mtu=$((${ll_mtu} - 40 - 8)) + fi + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ip_a $((${ll_mtu} + 1000)) || return + mtu "${ns_b}" ip_b $((${ll_mtu} + 1000)) || return + run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} + + # Check that exception was created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ip${inner}ip${outer} interface" +} + +test_pmtu_ipv4_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 4 +} + +test_pmtu_ipv6_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 4 +} + +test_pmtu_ipv4_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 6 +} + +test_pmtu_ipv6_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 6 +} + test_pmtu_vti4_exception() { setup namespaces veth vti4 xfrm4 || return 2 trace "${ns_a}" veth_a "${ns_b}" veth_b \ -- cgit v1.2.3 From 3f251d741150265cfa7c84d30d105612449601ab Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 21 Apr 2020 18:40:22 -0600 Subject: selftests: Add tests for vrf and xfrms Add tests for vrf and xfrms with a second round after adding a qdisc. 
There are a few known problems documented with the test cases that fail. The fix is non-trivial; will come back to it when time allows. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/vrf-xfrm-tests.sh | 436 ++++++++++++++++++++++++++ 2 files changed, 437 insertions(+) create mode 100755 tools/testing/selftests/net/vrf-xfrm-tests.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3f386eb9e7d7..895ec992b2f1 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -16,6 +16,7 @@ TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh TEST_PROGS += route_localnet.sh TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh +TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh new file mode 100755 index 000000000000..184da81f554f --- /dev/null +++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh @@ -0,0 +1,436 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Various combinations of VRF with xfrms and qdisc. + +# Kselftest framework requirement - SKIP code is 4. 
+ksft_skip=4 + +PAUSE_ON_FAIL=no +VERBOSE=0 +ret=0 + +HOST1_4=192.168.1.1 +HOST2_4=192.168.1.2 +HOST1_6=2001:db8:1::1 +HOST2_6=2001:db8:1::2 + +XFRM1_4=10.0.1.1 +XFRM2_4=10.0.1.2 +XFRM1_6=fc00:1000::1 +XFRM2_6=fc00:1000::2 +IF_ID=123 + +VRF=red +TABLE=300 + +AUTH_1=0xd94fcfea65fddf21dc6e0d24a0253508 +AUTH_2=0xdc6e0d24a0253508d94fcfea65fddf21 +ENC_1=0xfc46c20f8048be9725930ff3fb07ac2a91f0347dffeacf62 +ENC_2=0x3fb07ac2a91f0347dffeacf62fc46c20f8048be9725930ff +SPI_1=0x02122b77 +SPI_2=0x2b770212 + +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + +################################################################################ +# +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd_host1() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + printf " COMMAND: $cmd\n" + fi + + out=$(eval ip netns exec host1 $cmd 2>&1) + rc=$? 
+ if [ "$VERBOSE" = "1" ]; then + if [ -n "$out" ]; then + echo + echo " $out" + fi + echo + fi + + return $rc +} + +################################################################################ +# create namespaces for hosts and sws + +create_vrf() +{ + local ns=$1 + local vrf=$2 + local table=$3 + + if [ -n "${ns}" ]; then + ns="-netns ${ns}" + fi + + ip ${ns} link add ${vrf} type vrf table ${table} + ip ${ns} link set ${vrf} up + ip ${ns} route add vrf ${vrf} unreachable default metric 8192 + ip ${ns} -6 route add vrf ${vrf} unreachable default metric 8192 + + ip ${ns} addr add 127.0.0.1/8 dev ${vrf} + ip ${ns} -6 addr add ::1 dev ${vrf} nodad + + ip ${ns} ru del pref 0 + ip ${ns} ru add pref 32765 from all lookup local + ip ${ns} -6 ru del pref 0 + ip ${ns} -6 ru add pref 32765 from all lookup local +} + +create_ns() +{ + local ns=$1 + local addr=$2 + local addr6=$3 + + [ -z "${addr}" ] && addr="-" + [ -z "${addr6}" ] && addr6="-" + + ip netns add ${ns} + + ip -netns ${ns} link set lo up + if [ "${addr}" != "-" ]; then + ip -netns ${ns} addr add dev lo ${addr} + fi + if [ "${addr6}" != "-" ]; then + ip -netns ${ns} -6 addr add dev lo ${addr6} + fi + + ip -netns ${ns} ro add unreachable default metric 8192 + ip -netns ${ns} -6 ro add unreachable default metric 8192 + + ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0 +} + +# create veth pair to connect namespaces and apply addresses. 
+connect_ns() +{ + local ns1=$1 + local ns1_dev=$2 + local ns1_addr=$3 + local ns1_addr6=$4 + local ns2=$5 + local ns2_dev=$6 + local ns2_addr=$7 + local ns2_addr6=$8 + local ns1arg + local ns2arg + + if [ -n "${ns1}" ]; then + ns1arg="-netns ${ns1}" + fi + if [ -n "${ns2}" ]; then + ns2arg="-netns ${ns2}" + fi + + ip ${ns1arg} li add ${ns1_dev} type veth peer name tmp + ip ${ns1arg} li set ${ns1_dev} up + ip ${ns1arg} li set tmp netns ${ns2} name ${ns2_dev} + ip ${ns2arg} li set ${ns2_dev} up + + if [ "${ns1_addr}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr} + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr} + fi + + if [ "${ns1_addr6}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr6} nodad + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr6} nodad + fi +} + +################################################################################ + +cleanup() +{ + ip netns del host1 + ip netns del host2 +} + +setup() +{ + create_ns "host1" + create_ns "host2" + + connect_ns "host1" eth0 ${HOST1_4}/24 ${HOST1_6}/64 \ + "host2" eth0 ${HOST2_4}/24 ${HOST2_6}/64 + + create_vrf "host1" ${VRF} ${TABLE} + ip -netns host1 link set dev eth0 master ${VRF} +} + +cleanup_xfrm() +{ + for ns in host1 host2 + do + for x in state policy + do + ip -netns ${ns} xfrm ${x} flush + ip -6 -netns ${ns} xfrm ${x} flush + done + done +} + +setup_xfrm() +{ + local h1_4=$1 + local h2_4=$2 + local h1_6=$3 + local h2_6=$4 + local devarg="$5" + + # + # policy + # + + # host1 - IPv4 out + ip -netns host1 xfrm policy add \ + src ${h1_4} dst ${h2_4} ${devarg} dir out \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host2 - IPv4 in + ip -netns host2 xfrm policy add \ + src ${h1_4} dst ${h2_4} dir in \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host1 - IPv4 in + ip -netns host1 xfrm policy add \ + src ${h2_4} dst ${h1_4} ${devarg} dir in \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + # host2 - IPv4 out + ip 
-netns host2 xfrm policy add \ + src ${h2_4} dst ${h1_4} dir out \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + + # host1 - IPv6 out + ip -6 -netns host1 xfrm policy add \ + src ${h1_6} dst ${h2_6} ${devarg} dir out \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host2 - IPv6 in + ip -6 -netns host2 xfrm policy add \ + src ${h1_6} dst ${h2_6} dir in \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host1 - IPv6 in + ip -6 -netns host1 xfrm policy add \ + src ${h2_6} dst ${h1_6} ${devarg} dir in \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # host2 - IPv6 out + ip -6 -netns host2 xfrm policy add \ + src ${h2_6} dst ${h1_6} dir out \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # + # state + # + ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} + + + ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} + + + ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 
'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} + + + ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} +} + +cleanup_xfrm_dev() +{ + ip -netns host1 li del xfrm0 + ip -netns host2 addr del ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr del ${XFRM2_6}/64 dev eth0 +} + +setup_xfrm_dev() +{ + local vrfarg="vrf ${VRF}" + + ip -netns host1 li add type xfrm dev eth0 if_id ${IF_ID} + ip -netns host1 li set xfrm0 ${vrfarg} up + ip -netns host1 addr add ${XFRM1_4}/24 dev xfrm0 + ip -netns host1 addr add ${XFRM1_6}/64 dev xfrm0 + + ip -netns host2 addr add ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr add ${XFRM2_6}/64 dev eth0 + + setup_xfrm ${XFRM1_4} ${XFRM2_4} ${XFRM1_6} ${XFRM2_6} "if_id ${IF_ID}" +} + +run_tests() +{ + cleanup_xfrm + + # no IPsec + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 0 "IPv4 no xfrm policy" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 no xfrm policy" + + # xfrm without VRF in sel + setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 
0 "IPv4 xfrm policy based on address" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy based on address" + cleanup_xfrm + + # xfrm with VRF in sel + # Known failure: ipv4 resets the flow oif after the lookup. Fix is + # not straightforward. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev ${VRF}" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with VRF in selector" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy with VRF in selector" + cleanup_xfrm + + # xfrm with enslaved device in sel + # Known failures: combined with the above, __xfrm{4,6}_selector_match + # needs to consider both l3mdev and enslaved device index. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev eth0" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with enslaved device in selector" + # run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + # log_test $? 0 "IPv6 xfrm policy with enslaved device in selector" + # cleanup_xfrm + + # xfrm device + setup_xfrm_dev + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${XFRM2_4} + log_test $? 0 "IPv4 xfrm policy with xfrm device" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${XFRM2_6} + log_test $? 
0 "IPv6 xfrm policy with xfrm device" + cleanup_xfrm_dev +} + +################################################################################ +# usage + +usage() +{ + cat </dev/null +setup + +echo +echo "No qdisc on VRF device" +run_tests + +run_cmd_host1 tc qdisc add dev ${VRF} root netem delay 100ms +echo +echo "netem qdisc on VRF device" +run_tests + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret -- cgit v1.2.3 From 93e106da6a7514445c1e27fdbb6b9810f3df8452 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 22 Apr 2020 19:48:29 +0300 Subject: selftests: forwarding: pedit_dsfield: Add pedit munge ip6 dsfield Extend the pedit_dsfield forwarding selftest with coverage of "pedit ex munge ip6 dsfield set". Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/net/forwarding/pedit_dsfield.sh | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index b50081855913..1181d647f6a7 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -20,10 +20,14 @@ ALL_TESTS=" ping_ipv4 + ping_ipv6 test_ip_dsfield test_ip_dscp test_ip_ecn test_ip_dscp_ecn + test_ip6_dsfield + test_ip6_dscp + test_ip6_ecn " NUM_NETIFS=4 @@ -107,6 +111,11 @@ ping_ipv4() ping_test $h1 192.0.2.2 } +ping_ipv6() +{ + ping6_test $h1 2001:db8:1::2 +} + do_test_pedit_dsfield_common() { local pedit_locus=$1; shift @@ -228,6 +237,63 @@ test_ip_dscp_ecn() do_test_ip_dscp_ecn "dev $swp2 egress" } +do_test_ip6_dsfield() +{ + local locus=$1; shift + local dsfield + + for dsfield in 0 1 2 3 128 252 253 254 255; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $dsfield" \ + ipv6 "ip_tos $dsfield" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dsfield() +{ + do_test_ip6_dsfield "dev 
$swp1 ingress" + do_test_ip6_dsfield "dev $swp2 egress" +} + +do_test_ip6_dscp() +{ + local locus=$1; shift + local dscp + + for dscp in 0 1 2 3 32 61 62 63; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $((dscp << 2)) retain 0xfc" \ + ipv6 "ip_tos $(((dscp << 2) | 1))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dscp() +{ + do_test_ip6_dscp "dev $swp1 ingress" + do_test_ip6_dscp "dev $swp2 egress" +} + +do_test_ip6_ecn() +{ + local locus=$1; shift + local ecn + + for ecn in 0 1 2 3; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $ecn retain 0x3" \ + ipv6 "ip_tos $((124 | $ecn))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_ecn() +{ + do_test_ip6_ecn "dev $swp1 ingress" + do_test_ip6_ecn "dev $swp2 egress" +} + trap cleanup EXIT setup_prepare -- cgit v1.2.3 From f132ccc56e35875655226915588cab63a16237ef Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 22 Apr 2020 19:48:30 +0300 Subject: selftests: tc-testing: Add a TDC test for pedit munge ip6 dsfield Add a self-test for the IPv6 dsfield munge that iproute2 will support. Signed-off-by: Petr Machata Signed-off-by: David S. 
Miller --- .../tc-testing/tc-tests/actions/pedit.json | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json index f8ea6f5fa8e9..72cdc3c800a5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json @@ -1471,6 +1471,31 @@ "$TC actions flush action pedit" ] }, + { + "id": "94bb", + "name": "Add pedit action with LAYERED_OP ip6 traffic_class", + "category": [ + "actions", + "pedit", + "layered_op" + ], + "setup": [ + [ + "$TC actions flush action pedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action pedit ex munge ip6 traffic_class set 0x40 continue", + "expExitCode": "0", + "verifyCmd": "$TC actions list action pedit", + "matchPattern": "ipv6\\+0: val 04000000 mask f00fffff", + "matchCount": "1", + "teardown": [ + "$TC actions flush action pedit" + ] + }, { "id": "6f5e", "name": "Add pedit action with LAYERED_OP ip6 flow_lbl", -- cgit v1.2.3 From 493f3cc7ee020a4c5da02f6502743d9ae7be50d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 22 Apr 2020 17:08:22 -0600 Subject: selftests: A few improvements to fib_nexthops.sh Add nodad when adding IPv6 addresses and remove the sleep. A recent change to iproute2 moved the 'pref medium' to the prefix (where it belongs). Change the expected route check to strip 'pref medium' to be compatible with old and new iproute2. Add IPv4 runtime test with an IPv6 address as the gateway in the default route. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/fib_nexthops.sh | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 796670ebc65b..5890ba6d7ef6 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -150,31 +150,31 @@ setup() $IP li add veth1 type veth peer name veth2 $IP li set veth1 up $IP addr add 172.16.1.1/24 dev veth1 - $IP -6 addr add 2001:db8:91::1/64 dev veth1 + $IP -6 addr add 2001:db8:91::1/64 dev veth1 nodad $IP li add veth3 type veth peer name veth4 $IP li set veth3 up $IP addr add 172.16.2.1/24 dev veth3 - $IP -6 addr add 2001:db8:92::1/64 dev veth3 + $IP -6 addr add 2001:db8:92::1/64 dev veth3 nodad $IP li set veth2 netns peer up ip -netns peer addr add 172.16.1.2/24 dev veth2 - ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 + ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 nodad $IP li set veth4 netns peer up ip -netns peer addr add 172.16.2.2/24 dev veth4 - ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 + ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 nodad ip -netns remote li add veth5 type veth peer name veth6 ip -netns remote li set veth5 up ip -netns remote addr add dev veth5 172.16.101.1/24 - ip -netns remote addr add dev veth5 2001:db8:101::1/64 + ip -netns remote -6 addr add dev veth5 2001:db8:101::1/64 nodad ip -netns remote ro add 172.16.0.0/22 via 172.16.101.2 ip -netns remote -6 ro add 2001:db8:90::/40 via 2001:db8:101::2 ip -netns remote li set veth6 netns peer up ip -netns peer addr add dev veth6 172.16.101.2/24 - ip -netns peer addr add dev veth6 2001:db8:101::2/64 + ip -netns peer -6 addr add dev veth6 2001:db8:101::2/64 nodad set +e } @@ -248,7 +248,7 @@ check_route6() local expected="$2" local out - out=$($IP -6 route ls match ${pfx} 2>/dev/null) + out=$($IP -6 route ls match ${pfx} 2>/dev/null | sed 
-e 's/pref medium//') check_output "${out}" "${expected}" } @@ -423,8 +423,6 @@ ipv6_fcnal_runtime() echo "IPv6 functional runtime" echo "-----------------------" - sleep 5 - # # IPv6 - the basics # @@ -481,12 +479,12 @@ ipv6_fcnal_runtime() run_cmd "$IP -6 nexthop add id 85 dev veth1" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 85" log_test $? 0 "IPv6 route with device only nexthop" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024" run_cmd "$IP nexthop add id 123 group 81/85" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 123" log_test $? 0 "IPv6 multipath route with nexthop mix - dev only + gw" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1" # # IPv6 route with v4 nexthop - not allowed @@ -843,6 +841,11 @@ ipv4_fcnal_runtime() $IP neigh sh | grep 'dev veth1' fi + run_cmd "$IP ro del 172.16.101.1/32 via inet6 ${lladdr} dev veth1" + run_cmd "$IP -4 ro add default via inet6 ${lladdr} dev veth1" + run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1" + log_test $? 0 "IPv4 default route with IPv6 gateway" + # # MPLS as an example of LWT encap # -- cgit v1.2.3 From 93e516894752e8b2ae3c2e7671e3ea33e27e3898 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Sun, 19 Apr 2020 11:09:17 +0530 Subject: tools/bpf/bpftool: Remove duplicate headers Code cleanup: Remove duplicate headers which are included twice. 
Signed-off-by: Jagadeesh Pagadala Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/1587274757-14101-1-git-send-email-jagdsh.linux@gmail.com --- tools/bpf/bpftool/btf.c | 1 - tools/bpf/bpftool/gen.c | 1 - tools/bpf/bpftool/jit_disasm.c | 1 - 3 files changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index bcaf55b59498..41a1346934a1 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "json_writer.h" #include "main.h" diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index f8113b3646f5..0e5f0236cc76 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "bpf/libbpf_internal.h" diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index f7f5885aa3ba..e7e7eee9f172 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(), every other type of program (except bpf-lirc and, understandably, tracing) use it, so let's fall back to bpf_base_func_proto for those prog types as well. 
This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 20 +++--------------- net/core/filter.c | 2 +- .../testing/selftests/bpf/verifier/event_output.c | 24 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd2b2322412d..25da6ff2a880 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_event_output_data_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return 
&bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/net/core/filter.c b/net/core/filter.c index 7d6ceaa54d21..a943df3ad8b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4214,7 +4214,7 @@ BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_event_output_data_proto = { +const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, -- cgit v1.2.3 
From ae460c022453337850bdc36a36bf7596a6cfcf99 Mon Sep 17 00:00:00 2001 From: Yoshiki Komachi Date: Tue, 21 Apr 2020 09:05:27 +0900 Subject: bpf_helpers.h: Add note for building with vmlinux.h or linux/types.h The following error was shown when a bpf program was compiled without vmlinux.h auto-generated from BTF: # clang -I./linux/tools/lib/ -I/lib/modules/$(uname -r)/build/include/ \ -O2 -Wall -target bpf -emit-llvm -c bpf_prog.c -o bpf_prog.bc ... In file included from linux/tools/lib/bpf/bpf_helpers.h:5: linux/tools/lib/bpf/bpf_helper_defs.h:56:82: error: unknown type name '__u64' ... It seems that bpf programs are intended for being built together with the vmlinux.h (which will have all the __u64 and other typedefs). But users may mistakenly think "include <linux/types.h>" is missing because the vmlinux.h is not common for non-bpf developers. IMO, an explicit comment therefore should be added to bpf_helpers.h as this patch shows. Signed-off-by: Yoshiki Komachi Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1587427527-29399-1-git-send-email-komachi.yoshiki@gmail.com --- tools/lib/bpf/bpf_helpers.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index f69cc208778a..60aad054eea1 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -2,6 +2,12 @@ #ifndef __BPF_HELPERS__ #define __BPF_HELPERS__ +/* + * Note that bpf programs need to include either + * vmlinux.h (auto-generated from BTF) or linux/types.h + * in advance since bpf_helper_defs.h uses such types + * as __u64.
+ */ #include "bpf_helper_defs.h" #define __uint(name, val) int (*name)[val] -- cgit v1.2.3 From 71d19214776e61b33da48f7c1b46e522c7f78221 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- drivers/media/rc/bpf-lirc.c | 2 ++ include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 ++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 13 ++++++++++++- 7 files changed, 44 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 0f3417d161b8..069c42f22a8c 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -103,6 +103,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_prandom_u32: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5147e11e53ff..10960cfabea4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1509,6 +1509,7 @@ extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct 
bpf_func_proto bpf_ktime_get_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0cc91805069a..6aa11de67315 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9a6b23387d02..5c0290e0696e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -615,6 +627,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..e875c95d3ced 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,6 +797,8 @@ 
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 6f8a57ccf8511724e6f48d732cb2940889789ab2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Apr 2020 12:58:50 -0700 Subject: bpf: Make verifier log more relevant by default To make BPF verifier verbose log more releavant and easier to use to debug verification failures, "pop" parts of log that were successfully verified. 
This has the effect of leaving only verifier logs that correspond to code branches that lead to verification failure, which in practice should result in much shorter and more relevant verifier log dumps. This behavior is made the default behavior and can be overridden to do exhaustive logging by specifying BPF_LOG_LEVEL2 log level. Using BPF_LOG_LEVEL2 to disable this behavior is not ideal, because in some cases it's good to have BPF_LOG_LEVEL2 per-instruction register dump verbosity, but still have only relevant verifier branches logged. But for this patch, I didn't want to add any new flags. It might be worthwhile to just rethink how BPF verifier logging is performed and requested and streamline it a bit. But this trimming of successfully verified branches seems to be useful and a good default behavior. To test this, I modified runqslower slightly to introduce a read of an uninitialized stack variable. Log (**truncated in the middle** to save many lines out of this commit message) BEFORE this change: ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) [ ... instruction dump from insn #7 through #50 are cut out ... 
] 51: (b7) r2 = 16 52: (85) call bpf_get_current_comm#16 last_idx 52 first_idx 42 regs=4 stack=0 before 51: (b7) r2 = 16 ; bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, 53: (bf) r1 = r6 54: (18) r2 = 0xffff8881f3868800 56: (18) r3 = 0xffffffff 58: (bf) r4 = r7 59: (b7) r5 = 32 60: (85) call bpf_perf_event_output#25 last_idx 60 first_idx 53 regs=20 stack=0 before 59: (b7) r5 = 32 61: (bf) r2 = r10 ; event.pid = pid; 62: (07) r2 += -16 ; bpf_map_delete_elem(&start, &pid); 63: (18) r1 = 0xffff8881f3868000 65: (85) call bpf_map_delete_elem#3 ; } 66: (b7) r0 = 0 67: (95) exit from 44 to 66: safe from 34 to 66: safe from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881f3868000 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how there is a successful code path from instruction 0 through 67, few successfully verified jumps (44->66, 34->66), and only after that 11->28 jump plus error on instruction #32. 
AFTER this change (full verifier log, **no truncation**): ; int handle__sched_switch(u64 *ctx) 0: (bf) r6 = r1 ; struct task_struct *prev = (struct task_struct *)ctx[1]; 1: (79) r1 = *(u64 *)(r6 +8) func 'sched_switch' arg1 has btf_id 151 type STRUCT 'task_struct' 2: (b7) r2 = 0 ; struct event event = {}; 3: (7b) *(u64 *)(r10 -24) = r2 last_idx 3 first_idx 0 regs=4 stack=0 before 2: (b7) r2 = 0 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 ; if (prev->state == TASK_RUNNING) 7: (79) r2 = *(u64 *)(r1 +16) ; if (prev->state == TASK_RUNNING) 8: (55) if r2 != 0x0 goto pc+19 R1_w=ptr_task_struct(id=0,off=0,imm=0) R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; trace_enqueue(prev->tgid, prev->pid); 9: (61) r1 = *(u32 *)(r1 +1184) 10: (63) *(u32 *)(r10 -4) = r1 ; if (!pid || (targ_pid && targ_pid != pid)) 11: (15) if r1 == 0x0 goto pc+16 from 11 to 28: R1_w=inv0 R2_w=inv0 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 fp-8=mmmm???? fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 ; bpf_map_update_elem(&start, &pid, &ts, 0); 28: (bf) r2 = r10 ; 29: (07) r2 += -16 ; tsp = bpf_map_lookup_elem(&start, &pid); 30: (18) r1 = 0xffff8881db3ce800 32: (85) call bpf_map_lookup_elem#1 invalid indirect read from stack off -16+0 size 4 processed 65 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 4 Notice how in this case, there are 0-11 instructions + jump from 11 to 28 is recorded + 28-32 instructions with error on insn #32. test_verifier test runner was updated to specify BPF_LOG_LEVEL2 for VERBOSE_ACCEPT expected result due to potentially "incomplete" success verbose log at BPF_LOG_LEVEL1. On success, verbose log will only have a summary of number of processed instructions, etc, but no branch tracing log. Having just a last successful branch tracing seemed weird and confusing. 
Having small and clean summary log in success case seems quite logical and nice, though. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200423195850.1259827-1-andriin@fb.com --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++---- tools/testing/selftests/bpf/test_verifier.c | 7 ++++++- 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ba8bf92ca9..91728e0f27eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,6 +168,8 @@ struct bpf_verifier_stack_elem { int insn_idx; int prev_insn_idx; struct bpf_verifier_stack_elem *next; + /* length of verifier log at the time this state was pushed on stack */ + u32 log_pos; }; #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 @@ -283,6 +285,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, log->ubuf = NULL; } +static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) +{ + char zero = 0; + + if (!bpf_verifier_log_needed(log)) + return; + + log->len_used = new_pos; + if (put_user(zero, log->ubuf + new_pos)) + log->ubuf = NULL; +} + /* log_level controls verbosity level of eBPF verifier. 
* bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program @@ -846,7 +860,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, - int *insn_idx) + int *insn_idx, bool pop_log) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem, *head = env->head; @@ -860,6 +874,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (err) return err; } + if (pop_log) + bpf_vlog_reset(&env->log, head->log_pos); if (insn_idx) *insn_idx = head->insn_idx; if (prev_insn_idx) @@ -887,6 +903,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->log_pos = env->log.len_used; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -915,7 +932,7 @@ err: free_verifier_state(env->cur_state, true); env->cur_state = NULL; /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); return NULL; } @@ -8407,6 +8424,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { + bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; @@ -8683,7 +8701,7 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, - &env->insn_idx); + &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; @@ -10206,6 +10224,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { + bool pop_log = 
!(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -10268,7 +10287,9 @@ out: free_verifier_state(env->cur_state, true); env->cur_state = NULL; } - while (!pop_stack(env, NULL, NULL)); + while (!pop_stack(env, NULL, NULL, false)); + if (!ret && pop_log) + bpf_vlog_reset(&env->log, 0); free_states(env); if (ret) /* clean aux data in case subprog was rejected */ diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 87eaa49609a0..ad6939c67c5e 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -943,7 +943,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv, attr.insns = prog; attr.insns_cnt = prog_len; attr.license = "GPL"; - attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4; + if (verbose) + attr.log_level = 1; + else if (expected_ret == VERBOSE_ACCEPT) + attr.log_level = 2; + else + attr.log_level = 4; attr.prog_flags = pflags; fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog)); -- cgit v1.2.3 From 234589012ba0e5bf448e3fdbbac0f4c265dbdd7b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 24 Apr 2020 19:55:55 +0100 Subject: selftests/bpf: Add cls_redirect classifier cls_redirect is a TC clsact based replacement for the glb-redirect iptables module available at [1]. It enables what GitHub calls "second chance" flows [2], similarly proposed by the Beamer paper [3]. In contrast to glb-redirect, it also supports migrating UDP flows as long as connected sockets are used. cls_redirect is in production at Cloudflare, as part of our own L4 load balancer. We have modified the encapsulation format slightly from glb-redirect: glbgue_chained_routing.private_data_type has been repurposed to form a version field and several flags. Both have been arranged in a way that a private_data_type value of zero matches the current glb-redirect behaviour. 
This means that cls_redirect will understand packets in glb-redirect format, but not vice versa. The test suite only covers basic features. For example, cls_redirect will correctly forward path MTU discovery packets, but this is not exercised. It is also possible to switch the encapsulation format to GRE on the last hop, which is also not tested. There are two major distinctions from glb-redirect: first, cls_redirect relies on receiving encapsulated packets directly from a router. This is because we don't have access to the neighbour tables from BPF, yet. See forward_to_next_hop for details. Second, cls_redirect performs decapsulation instead of using separate ipip and sit tunnel devices. This avoids issues with the sit tunnel [4] and makes deploying the classifier easier: decapsulated packets appear on the same interface, so existing firewall rules continue to work as expected. The code base started its life on v4.19, so there are most likely still holdovers from old workarounds. In no particular order: - The function buf_off is required to defeat a clang optimization that leads to the verifier rejecting the program due to pointer arithmetic in the wrong order. - The function pkt_parse_ipv6 is force inlined, because it would otherwise be rejected due to returning a pointer to stack memory. - The functions fill_tuple and classify_tcp contain kludges, because we've run out of function arguments. - The logic in general is rather nested, due to verifier restrictions. I think this is either because the verifier loses track of constants on the stack, or because it can't track enum-like variables. 
1: https://github.com/github/glb-director/tree/master/src/glb-redirect 2: https://github.com/github/glb-director/blob/master/docs/development/second-chance-design.md 3: https://www.usenix.org/conference/nsdi18/presentation/olteanu 4: https://github.com/github/glb-director/issues/64 Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200424185556.7358-2-lmb@cloudflare.com --- .../selftests/bpf/prog_tests/cls_redirect.c | 456 +++++++++ .../selftests/bpf/progs/test_cls_redirect.c | 1058 ++++++++++++++++++++ .../selftests/bpf/progs/test_cls_redirect.h | 54 + tools/testing/selftests/bpf/test_progs.h | 7 + 4 files changed, 1575 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cls_redirect.c create mode 100644 tools/testing/selftests/bpf/progs/test_cls_redirect.c create mode 100644 tools/testing/selftests/bpf/progs/test_cls_redirect.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c new file mode 100644 index 000000000000..f259085cca6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2020 Cloudflare + +#define _GNU_SOURCE + +#include +#include + +#include + +#include + +#include "progs/test_cls_redirect.h" +#include "test_cls_redirect.skel.h" + +#define ENCAP_IP INADDR_LOOPBACK +#define ENCAP_PORT (1234) + +struct addr_port { + in_port_t port; + union { + struct in_addr in_addr; + struct in6_addr in6_addr; + }; +}; + +struct tuple { + int family; + struct addr_port src; + struct addr_port dst; +}; + +static int start_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(bind(fd, addr, len) == -1)) + goto err; + if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) + goto err; + + 
return fd; + +err: + close(fd); + return -1; +} + +static int connect_to_server(const struct sockaddr *addr, socklen_t len, + int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(connect(fd, addr, len))) + goto err; + + return fd; + +err: + close(fd); + return -1; +} + +static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) +{ + const struct sockaddr_in6 *in6; + const struct sockaddr_in *in; + + switch (sa->sa_family) { + case AF_INET: + in = (const struct sockaddr_in *)sa; + ap->in_addr = in->sin_addr; + ap->port = in->sin_port; + return true; + + case AF_INET6: + in6 = (const struct sockaddr_in6 *)sa; + ap->in6_addr = in6->sin6_addr; + ap->port = in6->sin6_port; + return true; + + default: + return false; + } +} + +static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, + int *server, int *conn, struct tuple *tuple) +{ + struct sockaddr_storage ss; + socklen_t slen = sizeof(ss); + struct sockaddr *sa = (struct sockaddr *)&ss; + + *server = start_server(addr, len, type); + if (*server < 0) + return false; + + if (CHECK_FAIL(getsockname(*server, sa, &slen))) + goto close_server; + + *conn = connect_to_server(sa, slen, type); + if (*conn < 0) + goto close_server; + + /* We want to simulate packets arriving at conn, so we have to + * swap src and dst. 
+ */ + slen = sizeof(ss); + if (CHECK_FAIL(getsockname(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst))) + goto close_conn; + + slen = sizeof(ss); + if (CHECK_FAIL(getpeername(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src))) + goto close_conn; + + tuple->family = ss.ss_family; + return true; + +close_conn: + close(*conn); + *conn = -1; +close_server: + close(*server); + *server = -1; + return false; +} + +static socklen_t prepare_addr(struct sockaddr_storage *addr, int family) +{ + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = family; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + return sizeof(*addr4); + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = family; + addr6->sin6_addr = in6addr_loopback; + return sizeof(*addr6); + default: + fprintf(stderr, "Invalid family %d", family); + return 0; + } +} + +static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr) +{ + return tattr->data_size_out < tattr->data_size_in; +} + +enum type { + UDP, + TCP, + __NR_KIND, +}; + +enum hops { + NO_HOPS, + ONE_HOP, +}; + +enum flags { + NONE, + SYN, + ACK, +}; + +enum conn { + KNOWN_CONN, + UNKNOWN_CONN, +}; + +enum result { + ACCEPT, + FORWARD, +}; + +struct test_cfg { + enum type type; + enum result result; + enum conn conn; + enum hops hops; + enum flags flags; +}; + +static int test_str(void *buf, size_t len, const struct test_cfg *test, + int family) +{ + const char *family_str, *type, *conn, *hops, *result, *flags; + + family_str = "IPv4"; + if (family == AF_INET6) + family_str = "IPv6"; + + type = "TCP"; + if (test->type == UDP) + type = "UDP"; + + conn = "known"; + if (test->conn == UNKNOWN_CONN) + conn = "unknown"; + + hops = "no hops"; + if (test->hops == ONE_HOP) + 
hops = "one hop"; + + result = "accept"; + if (test->result == FORWARD) + result = "forward"; + + flags = "none"; + if (test->flags == SYN) + flags = "SYN"; + else if (test->flags == ACK) + flags = "ACK"; + + return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str, + type, result, conn, hops, flags); +} + +static struct test_cfg tests[] = { + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN }, + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK }, + { TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK }, + { TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK }, + { UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE }, + { UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE }, + { UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE }, +}; + +static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto) +{ + const uint8_t hlen = + (sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count; + *encap = (encap_headers_t){ + .eth = { .h_proto = htons(ETH_P_IP) }, + .ip = { + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = IPPROTO_UDP, + .daddr = htonl(ENCAP_IP) + }, + .udp = { + .dest = htons(ENCAP_PORT), + }, + .gue = { + .hlen = hlen, + .proto_ctype = proto + }, + .unigue = { + .hop_count = hop_count + }, + }; +} + +static size_t build_input(const struct test_cfg *test, void *const buf, + const struct tuple *tuple) +{ + in_port_t sport = tuple->src.port; + encap_headers_t encap; + struct iphdr ip; + struct ipv6hdr ipv6; + struct tcphdr tcp; + struct udphdr udp; + struct in_addr next_hop; + uint8_t *p = buf; + int proto; + + proto = IPPROTO_IPIP; + if (tuple->family == AF_INET6) + proto = IPPROTO_IPV6; + + encap_init(&encap, test->hops == ONE_HOP ? 
1 : 0, proto); + p = mempcpy(p, &encap, sizeof(encap)); + + if (test->hops == ONE_HOP) { + next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) }; + p = mempcpy(p, &next_hop, sizeof(next_hop)); + } + + proto = IPPROTO_TCP; + if (test->type == UDP) + proto = IPPROTO_UDP; + + switch (tuple->family) { + case AF_INET: + ip = (struct iphdr){ + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = proto, + .saddr = tuple->src.in_addr.s_addr, + .daddr = tuple->dst.in_addr.s_addr, + }; + p = mempcpy(p, &ip, sizeof(ip)); + break; + case AF_INET6: + ipv6 = (struct ipv6hdr){ + .version = 6, + .hop_limit = IPDEFTTL, + .nexthdr = proto, + .saddr = tuple->src.in6_addr, + .daddr = tuple->dst.in6_addr, + }; + p = mempcpy(p, &ipv6, sizeof(ipv6)); + break; + default: + return 0; + } + + if (test->conn == UNKNOWN_CONN) + sport--; + + switch (test->type) { + case TCP: + tcp = (struct tcphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + if (test->flags == SYN) + tcp.syn = true; + if (test->flags == ACK) + tcp.ack = true; + p = mempcpy(p, &tcp, sizeof(tcp)); + break; + case UDP: + udp = (struct udphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + p = mempcpy(p, &udp, sizeof(udp)); + break; + default: + return 0; + } + + return (void *)p - buf; +} + +static void close_fds(int *fds, int n) +{ + int i; + + for (i = 0; i < n; i++) + if (fds[i] > 0) + close(fds[i]); +} + +void test_cls_redirect(void) +{ + struct test_cls_redirect *skel = NULL; + struct bpf_prog_test_run_attr tattr = {}; + int families[] = { AF_INET, AF_INET6 }; + struct sockaddr_storage ss; + struct sockaddr *addr; + socklen_t slen; + int i, j, err; + + int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; + int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; + struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; + + skel = test_cls_redirect__open(); + if (CHECK_FAIL(!skel)) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + if 
(CHECK_FAIL(test_cls_redirect__load(skel))) + goto cleanup; + + addr = (struct sockaddr *)&ss; + for (i = 0; i < ARRAY_SIZE(families); i++) { + slen = prepare_addr(&ss, families[i]); + if (CHECK_FAIL(!slen)) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM, + &servers[UDP][i], &conns[UDP][i], + &tuples[UDP][i]))) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM, + &servers[TCP][i], &conns[TCP][i], + &tuples[TCP][i]))) + goto cleanup; + } + + tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct test_cfg *test = &tests[i]; + + for (j = 0; j < ARRAY_SIZE(families); j++) { + struct tuple *tuple = &tuples[test->type][j]; + char input[256]; + char tmp[256]; + + test_str(tmp, sizeof(tmp), test, tuple->family); + if (!test__start_subtest(tmp)) + continue; + + tattr.data_out = tmp; + tattr.data_size_out = sizeof(tmp); + + tattr.data_in = input; + tattr.data_size_in = build_input(test, input, tuple); + if (CHECK_FAIL(!tattr.data_size_in)) + continue; + + err = bpf_prog_test_run_xattr(&tattr); + if (CHECK_FAIL(err)) + continue; + + if (tattr.retval != TC_ACT_REDIRECT) { + PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n", + tattr.retval); + continue; + } + + switch (test->result) { + case ACCEPT: + if (CHECK_FAIL(!was_decapsulated(&tattr))) + continue; + break; + case FORWARD: + if (CHECK_FAIL(was_decapsulated(&tattr))) + continue; + break; + default: + PRINT_FAIL("unknown result %d\n", test->result); + continue; + } + } + } + +cleanup: + test_cls_redirect__destroy(skel); + close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0])); + close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c new file mode 100644 index 000000000000..1668b993eb86 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -0,0 +1,1058 @@ +// 
SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2019, 2020 Cloudflare + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "test_cls_redirect.h" + +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) + +#define IP_OFFSET_MASK (0x1FFF) +#define IP_MF (0x2000) + +char _license[] SEC("license") = "Dual BSD/GPL"; + +/** + * Destination port and IP used for UDP encapsulation. + */ +static volatile const __be16 ENCAPSULATION_PORT; +static volatile const __be32 ENCAPSULATION_IP; + +typedef struct { + uint64_t processed_packets_total; + uint64_t l3_protocol_packets_total_ipv4; + uint64_t l3_protocol_packets_total_ipv6; + uint64_t l4_protocol_packets_total_tcp; + uint64_t l4_protocol_packets_total_udp; + uint64_t accepted_packets_total_syn; + uint64_t accepted_packets_total_syn_cookies; + uint64_t accepted_packets_total_last_hop; + uint64_t accepted_packets_total_icmp_echo_request; + uint64_t accepted_packets_total_established; + uint64_t forwarded_packets_total_gue; + uint64_t forwarded_packets_total_gre; + + uint64_t errors_total_unknown_l3_proto; + uint64_t errors_total_unknown_l4_proto; + uint64_t errors_total_malformed_ip; + uint64_t errors_total_fragmented_ip; + uint64_t errors_total_malformed_icmp; + uint64_t errors_total_unwanted_icmp; + uint64_t errors_total_malformed_icmp_pkt_too_big; + uint64_t errors_total_malformed_tcp; + uint64_t errors_total_malformed_udp; + uint64_t errors_total_icmp_echo_replies; + uint64_t errors_total_malformed_encapsulation; + uint64_t errors_total_encap_adjust_failed; + uint64_t errors_total_encap_buffer_too_small; + uint64_t errors_total_redirect_loop; +} metrics_t; + +typedef enum { + INVALID = 0, + UNKNOWN, + ECHO_REQUEST, + SYN, + SYN_COOKIE, + ESTABLISHED, +} verdict_t; + +typedef struct { + uint16_t src, dst; +} flow_ports_t; + +_Static_assert( + 
sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv4.dport) - + offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); +_Static_assert( + sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv6.dport) - + offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); + +typedef int ret_t; + +/* This is a bit of a hack. We need a return value which allows us to + * indicate that the regular flow of the program should continue, + * while allowing functions to use XDP_PASS and XDP_DROP, etc. + */ +static const ret_t CONTINUE_PROCESSING = -1; + +/* Convenience macro to call functions which return ret_t. + */ +#define MAYBE_RETURN(x) \ + do { \ + ret_t __ret = x; \ + if (__ret != CONTINUE_PROCESSING) \ + return __ret; \ + } while (0) + +/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes), + * or not aligned if the arch supports efficient unaligned access. + * + * Since the verifier ensures that eBPF packet accesses follow these rules, + * we can tell LLVM to emit code as if we always had a larger alignment. + * It will yell at us if we end up on a platform where this is not valid. + */ +typedef uint8_t *net_ptr __attribute__((align_value(8))); + +typedef struct buf { + struct __sk_buff *skb; + net_ptr head; + /* NB: tail musn't have alignment other than 1, otherwise + * LLVM will go and eliminate code, e.g. when checking packet lengths. + */ + uint8_t *const tail; +} buf_t; + +static size_t buf_off(const buf_t *buf) +{ + /* Clang seems to optimize constructs like + * a - b + c + * if c is known: + * r? = c + * r? -= b + * r? += a + * + * This is a problem if a and b are packet pointers, + * since the verifier allows subtracting two pointers to + * get a scalar, but not a scalar and a pointer. + * + * Use inline asm to break this optimization. 
+ */ + size_t off = (size_t)buf->head; + asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data)); + return off; +} + +static bool buf_copy(buf_t *buf, void *dst, size_t len) +{ + if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { + return false; + } + + buf->head += len; + return true; +} + +static bool buf_skip(buf_t *buf, const size_t len) +{ + /* Check whether off + len is valid in the non-linear part. */ + if (buf_off(buf) + len > buf->skb->len) { + return false; + } + + buf->head += len; + return true; +} + +/* Returns a pointer to the start of buf, or NULL if len is + * larger than the remaining data. Consumes len bytes on a successful + * call. + * + * If scratch is not NULL, the function will attempt to load non-linear + * data via bpf_skb_load_bytes. On success, scratch is returned. + */ +static void *buf_assign(buf_t *buf, const size_t len, void *scratch) +{ + if (buf->head + len > buf->tail) { + if (scratch == NULL) { + return NULL; + } + + return buf_copy(buf, scratch, len) ? scratch : NULL; + } + + void *ptr = buf->head; + buf->head += len; + return ptr; +} + +static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) +{ + if (ipv4->ihl <= 5) { + return true; + } + + return buf_skip(buf, (ipv4->ihl - 5) * 4); +} + +static bool ipv4_is_fragment(const struct iphdr *ip) +{ + uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); + return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; +} + +static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) +{ + struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); + if (ipv4 == NULL) { + return NULL; + } + + if (ipv4->ihl < 5) { + return NULL; + } + + if (!pkt_skip_ipv4_options(pkt, ipv4)) { + return NULL; + } + + return ipv4; +} + +/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. 
*/ +static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) +{ + if (!buf_copy(pkt, ports, sizeof(*ports))) { + return false; + } + + /* Ports in the L4 headers are reversed, since we are parsing an ICMP + * payload which is going towards the eyeball. + */ + uint16_t dst = ports->src; + ports->src = ports->dst; + ports->dst = dst; + return true; +} + +static uint16_t pkt_checksum_fold(uint32_t csum) +{ + /* The highest reasonable value for an IPv4 header + * checksum requires two folds, so we just do that always. + */ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (uint16_t)~csum; +} + +static void pkt_ipv4_checksum(struct iphdr *iph) +{ + iph->check = 0; + + /* An IP header without options is 20 bytes. Two of those + * are the checksum, which we always set to zero. Hence, + * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, + * which fits in 32 bit. + */ + _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes"); + uint32_t acc = 0; + uint16_t *ipw = (uint16_t *)iph; + +#pragma clang loop unroll(full) + for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { + acc += ipw[i]; + } + + iph->check = pkt_checksum_fold(acc); +} + +static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, + const struct ipv6hdr *ipv6, + uint8_t *upper_proto, + bool *is_fragment) +{ + /* We understand five extension headers. + * https://tools.ietf.org/html/rfc8200#section-4.1 states that all + * headers should occur once, except Destination Options, which may + * occur twice. Hence we give up after 6 headers. + */ + struct { + uint8_t next; + uint8_t len; + } exthdr = { + .next = ipv6->nexthdr, + }; + *is_fragment = false; + +#pragma clang loop unroll(full) + for (int i = 0; i < 6; i++) { + switch (exthdr.next) { + case IPPROTO_FRAGMENT: + *is_fragment = true; + /* NB: We don't check that hdrlen == 0 as per spec. 
*/ + /* fallthrough; */ + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + case IPPROTO_MH: + if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) { + return false; + } + + /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ + if (!buf_skip(pkt, + (exthdr.len + 1) * 8 - sizeof(exthdr))) { + return false; + } + + /* Decode next header */ + break; + + default: + /* The next header is not one of the known extension + * headers, treat it as the upper layer header. + * + * This handles IPPROTO_NONE. + * + * Encapsulating Security Payload (50) and Authentication + * Header (51) also end up here (and will trigger an + * unknown proto error later). They have a custom header + * format and seem too esoteric to care about. + */ + *upper_proto = exthdr.next; + return true; + } + } + + /* We never found an upper layer header. */ + return false; +} + +/* This function has to be inlined, because the verifier otherwise rejects it + * due to returning a pointer to the stack. This is technically correct, since + * scratch is allocated on the stack. However, this usage should be safe since + * it's the caller's stack after all.
+ */ +static inline __attribute__((__always_inline__)) struct ipv6hdr * +pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, + bool *is_fragment) +{ + struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch); + if (ipv6 == NULL) { + return NULL; + } + + if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) { + return NULL; + } + + return ipv6; +} + +/* Global metrics, per CPU + */ +struct bpf_map_def metrics_map SEC("maps") = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(unsigned int), + .value_size = sizeof(metrics_t), + .max_entries = 1, +}; + +static metrics_t *get_global_metrics(void) +{ + unsigned int key = 0; /* same type as metrics_map.key_size; the map has a single slot */ + return bpf_map_lookup_elem(&metrics_map, &key); +} + +static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) +{ + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = payload_off - sizeof(struct ethhdr); + + // Changing the ethertype if the encapsulated packet is ipv6 + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + encap->eth.h_proto = bpf_htons(ETH_P_IPV6); + } + + if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_FIXED_GSO)) { + return TC_ACT_SHOT; + } + + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); +} + +static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + metrics->forwarded_packets_total_gre++; + + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = + payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); + int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; + uint16_t proto = ETH_P_IP; + + /* Loop protection: the inner packet's TTL is decremented as a safeguard + * against any forwarding loop.
As the only interesting field is the TTL + * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes + * as they handle the split packets if needed (no need for the data to be + * in the linear section). + */ + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + proto = ETH_P_IPV6; + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1, 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } else { + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, + 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + /* IPv4 also has a checksum to patch. While the TTL is only one byte, + * this function only works for 2 and 4 bytes arguments (the result is + * the same). 
+ */ + rc = bpf_l3_csum_replace( + skb, payload_off + offsetof(struct iphdr, check), ttl, + ttl - 1, 2); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, + 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } + + if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, + BPF_F_ADJ_ROOM_FIXED_GSO)) { + metrics->errors_total_encap_adjust_failed++; + return TC_ACT_SHOT; + } + + if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL); + if (encap_gre == NULL) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + encap_gre->ip.protocol = IPPROTO_GRE; + encap_gre->ip.daddr = next_hop->s_addr; + encap_gre->ip.saddr = ENCAPSULATION_IP; + encap_gre->ip.tot_len = + bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); + encap_gre->gre.flags = 0; + encap_gre->gre.protocol = bpf_htons(proto); + pkt_ipv4_checksum((void *)&encap_gre->ip); + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + /* swap L2 addresses */ + /* This assumes that packets are received from a router. + * So just swapping the MAC addresses here will make the packet go back to + * the router, which will send it to the appropriate machine. 
+ */ + unsigned char temp[ETH_ALEN]; + memcpy(temp, encap->eth.h_dest, sizeof(temp)); + memcpy(encap->eth.h_dest, encap->eth.h_source, + sizeof(encap->eth.h_dest)); + memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); + + if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && + encap->unigue.last_hop_gre) { + return forward_with_gre(skb, encap, next_hop, metrics); + } + + metrics->forwarded_packets_total_gue++; + uint32_t old_saddr = encap->ip.saddr; + encap->ip.saddr = encap->ip.daddr; + encap->ip.daddr = next_hop->s_addr; + if (encap->unigue.next_hop < encap->unigue.hop_count) { + encap->unigue.next_hop++; + } + + /* Remove ip->saddr, add next_hop->s_addr */ + const uint64_t off = offsetof(typeof(*encap), ip.check); + int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); + if (ret < 0) { + return TC_ACT_SHOT; + } + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t skip_next_hops(buf_t *pkt, int n) +{ + switch (n) { + case 1: + if (!buf_skip(pkt, sizeof(struct in_addr))) + return TC_ACT_SHOT; + case 0: /* also reached by intentional fallthrough from case 1 */ + return CONTINUE_PROCESSING; + + default: + return TC_ACT_SHOT; + } +} + +/* Get the next hop from the GLB header. + * + * Sets next_hop->s_addr to 0 if there are no more hops left. + * pkt is positioned just after the variable length GLB header + * iff the call is successful. + */ +static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, + struct in_addr *next_hop) +{ + if (encap->unigue.next_hop > encap->unigue.hop_count) { + return TC_ACT_SHOT; + } + + /* Skip "used" next hops. */ + MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop)); + + if (encap->unigue.next_hop == encap->unigue.hop_count) { + /* No more next hops, we are at the end of the GLB header. */ + next_hop->s_addr = 0; + return CONTINUE_PROCESSING; + } + + if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) { + return TC_ACT_SHOT; + } + + /* Skip the remaining next hops (may be zero).
*/ + return skip_next_hops(pkt, encap->unigue.hop_count - + encap->unigue.next_hop - 1); +} + +/* Fill a bpf_sock_tuple to be used with the socket lookup functions. + * This is a kludge that lets us work around verifier limitations: + * + * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) + * + * clang will substitute a constant for sizeof, which allows the verifier + * to track its value. Based on this, it can figure out the constant + * return value, and calling code works while still being "generic" to + * IPv4 and IPv6. + */ +static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, + uint64_t iphlen, uint16_t sport, uint16_t dport) +{ + switch (iphlen) { + case sizeof(struct iphdr): { + struct iphdr *ipv4 = (struct iphdr *)iph; + tuple->ipv4.daddr = ipv4->daddr; + tuple->ipv4.saddr = ipv4->saddr; + tuple->ipv4.sport = sport; + tuple->ipv4.dport = dport; + return sizeof(tuple->ipv4); + } + + case sizeof(struct ipv6hdr): { + struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; + memcpy(&tuple->ipv6.daddr, &ipv6->daddr, + sizeof(tuple->ipv6.daddr)); + memcpy(&tuple->ipv6.saddr, &ipv6->saddr, + sizeof(tuple->ipv6.saddr)); + tuple->ipv6.sport = sport; + tuple->ipv6.dport = dport; + return sizeof(tuple->ipv6); + } + + default: + return 0; + } +} + +static verdict_t classify_tcp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + void *iph, struct tcphdr *tcp) +{ + struct bpf_sock *sk = + bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state != BPF_TCP_LISTEN) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + if (iph != NULL && tcp != NULL) { + /* Kludge: we've run out of arguments, but need the length of the ip header.
*/ + uint64_t iphlen = sizeof(struct iphdr); + if (tuplen == sizeof(tuple->ipv6)) { + iphlen = sizeof(struct ipv6hdr); + } + + if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, + sizeof(*tcp)) == 0) { + bpf_sk_release(sk); + return SYN_COOKIE; + } + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_udp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen) +{ + struct bpf_sock *sk = + bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state == BPF_TCP_ESTABLISHED) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + metrics_t *metrics) +{ + switch (proto) { + case IPPROTO_TCP: + return classify_tcp(skb, tuple, tuplen, NULL, NULL); + + case IPPROTO_UDP: + return classify_udp(skb, tuple, tuplen); + + default: + metrics->errors_total_malformed_icmp++; + return INVALID; + } +} + +static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) +{ + struct icmphdr icmp; + if (!buf_copy(pkt, &icmp, sizeof(icmp))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp.type == ICMP_ECHOREPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp.type == ICMP_ECHO) { + return ECHO_REQUEST; + } + + if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + struct iphdr _ip4; + const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + /* The source address in the outer IP header is from the entity that + * originated the ICMP message. Use the original IP header to restore + * the correct flow tuple. 
+ */ + struct bpf_sock_tuple tuple; + tuple.ipv4.saddr = ipv4->daddr; + tuple.ipv4.daddr = ipv4->saddr; + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, ipv4->protocol, &tuple, + sizeof(tuple.ipv4), metrics); +} + +static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) +{ + struct icmp6hdr icmp6; + if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { + return ECHO_REQUEST; + } + + if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + bool is_fragment; + uint8_t l4_proto; + struct ipv6hdr _ipv6; + const struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + /* Swap source and dest addresses. 
*/ + struct bpf_sock_tuple tuple; + memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr)); + memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr)); + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6), + metrics); +} + +static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_tcp++; + + struct tcphdr _tcp; + struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp); + if (tcp == NULL) { + metrics->errors_total_malformed_tcp++; + return INVALID; + } + + if (tcp->syn) { + return SYN; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest); + return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); +} + +static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_udp++; + + struct udphdr _udp; + struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp); + if (udph == NULL) { + metrics->errors_total_malformed_udp++; + return INVALID; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest); + return classify_udp(pkt->skb, &tuple, tuplen); +} + +static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv4++; + + struct iphdr _ip4; + struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4->version != 4) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4_is_fragment(ipv4)) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (ipv4->protocol) { + case IPPROTO_ICMP: + return process_icmpv4(pkt, metrics); + + case IPPROTO_TCP: + return 
process_tcp(pkt, ipv4, sizeof(*ipv4), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv4, sizeof(*ipv4), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv6++; + + uint8_t l4_proto; + bool is_fragment; + struct ipv6hdr _ipv6; + struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv6->version != 6) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (l4_proto) { + case IPPROTO_ICMPV6: + return process_icmpv6(pkt, metrics); + + case IPPROTO_TCP: + return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv6, sizeof(*ipv6), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +SEC("classifier/cls_redirect") +int cls_redirect(struct __sk_buff *skb) +{ + metrics_t *metrics = get_global_metrics(); + if (metrics == NULL) { + return TC_ACT_SHOT; + } + + metrics->processed_packets_total++; + + /* Pass bogus packets as long as we're not sure they're + * destined for us. + */ + if (skb->protocol != bpf_htons(ETH_P_IP)) { + return TC_ACT_OK; + } + + encap_headers_t *encap; + + /* Make sure that all encapsulation headers are available in + * the linear portion of the skb. This makes it easy to manipulate them. + */ + if (bpf_skb_pull_data(skb, sizeof(*encap))) { + return TC_ACT_OK; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap = buf_assign(&pkt, sizeof(*encap), NULL); + if (encap == NULL) { + return TC_ACT_OK; + } + + if (encap->ip.ihl != 5) { + /* We never have any options. 
*/ + return TC_ACT_OK; + } + + if (encap->ip.daddr != ENCAPSULATION_IP || + encap->ip.protocol != IPPROTO_UDP) { + return TC_ACT_OK; + } + + /* TODO Check UDP length? */ + if (encap->udp.dest != ENCAPSULATION_PORT) { + return TC_ACT_OK; + } + + /* We now know that the packet is destined to us, we can + * drop bogus ones. + */ + if (ipv4_is_fragment((void *)&encap->ip)) { + metrics->errors_total_fragmented_ip++; + return TC_ACT_SHOT; + } + + if (encap->gue.variant != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.control != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.flags != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.hlen != + sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.version != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.reserved != 0) { + return TC_ACT_SHOT; + } + + struct in_addr next_hop; + MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop)); + + if (next_hop.s_addr == 0) { + metrics->accepted_packets_total_last_hop++; + return accept_locally(skb, encap); + } + + verdict_t verdict; + switch (encap->gue.proto_ctype) { + case IPPROTO_IPIP: + verdict = process_ipv4(&pkt, metrics); + break; + + case IPPROTO_IPV6: + verdict = process_ipv6(&pkt, metrics); + break; + + default: + metrics->errors_total_unknown_l3_proto++; + return TC_ACT_SHOT; + } + + switch (verdict) { + case INVALID: + /* metrics have already been bumped */ + return TC_ACT_SHOT; + + case UNKNOWN: + return forward_to_next_hop(skb, encap, &next_hop, metrics); + + case ECHO_REQUEST: + metrics->accepted_packets_total_icmp_echo_request++; + break; + + case SYN: + if (encap->unigue.forward_syn) { + return forward_to_next_hop(skb, encap, &next_hop, + metrics); + } + + 
metrics->accepted_packets_total_syn++; + break; + + case SYN_COOKIE: + metrics->accepted_packets_total_syn_cookies++; + break; + + case ESTABLISHED: + metrics->accepted_packets_total_established++; + break; + } + + return accept_locally(skb, encap); +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.h b/tools/testing/selftests/bpf/progs/test_cls_redirect.h new file mode 100644 index 000000000000..76eab0aacba0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright 2019, 2020 Cloudflare */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct gre_base_hdr { + uint16_t flags; + uint16_t protocol; +} __attribute__((packed)); + +struct guehdr { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t hlen : 5, control : 1, variant : 2; +#else + uint8_t variant : 2, control : 1, hlen : 5; +#endif + uint8_t proto_ctype; + uint16_t flags; +}; + +struct unigue { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4; +#else + uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2; +#endif + uint8_t reserved; + uint8_t next_hop; + uint8_t hop_count; + // Next hops go here +} __attribute__((packed)); + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct gre_base_hdr gre; +} __attribute__((packed)) encap_gre_t; + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct udphdr udp; + struct guehdr gue; + struct unigue unigue; +} __attribute__((packed)) encap_headers_t; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f4aff6b8284b..10188cc8e9e0 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -105,6 +105,13 @@ struct ipv6_packet { } __packed; extern struct ipv6_packet pkt_v6; +#define PRINT_FAIL(format...) 
\ + ({ \ + test__fail(); \ + fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__); \ + fprintf(stdout, ##format); \ + }) + #define _CHECK(condition, tag, duration, format...) ({ \ int __ret = !!(condition); \ int __save_errno = errno; \ -- cgit v1.2.3 From e411eb257b331bf44cbe8845b5351260c8222c6c Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Sun, 26 Apr 2020 14:36:35 +0800 Subject: libbpf: Return err if bpf_object__load failed bpf_object__load() can fail with various return codes; when it fails to load the object, bpf_prog_load_xattr() must return that err instead of -EINVAL. Signed-off-by: Mao Wenan Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200426063635.130680-3-maowenan@huawei.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8f480e29a6b0..8e1dc6980fac 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7006,7 +7006,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, err = bpf_object__load(obj); if (err) { bpf_object__close(obj); - return -EINVAL; + return err; } *pobj = obj; -- cgit v1.2.3 From 3e54442c93845316762b1b3c75e654463fd1b715 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:01 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_RING_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_RING_OPEN, which allows notifying userspace when the port loses the continuity of MRP frames. This attribute is set by the kernel whenever the SW or HW detects that the ring is being opened or closed. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S.
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 127c704eeba9..a009365ad67b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 43dab4066f91..4084f1ef8641 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -151,6 +151,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + 0; } @@ -213,6 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & + BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index ca6665ea758a..cafedbbfefbe 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 075c8aa79d541ea08c67a2e6d955f6457e98c21c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:13:10 +0300 Subject: selftests: forwarding: tc_actions.sh: add matchall mirror test Add 
test for matchall classifier with mirred egress mirror action. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../testing/selftests/net/forwarding/tc_actions.sh | 26 +++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 813d02d1939d..d9eca227136b 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -2,7 +2,8 @@ # SPDX-License-Identifier: GPL-2.0 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ - mirred_egress_mirror_test gact_trap_test" + mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ + gact_trap_test" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -50,6 +51,9 @@ switch_destroy() mirred_egress_test() { local action=$1 + local protocol=$2 + local classifier=$3 + local classifier_args=$4 RET=0 @@ -62,9 +66,9 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_fail $? "Matched without redirect rule inserted" - tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ - $tcflags dst_ip 192.0.2.2 action mirred egress $action \ - dev $swp2 + tc filter add dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier $tcflags $classifier_args \ + action mirred egress $action dev $swp2 $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ -t ip -q @@ -72,10 +76,11 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_err $? 
"Did not match incoming $action packet" - tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + tc filter del dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower - log_test "mirred egress $action ($tcflags)" + log_test "mirred egress $classifier $action ($tcflags)" } gact_drop_and_ok_test() @@ -187,12 +192,17 @@ cleanup() mirred_egress_redirect_test() { - mirred_egress_test "redirect" + mirred_egress_test "redirect" "ip" "flower" "dst_ip 192.0.2.2" } mirred_egress_mirror_test() { - mirred_egress_test "mirror" + mirred_egress_test "mirror" "ip" "flower" "dst_ip 192.0.2.2" +} + +matchall_mirred_egress_mirror_test() +{ + mirred_egress_test "mirror" "all" "matchall" "" } trap cleanup EXIT -- cgit v1.2.3 From b26d1e2b60284dc9f66ffad9ccd5c5da1100bb4b Mon Sep 17 00:00:00 2001 From: Veronika Kabatova Date: Tue, 28 Apr 2020 19:37:42 +0200 Subject: selftests/bpf: Copy runqslower to OUTPUT directory $(OUTPUT)/runqslower makefile target doesn't actually create runqslower binary in the $(OUTPUT) directory. As lib.mk expects all TEST_GEN_PROGS_EXTENDED (which runqslower is a part of) to be present in the OUTPUT directory, this results in an error when running e.g. `make install`: rsync: link_stat "tools/testing/selftests/bpf/runqslower" failed: No such file or directory (2) Copy the binary into the OUTPUT directory after building it to fix the error. 
Fixes: 3a0d3092a4ed ("selftests/bpf: Build runqslower from selftests") Signed-off-by: Veronika Kabatova Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200428173742.2988395-1-vkabatov@redhat.com --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7729892e0b04..4e654d41c7af 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -141,7 +141,8 @@ VMLINUX_BTF := $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) $(OUTPUT)/runqslower: $(BPFOBJ) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \ - BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) + BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \ + cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) -- cgit v1.2.3 From a6bbdf2e750f245d219d39f3c3d06ace2c5871e6 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Tue, 28 Apr 2020 17:07:09 +0800 Subject: libbpf: Remove unneeded semicolon in btf_dump_emit_type Fixes the following coccicheck warning: tools/lib/bpf/btf_dump.c:661:4-5: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1588064829-70613-1-git-send-email-zou_wei@huawei.com --- tools/lib/bpf/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 0c28ee82834b..de07e559a11d 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -658,7 +658,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) if (!btf_dump_is_blacklisted(d, id)) { btf_dump_emit_typedef_def(d, id, t, 0); btf_dump_printf(d, ";\n\n"); - }; + } tstate->fwd_emitted = 1; break; 
default: -- cgit v1.2.3 From 4dddb5be136a7b151c11f0fbe350feff75a89867 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 27 Apr 2020 13:56:47 -0700 Subject: selftests: net: add new testcases for nexthop API compat mode sysctl New tests to check route dump and notifications with net.ipv4.nexthop_compat_mode on and off. Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 198 +++++++++++++++++++++++++++- 1 file changed, 196 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b785241127df..dd0e5fec6367 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -253,6 +253,33 @@ check_route6() check_output "${out}" "${expected}" } +start_ip_monitor() +{ + local mtype=$1 + + # start the monitor in the background + tmpfile=`mktemp /var/run/nexthoptestXXX` + mpid=`($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null` + sleep 0.2 + echo "$mpid $tmpfile" +} + +stop_ip_monitor() +{ + local mpid=$1 + local tmpfile=$2 + local el=$3 + + # check the monitor results + kill $mpid + lines=`wc -l $tmpfile | cut "-d " -f1` + test $lines -eq $el + rc=$? 
+ rm -rf $tmpfile + + return $rc +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # @@ -883,6 +910,173 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +sysctl_nexthop_compat_mode_check() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local lprefix=$1 + + IPE="ip netns exec me" + + $IPE sysctl -q $sysctlname 2>&1 >/dev/null + if [ $? -ne 0 ]; then + echo "SKIP: kernel lacks nexthop compat mode sysctl control" + return $ksft_skip + fi + + out=$($IPE sysctl $sysctlname 2>/dev/null) + log_test $? 0 "$lprefix default nexthop compat mode check" + check_output "${out}" "$sysctlname = 1" +} + +sysctl_nexthop_compat_mode_set() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local mode=$1 + local lprefix=$2 + + IPE="ip netns exec me" + + out=$($IPE sysctl -w $sysctlname=$mode) + log_test $? 0 "$lprefix set compat mode - $mode" + check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode" +} + +ipv6_compat_mode() +{ + local rc + + echo + echo "IPv6 nexthop api compat mode test" + echo "--------------------------------" + + sysctl_nexthop_compat_mode_check "IPv6" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should contain expanded nexthops + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv6 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" + log_test $? 
0 "IPv6 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 3 + + log_test $? 0 "IPv6 compat mode on - nexthop change" + + # set compat mode off + sysctl_nexthop_compat_mode_set 0 "IPv6" + + run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122" + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should not contain expanded nexthops + stop_ip_monitor $ipmout 1 + log_test $? 0 "IPv6 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium" + log_test $? 0 "IPv6 compat mode off - route dump" + + # change in nexthop group should not generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop delete" + + # set compat mode back on + sysctl_nexthop_compat_mode_set 1 "IPv6" +} + +ipv4_compat_mode() +{ + local rc + + echo + echo "IPv4 nexthop api compat mode" + echo "----------------------------" + + sysctl_nexthop_compat_mode_check "IPv4" + if [ $? 
-eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 122 group 21/22" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 3 + + # route add notification should contain expanded nexthops + log_test $? 0 "IPv4 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1" + log_test $? 0 "IPv4 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/23" + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv4 compat mode on - nexthop change" + + sysctl_nexthop_compat_mode_set 0 "IPv4" + + # cleanup + run_cmd "$IP ro del 172.16.101.1/32 nhid 122" + + ipmout=$(start_ip_monitor route) + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 1 + # route add notification should not contain expanded nexthops + log_test $? 0 "IPv4 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122" + log_test $? 0 "IPv4 compat mode off - route dump" + + # change in nexthop group should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/22" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 
0 "IPv4 compat mode off - nexthop delete" + + sysctl_nexthop_compat_mode_set 1 "IPv4" +} + basic() { echo -- cgit v1.2.3 From 1a89595c2272aa9b4cd3fda562545dc1d9cd89ed Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:47 -0700 Subject: kselftest: factor out list manipulation to a helper Kees suggest to factor out the list append code to a macro, since following commits need it, which leads to code duplication. Suggested-by: Kees Cook Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller --- tools/testing/selftests/kselftest_harness.h | 42 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2bb8c81fc0b4..77f754854f0d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -631,6 +631,29 @@ } \ } while (0); OPTIONAL_HANDLER(_assert) +/* List helpers */ +#define __LIST_APPEND(head, item) \ +{ \ + /* Circular linked list where only prev is circular. */ \ + if (head == NULL) { \ + head = item; \ + item->next = NULL; \ + item->prev = item; \ + return; \ + } \ + if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + item->next = NULL; \ + item->prev = head->prev; \ + item->prev->next = item; \ + head->prev = item; \ + } else { \ + item->next = head; \ + item->next->prev = item; \ + item->prev = item; \ + head = item; \ + } \ +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; @@ -667,24 +690,7 @@ static int __constructor_order; static inline void __register_test(struct __test_metadata *t) { __test_count++; - /* Circular linked list where only prev is circular. 
*/ - if (__test_list == NULL) { - __test_list = t; - t->next = NULL; - t->prev = t; - return; - } - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { - t->next = NULL; - t->prev = __test_list->prev; - t->prev->next = t; - __test_list->prev = t; - } else { - t->next = __test_list; - t->next->prev = t; - t->prev = t; - __test_list = t; - } + __LIST_APPEND(__test_list, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) -- cgit v1.2.3 From 142aca6b388c8ab83dc41bd71150cb23115bd285 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:48 -0700 Subject: kselftest: create fixture objects Grouping tests by fixture will allow us to parametrize test runs. Create full objects for fixtures. Add a "global" fixture for tests without a fixture. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller --- tools/testing/selftests/kselftest_harness.h | 51 +++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 77f754854f0d..de283fd6fc4d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -169,8 +169,10 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ static struct __test_metadata _##test_name##_object = \ - { .name = "global." #test_name, \ - .fn = &test_name, .termsig = _signal, \ + { .name = #test_name, \ + .fn = &test_name, \ + .fixture = &_fixture_global, \ + .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ static void __attribute__((constructor)) _register_##test_name(void) \ { \ @@ -212,10 +214,12 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). 
*/ #define FIXTURE(fixture_name) \ + static struct __fixture_metadata _##fixture_name##_fixture_object = \ + { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_data(void) \ { \ - __fixture_count++; \ + __register_fixture(&_##fixture_name##_fixture_object); \ } \ FIXTURE_DATA(fixture_name) @@ -309,8 +313,9 @@ } \ static struct __test_metadata \ _##fixture_name##_##test_name##_object = { \ - .name = #fixture_name "." #test_name, \ + .name = #test_name, \ .fn = &wrapper_##fixture_name##_##test_name, \ + .fixture = &_##fixture_name##_fixture_object, \ .termsig = signal, \ .timeout = tmout, \ }; \ @@ -654,11 +659,34 @@ } \ } +/* Contains all the information about a fixture. */ +struct __fixture_metadata { + const char *name; + struct __fixture_metadata *prev, *next; +} _fixture_global __attribute__((unused)) = { + .name = "global", + .prev = &_fixture_global, +}; + +static struct __fixture_metadata *__fixture_list = &_fixture_global; +static unsigned int __fixture_count; +static int __constructor_order; + +#define _CONSTRUCTOR_ORDER_FORWARD 1 +#define _CONSTRUCTOR_ORDER_BACKWARD -1 + +static inline void __register_fixture(struct __fixture_metadata *f) +{ + __fixture_count++; + __LIST_APPEND(__fixture_list, f); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; void (*fn)(struct __test_metadata *); pid_t pid; /* pid of test when being run */ + struct __fixture_metadata *fixture; int termsig; int passed; int trigger; /* extra handler after the evaluation */ @@ -672,11 +700,6 @@ struct __test_metadata { /* Storage for the (global) tests to be run. 
*/ static struct __test_metadata *__test_list; static unsigned int __test_count; -static unsigned int __fixture_count; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 /* * Since constructors are called in reverse order, reverse the test @@ -796,11 +819,12 @@ void __wait_for_test(struct __test_metadata *t) } } -void __run_test(struct __test_metadata *t) +void __run_test(struct __fixture_metadata *f, + struct __test_metadata *t) { t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s\n", t->name); + printf("[ RUN ] %s.%s\n", f->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); @@ -812,7 +836,8 @@ void __run_test(struct __test_metadata *t) } else { __wait_for_test(t); } - printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); + printf("[ %4s ] %s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, @@ -828,7 +853,7 @@ static int test_harness_run(int __attribute__((unused)) argc, __test_count, __fixture_count + 1); for (t = __test_list; t; t = t->next) { count++; - __run_test(t); + __run_test(t->fixture, t); if (t->passed) pass_count++; else -- cgit v1.2.3 From e7f304607778e31bfd8e6b00ce2a8f990b265e14 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:49 -0700 Subject: kselftest: run tests by fixture Now that all tests have a fixture object move from a global list of tests to a list of tests per fixture. Order of tests may change as we will now group and run test fixture by fixture, rather than in declaration order. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. 
Miller --- tools/testing/selftests/kselftest_harness.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index de283fd6fc4d..fa7185e45472 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -659,9 +659,12 @@ } \ } +struct __test_metadata; + /* Contains all the information about a fixture. */ struct __fixture_metadata { const char *name; + struct __test_metadata *tests; struct __fixture_metadata *prev, *next; } _fixture_global __attribute__((unused)) = { .name = "global", @@ -698,7 +701,6 @@ struct __test_metadata { }; /* Storage for the (global) tests to be run. */ -static struct __test_metadata *__test_list; static unsigned int __test_count; /* @@ -713,7 +715,7 @@ static unsigned int __test_count; static inline void __register_test(struct __test_metadata *t) { __test_count++; - __LIST_APPEND(__test_list, t); + __LIST_APPEND(t->fixture->tests, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) @@ -843,6 +845,7 @@ void __run_test(struct __fixture_metadata *f, static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; unsigned int count = 0; @@ -851,13 +854,15 @@ static int test_harness_run(int __attribute__((unused)) argc, /* TODO(wad) add optional arguments similar to gtest. 
*/ printf("[==========] Running %u tests from %u test cases.\n", __test_count, __fixture_count + 1); - for (t = __test_list; t; t = t->next) { - count++; - __run_test(t->fixture, t); - if (t->passed) - pass_count++; - else - ret = 1; + for (f = __fixture_list; f; f = f->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, t); + if (t->passed) + pass_count++; + else + ret = 1; + } } printf("[==========] %u / %u tests passed.\n", pass_count, count); printf("[ %s ]\n", (ret ? "FAILED" : "PASSED")); -- cgit v1.2.3 From 74bc7c97fa88ae334752e7b45702d23813df8873 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:50 -0700 Subject: kselftest: add fixture variants Allow users to build parameterized variants of fixtures. If fixtures want variants, they call FIXTURE_VARIANT() to declare the structure to fill for each variant. Each fixture will be re-run for each of the variants defined by calling FIXTURE_VARIANT_ADD() with the differing parameters initializing the structure. Since tests are being re-run, additional initialization (steps, no_print) is also added. Signed-off-by: Jakub Kicinski Acked-by: Kees Cook Signed-off-by: David S. Miller --- Documentation/dev-tools/kselftest.rst | 3 +- tools/testing/selftests/kselftest_harness.h | 148 +++++++++++++++++++++++----- 2 files changed, 124 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index 61ae13c44f91..5d1f56fcd2e7 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -301,7 +301,8 @@ Helpers .. 
kernel-doc:: tools/testing/selftests/kselftest_harness.h :functions: TH_LOG TEST TEST_SIGNAL FIXTURE FIXTURE_DATA FIXTURE_SETUP - FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN + FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN FIXTURE_VARIANT + FIXTURE_VARIANT_ADD Operators --------- diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index fa7185e45472..c9f03ef93338 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -168,9 +168,15 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ + static inline void wrapper_##test_name( \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ + { \ + test_name(_metadata); \ + } \ static struct __test_metadata _##test_name##_object = \ { .name = #test_name, \ - .fn = &test_name, \ + .fn = &wrapper_##test_name, \ .fixture = &_fixture_global, \ .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ @@ -214,6 +220,7 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). */ #define FIXTURE(fixture_name) \ + FIXTURE_VARIANT(fixture_name); \ static struct __fixture_metadata _##fixture_name##_fixture_object = \ { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ @@ -245,7 +252,10 @@ #define FIXTURE_SETUP(fixture_name) \ void fixture_name##_setup( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) + /** * FIXTURE_TEARDOWN(fixture_name) * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. 
@@ -267,6 +277,59 @@ struct __test_metadata __attribute__((unused)) *_metadata, \ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) +/** + * FIXTURE_VARIANT(fixture_name) - Optionally called once per fixture + * to declare fixture variant + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_VARIANT(datatype name) { + * type property1; + * ... + * }; + * + * Defines type of constant parameters provided to FIXTURE_SETUP() and TEST_F() + * as *variant*. Variants allow the same tests to be run with different + * arguments. + */ +#define FIXTURE_VARIANT(fixture_name) struct _fixture_variant_##fixture_name + +/** + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) - Called once per fixture + * variant to setup and register the data + * + * @fixture_name: fixture name + * @variant_name: name of the parameter set + * + * .. code-block:: c + * + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) { + * .property1 = val1, + * ... + * }; + * + * Defines a variant of the test fixture, provided to FIXTURE_SETUP() and + * TEST_F() as *variant*. Tests of each fixture will be run once for each + * variant. 
+ */ +#define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ + extern FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant; \ + static struct __fixture_variant_metadata \ + _##fixture_name##_##variant_name##_object = \ + { .name = #variant_name, \ + .data = &_##fixture_name##_##variant_name##_variant}; \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_##variant_name(void) \ + { \ + __register_fixture_variant(&_##fixture_name##_fixture_object, \ + &_##fixture_name##_##variant_name##_object); \ + } \ + FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant = + /** * TEST_F(fixture_name, test_name) - Emits test registration and helpers for * fixture-based test cases @@ -297,18 +360,20 @@ #define __TEST_F_IMPL(fixture_name, test_name, signal, tmout) \ static void fixture_name##_##test_name( \ struct __test_metadata *_metadata, \ - FIXTURE_DATA(fixture_name) *self); \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) *variant); \ static inline void wrapper_##fixture_name##_##test_name( \ - struct __test_metadata *_metadata) \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ FIXTURE_DATA(fixture_name) self; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ - fixture_name##_setup(_metadata, &self); \ + fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. 
*/ \ if (!_metadata->passed) \ return; \ - fixture_name##_##test_name(_metadata, &self); \ + fixture_name##_##test_name(_metadata, &self, variant->data); \ fixture_name##_teardown(_metadata, &self); \ } \ static struct __test_metadata \ @@ -326,7 +391,9 @@ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) /** * TEST_HARNESS_MAIN - Simple wrapper to run the test harness @@ -660,11 +727,13 @@ } struct __test_metadata; +struct __fixture_variant_metadata; /* Contains all the information about a fixture. */ struct __fixture_metadata { const char *name; struct __test_metadata *tests; + struct __fixture_variant_metadata *variant; struct __fixture_metadata *prev, *next; } _fixture_global __attribute__((unused)) = { .name = "global", @@ -672,7 +741,6 @@ struct __fixture_metadata { }; static struct __fixture_metadata *__fixture_list = &_fixture_global; -static unsigned int __fixture_count; static int __constructor_order; #define _CONSTRUCTOR_ORDER_FORWARD 1 @@ -680,14 +748,27 @@ static int __constructor_order; static inline void __register_fixture(struct __fixture_metadata *f) { - __fixture_count++; __LIST_APPEND(__fixture_list, f); } +struct __fixture_variant_metadata { + const char *name; + const void *data; + struct __fixture_variant_metadata *prev, *next; +}; + +static inline void +__register_fixture_variant(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant) +{ + __LIST_APPEND(f->variant, variant); +} + /* Contains all the information for test execution and status checking. 
*/ struct __test_metadata { const char *name; - void (*fn)(struct __test_metadata *); + void (*fn)(struct __test_metadata *, + struct __fixture_variant_metadata *); pid_t pid; /* pid of test when being run */ struct __fixture_metadata *fixture; int termsig; @@ -700,9 +781,6 @@ struct __test_metadata { struct __test_metadata *prev, *next; }; -/* Storage for the (global) tests to be run. */ -static unsigned int __test_count; - /* * Since constructors are called in reverse order, reverse the test * list so tests are run in source declaration order. @@ -714,7 +792,6 @@ static unsigned int __test_count; */ static inline void __register_test(struct __test_metadata *t) { - __test_count++; __LIST_APPEND(t->fixture->tests, t); } @@ -822,46 +899,65 @@ void __wait_for_test(struct __test_metadata *t) } void __run_test(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant, struct __test_metadata *t) { + /* reset test struct */ t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s.%s\n", f->name, t->name); + t->step = 0; + t->no_print = 0; + + printf("[ RUN ] %s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { - t->fn(t); + t->fn(t, variant); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { __wait_for_test(t); } - printf("[ %4s ] %s.%s\n", (t->passed ? "OK" : "FAIL"), - f->name, t->name); + printf("[ %4s ] %s%s%s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, variant->name[0] ? "." 
: "", variant->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_variant_metadata no_variant = { .name = "", }; + struct __fixture_variant_metadata *v; struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; + unsigned int case_count = 0, test_count = 0; unsigned int count = 0; unsigned int pass_count = 0; + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + case_count++; + for (t = f->tests; t; t = t->next) + test_count++; + } + } + /* TODO(wad) add optional arguments similar to gtest. */ printf("[==========] Running %u tests from %u test cases.\n", - __test_count, __fixture_count + 1); + test_count, case_count); for (f = __fixture_list; f; f = f->next) { - for (t = f->tests; t; t = t->next) { - count++; - __run_test(f, t); - if (t->passed) - pass_count++; - else - ret = 1; + for (v = f->variant ?: &no_variant; v; v = v->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, v, t); + if (t->passed) + pass_count++; + else + ret = 1; + } } } printf("[==========] %u / %u tests passed.\n", pass_count, count); -- cgit v1.2.3 From 0feba2219b7348dce7d59312f4701a4805768f2d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2020 18:03:51 -0700 Subject: selftests: tls: run all tests for TLS 1.2 and TLS 1.3 TLS 1.2 and TLS 1.3 differ in the implementation. Use fixture parameters to run all tests for both versions, and remove the one-off TLS 1.2 test. Signed-off-by: Jakub Kicinski Reviewed-by: Kees Cook Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/tls.c | 93 +++++++-------------------------------- 1 file changed, 17 insertions(+), 76 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0ea44d975b6c..c5282e62df75 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -101,6 +101,21 @@ FIXTURE(tls) bool notls; }; +FIXTURE_VARIANT(tls) +{ + unsigned int tls_version; +}; + +FIXTURE_VARIANT_ADD(tls, 12) +{ + .tls_version = TLS_1_2_VERSION, +}; + +FIXTURE_VARIANT_ADD(tls, 13) +{ + .tls_version = TLS_1_3_VERSION, +}; + FIXTURE_SETUP(tls) { struct tls12_crypto_info_aes_gcm_128 tls12; @@ -112,7 +127,7 @@ FIXTURE_SETUP(tls) len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; addr.sin_family = AF_INET; @@ -733,7 +748,7 @@ TEST_F(tls, bidir) struct tls12_crypto_info_aes_gcm_128 tls12; memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, @@ -1258,78 +1273,4 @@ TEST(keysizes) { close(cfd); } -TEST(tls12) { - int fd, cfd; - bool notls; - - struct tls12_crypto_info_aes_gcm_128 tls12; - struct sockaddr_in addr; - socklen_t len; - int sfd, ret; - - notls = false; - len = sizeof(addr); - - memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_2_VERSION; - tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; - - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(INADDR_ANY); - addr.sin_port = 0; - - fd = socket(AF_INET, SOCK_STREAM, 0); - sfd = socket(AF_INET, SOCK_STREAM, 0); - - ret = bind(sfd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - ret = listen(sfd, 10); - ASSERT_EQ(ret, 0); - - ret = getsockname(sfd, &addr, &len); - ASSERT_EQ(ret, 0); - - ret = connect(fd, &addr, 
sizeof(addr)); - ASSERT_EQ(ret, 0); - - ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); - if (ret != 0) { - notls = true; - printf("Failure setting TCP_ULP, testing without tls\n"); - } - - if (!notls) { - ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - cfd = accept(sfd, &addr, &len); - ASSERT_GE(cfd, 0); - - if (!notls) { - ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", - sizeof("tls")); - ASSERT_EQ(ret, 0); - - ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - close(sfd); - - char const *test_str = "test_read"; - int send_len = 10; - char buf[10]; - - send_len = strlen(test_str) + 1; - EXPECT_EQ(send(fd, test_str, send_len, 0), send_len); - EXPECT_NE(recv(cfd, buf, send_len, 0), -1); - EXPECT_EQ(memcmp(buf, test_str, send_len), 0); - - close(fd); - close(cfd); -} - TEST_HARNESS_MAIN -- cgit v1.2.3 From cb3f0d56e153398a035eb22769d2cb2837f29747 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:36 +0200 Subject: docs: networking: convert filter.txt to ReST - add SPDX header; - adjust title markup; - mark code blocks and literals as such; - use footnote markup; - mark tables as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. 
Miller --- Documentation/bpf/index.rst | 4 +- Documentation/networking/filter.rst | 1651 ++++++++++++++++++++++++++++++ Documentation/networking/filter.txt | 1545 ---------------------------- Documentation/networking/index.rst | 1 + Documentation/networking/packet_mmap.txt | 2 +- MAINTAINERS | 2 +- tools/bpf/bpf_asm.c | 2 +- tools/bpf/bpf_dbg.c | 2 +- 8 files changed, 1658 insertions(+), 1551 deletions(-) create mode 100644 Documentation/networking/filter.rst delete mode 100644 Documentation/networking/filter.txt (limited to 'tools') diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index f99677f3572f..38b4db8be7a2 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -7,7 +7,7 @@ Filter) facility, with a focus on the extended BPF version (eBPF). This kernel side documentation is still work in progress. The main textual documentation is (for historical reasons) described in -`Documentation/networking/filter.txt`_, which describe both classical +`Documentation/networking/filter.rst`_, which describes both classical and extended BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. @@ -59,7 +59,7 @@ Testing and debugging BPF .. Links: -.. _Documentation/networking/filter.txt: ../networking/filter.txt +.. _Documentation/networking/filter.rst: ../networking/filter.rst .. _man-pages: https://www.kernel.org/doc/man-pages/ .. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html .. _BPF and XDP Reference Guide: http://cilium.readthedocs.io/en/latest/bpf/ diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst new file mode 100644 index 000000000000..a1d3e192b9fa --- /dev/null +++ b/Documentation/networking/filter.rst @@ -0,0 +1,1651 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +======================================================= +Linux Socket Filtering aka Berkeley Packet Filter (BPF) +======================================================= + +Introduction +------------ + +Linux Socket Filtering (LSF) is derived from the Berkeley Packet Filter. +Though there are some distinct differences between the BSD and Linux +Kernel filtering, but when we speak of BPF or LSF in Linux context, we +mean the very same mechanism of filtering in the Linux kernel. + +BPF allows a user-space program to attach a filter onto any socket and +allow or disallow certain types of data to come through the socket. LSF +follows exactly the same filter code structure as BSD's BPF, so referring +to the BSD bpf.4 manpage is very helpful in creating filters. + +On Linux, BPF is much simpler than on BSD. One does not have to worry +about devices or anything like that. You simply create your filter code, +send it to the kernel via the SO_ATTACH_FILTER option and if your filter +code passes the kernel check on it, you then immediately begin filtering +data on that socket. + +You can also detach filters from your socket via the SO_DETACH_FILTER +option. This will probably not be used much since when you close a socket +that has a filter on it the filter is automagically removed. The other +less common case may be adding a different filter on the same socket where +you had another filter that is still running: the kernel takes care of +removing the old one and placing your new one in its place, assuming your +filter has passed the checks, otherwise if it fails the old filter will +remain on that socket. + +SO_LOCK_FILTER option allows to lock the filter attached to a socket. Once +set, a filter cannot be removed or changed. This allows one process to +setup a socket, attach a filter, lock it then drop privileges and be +assured that the filter will be kept until the socket is closed. 
+ +The biggest user of this construct might be libpcap. Issuing a high-level +filter command like `tcpdump -i em1 port 22` passes through the libpcap +internal compiler that generates a structure that can eventually be loaded +via SO_ATTACH_FILTER to the kernel. `tcpdump -i em1 port 22 -ddd` +displays what is being placed into this structure. + +Although we were only speaking about sockets here, BPF in Linux is used +in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel +qdisc layer, SECCOMP-BPF (SECure COMPuting [1]_), and lots of other places +such as team driver, PTP code, etc where BPF is being used. + +.. [1] Documentation/userspace-api/seccomp_filter.rst + +Original BPF paper: + +Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new +architecture for user-level packet capture. In Proceedings of the +USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993 +Conference Proceedings (USENIX'93). USENIX Association, Berkeley, +CA, USA, 2-2. [http://www.tcpdump.org/papers/bpf-usenix93.pdf] + +Structure +--------- + +User space applications include <linux/filter.h> which contains the +following relevant structures:: + + struct sock_filter { /* Filter block */ + __u16 code; /* Actual filter code */ + __u8 jt; /* Jump true */ + __u8 jf; /* Jump false */ + __u32 k; /* Generic multiuse field */ + }; + +Such a structure is assembled as an array of 4-tuples, that contains +a code, jt, jf and k value. jt and jf are jump offsets and k a generic +value to be used for a provided code:: + + struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ + unsigned short len; /* Number of filter blocks */ + struct sock_filter __user *filter; + }; + +For socket filtering, a pointer to this structure (as shown in +follow-up example) is being passed to the kernel through setsockopt(2). + +Example +------- + +:: + + #include <sys/socket.h> + #include <sys/types.h> + #include <arpa/inet.h> + #include <linux/if_ether.h> + /* ... 
*/ + + /* From the example above: tcpdump -i em1 port 22 -dd */ + struct sock_filter code[] = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000014 }, + { 0x15, 2, 0, 0x00000084 }, + { 0x15, 1, 0, 0x00000006 }, + { 0x15, 0, 17, 0x00000011 }, + { 0x28, 0, 0, 0x00000036 }, + { 0x15, 14, 0, 0x00000016 }, + { 0x28, 0, 0, 0x00000038 }, + { 0x15, 12, 13, 0x00000016 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 2, 0, 0x00000084 }, + { 0x15, 1, 0, 0x00000006 }, + { 0x15, 0, 8, 0x00000011 }, + { 0x28, 0, 0, 0x00000014 }, + { 0x45, 6, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x0000000e }, + { 0x48, 0, 0, 0x0000000e }, + { 0x15, 2, 0, 0x00000016 }, + { 0x48, 0, 0, 0x00000010 }, + { 0x15, 0, 1, 0x00000016 }, + { 0x06, 0, 0, 0x0000ffff }, + { 0x06, 0, 0, 0x00000000 }, + }; + + struct sock_fprog bpf = { + .len = ARRAY_SIZE(code), + .filter = code, + }; + + sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (sock < 0) + /* ... bail out ... */ + + ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)); + if (ret < 0) + /* ... bail out ... */ + + /* ... */ + close(sock); + +The above example code attaches a socket filter for a PF_PACKET socket +in order to let all IPv4/IPv6 packets with port 22 pass. The rest will +be dropped for this socket. + +The setsockopt(2) call to SO_DETACH_FILTER doesn't need any arguments +and SO_LOCK_FILTER for preventing the filter to be detached, takes an +integer value with 0 or 1. + +Note that socket filters are not restricted to PF_PACKET sockets only, +but can also be used on other socket families. 
+ +Summary of system calls: + + * setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val)); + * setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)); + * setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &val, sizeof(val)); + +Normally, most use cases for socket filtering on packet sockets will be +covered by libpcap in high-level syntax, so as an application developer +you should stick to that. libpcap wraps its own layer around all that. + +Unless i) using/linking to libpcap is not an option, ii) the required BPF +filters use Linux extensions that are not supported by libpcap's compiler, +iii) a filter might be more complex and not cleanly implementable with +libpcap's compiler, or iv) particular filter codes should be optimized +differently than libpcap's internal compiler does; then in such cases +writing such a filter "by hand" can be an alternative. For example, +xt_bpf and cls_bpf users might have requirements that could result in +more complex filter code, or one that cannot be expressed with libpcap +(e.g. different return codes for various code paths). Moreover, BPF JIT +implementors may wish to manually write test cases and thus need low-level +access to BPF code as well. + +BPF engine and instruction set +------------------------------ + +Under tools/bpf/ there's a small helper tool called bpf_asm which can +be used to write low-level filters for example scenarios mentioned in the +previous section. Asm-like syntax mentioned here has been implemented in +bpf_asm and will be used for further explanations (instead of dealing with +less readable opcodes directly, principles are the same). The syntax is +closely modelled after Steven McCanne's and Van Jacobson's BPF paper. 
+ +The BPF architecture consists of the following basic elements: + + ======= ==================================================== + Element Description + ======= ==================================================== + A 32 bit wide accumulator + X 32 bit wide X register + M[] 16 x 32 bit wide misc registers aka "scratch memory + store", addressable from 0 to 15 + ======= ==================================================== + +A program, that is translated by bpf_asm into "opcodes" is an array that +consists of the following elements (as already mentioned):: + + op:16, jt:8, jf:8, k:32 + +The element op is a 16 bit wide opcode that has a particular instruction +encoded. jt and jf are two 8 bit wide jump targets, one for condition +"jump if true", the other one "jump if false". Eventually, element k +contains a miscellaneous argument that can be interpreted in different +ways depending on the given instruction in op. + +The instruction set consists of load, store, branch, alu, miscellaneous +and return instructions that are also represented in bpf_asm syntax. This +table lists all bpf_asm instructions available resp. 
what their underlying +opcodes as defined in linux/filter.h stand for: + + =========== =================== ===================== + Instruction Addressing mode Description + =========== =================== ===================== + ld 1, 2, 3, 4, 12 Load word into A + ldi 4 Load word into A + ldh 1, 2 Load half-word into A + ldb 1, 2 Load byte into A + ldx 3, 4, 5, 12 Load word into X + ldxi 4 Load word into X + ldxb 5 Load byte into X + + st 3 Store A into M[] + stx 3 Store X into M[] + + jmp 6 Jump to label + ja 6 Jump to label + jeq 7, 8, 9, 10 Jump on A == <x> + jneq 9, 10 Jump on A != <x> + jne 9, 10 Jump on A != <x> + jlt 9, 10 Jump on A < <x> + jle 9, 10 Jump on A <= <x> + jgt 7, 8, 9, 10 Jump on A > <x> + jge 7, 8, 9, 10 Jump on A >= <x> + jset 7, 8, 9, 10 Jump on A & <x> + + add 0, 4 A + <x> + sub 0, 4 A - <x> + mul 0, 4 A * <x> + div 0, 4 A / <x> + mod 0, 4 A % <x> + neg !A + and 0, 4 A & <x> + or 0, 4 A | <x> + xor 0, 4 A ^ <x> + lsh 0, 4 A << <x> + rsh 0, 4 A >> <x> + + tax Copy A into X + txa Copy X into A + + ret 4, 11 Return + =========== =================== ===================== + +The next table shows addressing formats from the 2nd column: + + =============== =================== =============================================== + Addressing mode Syntax Description + =============== =================== =============================================== + 0 x/%x Register X + 1 [k] BHW at byte offset k in the packet + 2 [x + k] BHW at the offset X + k in the packet + 3 M[k] Word at offset k in M[] + 4 #k Literal value stored in k + 5 4*([k]&0xf) Lower nibble * 4 at byte offset k in the packet + 6 L Jump label L + 7 #k,Lt,Lf Jump to Lt if true, otherwise jump to Lf + 8 x/%x,Lt,Lf Jump to Lt if true, otherwise jump to Lf + 9 #k,Lt Jump to Lt if predicate is true + 10 x/%x,Lt Jump to Lt if predicate is true + 11 a/%a Accumulator A + 12 extension BPF extension + =============== =================== =============================================== + +The Linux kernel also has a couple of BPF extensions that are used along +with the 
class of load instructions by "overloading" the k argument with +a negative offset + a particular extension offset. The result of such BPF +extensions is loaded into A. + +Possible BPF extensions are shown in the following table: + + =================================== ================================================= + Extension Description + =================================== ================================================= + len skb->len + proto skb->protocol + type skb->pkt_type + poff Payload start offset + ifidx skb->dev->ifindex + nla Netlink attribute of type X with offset A + nlan Nested Netlink attribute of type X with offset A + mark skb->mark + queue skb->queue_mapping + hatype skb->dev->type + rxhash skb->hash + cpu raw_smp_processor_id() + vlan_tci skb_vlan_tag_get(skb) + vlan_avail skb_vlan_tag_present(skb) + vlan_tpid skb->vlan_proto + rand prandom_u32() + =================================== ================================================= + +These extensions can also be prefixed with '#'. 
+Examples for low-level BPF: + +**ARP packets**:: + + ldh [12] + jne #0x806, drop + ret #-1 + drop: ret #0 + +**IPv4 TCP packets**:: + + ldh [12] + jne #0x800, drop + ldb [23] + jneq #6, drop + ret #-1 + drop: ret #0 + +**(Accelerated) VLAN w/ id 10**:: + + ld vlan_tci + jneq #10, drop + ret #-1 + drop: ret #0 + +**icmp random packet sampling, 1 in 4**:: + + ldh [12] + jne #0x800, drop + ldb [23] + jneq #1, drop + # get a random uint32 number + ld rand + mod #4 + jneq #1, drop + ret #-1 + drop: ret #0 + +**SECCOMP filter example**:: + + ld [4] /* offsetof(struct seccomp_data, arch) */ + jne #0xc000003e, bad /* AUDIT_ARCH_X86_64 */ + ld [0] /* offsetof(struct seccomp_data, nr) */ + jeq #15, good /* __NR_rt_sigreturn */ + jeq #231, good /* __NR_exit_group */ + jeq #60, good /* __NR_exit */ + jeq #0, good /* __NR_read */ + jeq #1, good /* __NR_write */ + jeq #5, good /* __NR_fstat */ + jeq #9, good /* __NR_mmap */ + jeq #14, good /* __NR_rt_sigprocmask */ + jeq #13, good /* __NR_rt_sigaction */ + jeq #35, good /* __NR_nanosleep */ + bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ + good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ + +The above example code can be placed into a file (here called "foo"), and +then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf +and cls_bpf understands and can directly be loaded with. Example with above +ARP code:: + + $ ./bpf_asm foo + 4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0, + +In copy and paste C-like output:: + + $ ./bpf_asm -c foo + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 1, 0x00000806 }, + { 0x06, 0, 0, 0xffffffff }, + { 0x06, 0, 0, 0000000000 }, + +In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF +filters that might not be obvious at first, it's good to test filters before +attaching to a live system. For that purpose, there's a small tool called +bpf_dbg under tools/bpf/ in the kernel source directory. 
This debugger allows +for testing BPF filters against given pcap files, single stepping through the +BPF code on the pcap's packets and to do BPF machine register dumps. + +Starting bpf_dbg is trivial and just requires issuing:: + + # ./bpf_dbg + +In case input and output do not equal stdin/stdout, bpf_dbg takes an +alternative stdin source as a first argument, and an alternative stdout +sink as a second one, e.g. `./bpf_dbg test_in.txt test_out.txt`. + +Other than that, a particular libreadline configuration can be set via +file "~/.bpf_dbg_init" and the command history is stored in the file +"~/.bpf_dbg_history". + +Interaction in bpf_dbg happens through a shell that also has auto-completion +support (follow-up example commands starting with '>' denote bpf_dbg shell). +The usual workflow would be to ... + +* load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 + Loads a BPF filter from standard output of bpf_asm, or transformed via + e.g. ``tcpdump -iem1 -ddd port 22 | tr '\n' ','``. Note that for JIT + debugging (next section), this command creates a temporary socket and + loads the BPF code into the kernel. Thus, this will also be useful for + JIT developers. + +* load pcap foo.pcap + + Loads standard tcpdump pcap file. + +* run [<n>] + +bpf passes:1 fails:9 + Runs through all packets from a pcap to account how many passes and fails + the filter will generate. A limit of packets to traverse can be given. + +* disassemble:: + + l0: ldh [12] + l1: jeq #0x800, l2, l5 + l2: ldb [23] + l3: jeq #0x1, l4, l5 + l4: ret #0xffff + l5: ret #0 + + Prints out BPF code disassembly. + +* dump:: + + /* { op, jt, jf, k }, */ + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 3, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 0, 1, 0x00000001 }, + { 0x06, 0, 0, 0x0000ffff }, + { 0x06, 0, 0, 0000000000 }, + + Prints out C-style BPF code dump. + +* breakpoint 0:: + + breakpoint at: l0: ldh [12] + +* breakpoint 1:: + + breakpoint at: l1: jeq #0x800, l2, l5 + + ... 
+ + Sets breakpoints at particular BPF instructions. Issuing a `run` command + will walk through the pcap file continuing from the current packet and + break when a breakpoint is being hit (another `run` will continue from + the currently active breakpoint executing next instructions): + + * run:: + + -- register dump -- + pc: [0] <-- program counter + code: [40] jt[0] jf[0] k[12] <-- plain BPF code of current instruction + curr: l0: ldh [12] <-- disassembly of current instruction + A: [00000000][0] <-- content of A (hex, decimal) + X: [00000000][0] <-- content of X (hex, decimal) + M[0,15]: [00000000][0] <-- folded content of M (hex, decimal) + -- packet dump -- <-- Current packet from pcap (hex) + len: 42 + 0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 + 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 + 32: 00 00 00 00 00 00 0a 3b 01 01 + (breakpoint) + > + + * breakpoint:: + + breakpoints: 0 1 + + Prints currently set breakpoints. + +* step [-<n>, +<n>] + + Performs single stepping through the BPF program from the current pc + offset. Thus, on each step invocation, above register dump is issued. + This can go forwards and backwards in time, a plain `step` will break + on the next BPF instruction, thus +1. (No `run` needs to be issued here.) + +* select <n> + + Selects a given packet from the pcap file to continue from. Thus, on + the next `run` or `step`, the BPF program is being evaluated against + the user pre-selected packet. Numbering starts just as in Wireshark + with index 1. + +* quit + + Exits bpf_dbg. + +JIT compiler +------------ + +The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, +PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through +CONFIG_BPF_JIT. 
The JIT compiler is transparently invoked for each +attached filter from user space or for internal kernel users if it has +been previously enabled by root:: + + echo 1 > /proc/sys/net/core/bpf_jit_enable + +For JIT developers, doing audits etc, each compile run can output the generated +opcode image into the kernel log via:: + + echo 2 > /proc/sys/net/core/bpf_jit_enable + +Example output from dmesg:: + + [ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f + [ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68 + [ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00 + [ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00 + [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 + [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 + +When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and +setting any other value than that will result in failure. This is even the case for +setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log +is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the +generally recommended approach instead. 
+ +In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for +generating disassembly out of the kernel log's hexdump:: + + # ./bpf_jit_disasm + 70 bytes emitted from JIT compiler (pass:3, flen:6) + ffffffffa0069c8f + : + 0: push %rbp + 1: mov %rsp,%rbp + 4: sub $0x60,%rsp + 8: mov %rbx,-0x8(%rbp) + c: mov 0x68(%rdi),%r9d + 10: sub 0x6c(%rdi),%r9d + 14: mov 0xd8(%rdi),%r8 + 1b: mov $0xc,%esi + 20: callq 0xffffffffe0ff9442 + 25: cmp $0x800,%eax + 2a: jne 0x0000000000000042 + 2c: mov $0x17,%esi + 31: callq 0xffffffffe0ff945e + 36: cmp $0x1,%eax + 39: jne 0x0000000000000042 + 3b: mov $0xffff,%eax + 40: jmp 0x0000000000000044 + 42: xor %eax,%eax + 44: leaveq + 45: retq + + Issuing option `-o` will "annotate" opcodes to resulting assembler + instructions, which can be very useful for JIT developers: + + # ./bpf_jit_disasm -o + 70 bytes emitted from JIT compiler (pass:3, flen:6) + ffffffffa0069c8f + : + 0: push %rbp + 55 + 1: mov %rsp,%rbp + 48 89 e5 + 4: sub $0x60,%rsp + 48 83 ec 60 + 8: mov %rbx,-0x8(%rbp) + 48 89 5d f8 + c: mov 0x68(%rdi),%r9d + 44 8b 4f 68 + 10: sub 0x6c(%rdi),%r9d + 44 2b 4f 6c + 14: mov 0xd8(%rdi),%r8 + 4c 8b 87 d8 00 00 00 + 1b: mov $0xc,%esi + be 0c 00 00 00 + 20: callq 0xffffffffe0ff9442 + e8 1d 94 ff e0 + 25: cmp $0x800,%eax + 3d 00 08 00 00 + 2a: jne 0x0000000000000042 + 75 16 + 2c: mov $0x17,%esi + be 17 00 00 00 + 31: callq 0xffffffffe0ff945e + e8 28 94 ff e0 + 36: cmp $0x1,%eax + 83 f8 01 + 39: jne 0x0000000000000042 + 75 07 + 3b: mov $0xffff,%eax + b8 ff ff 00 00 + 40: jmp 0x0000000000000044 + eb 02 + 42: xor %eax,%eax + 31 c0 + 44: leaveq + c9 + 45: retq + c3 + +For BPF JIT developers, bpf_jit_disasm, bpf_asm and bpf_dbg provides a useful +toolchain for developing and testing the kernel's JIT compiler. + +BPF kernel internals +-------------------- +Internally, for the kernel interpreter, a different instruction set +format with similar underlying principles from BPF described in previous +paragraphs is being used. 
However, the instruction set format is modelled +closer to the underlying architecture to mimic native instruction sets, so +that a better performance can be achieved (more details later). This new +ISA is called 'eBPF' or 'internal BPF' interchangeably. (Note: eBPF which +originates from [e]xtended BPF is not the same as BPF extensions! While +eBPF is an ISA, BPF extensions date back to classic BPF's 'overloading' +of BPF_LD | BPF_{B,H,W} | BPF_ABS instruction.) + +It is designed to be JITed with one to one mapping, which can also open up +the possibility for GCC/LLVM compilers to generate optimized eBPF code through +an eBPF backend that performs almost as fast as natively compiled code. + +The new instruction set was originally designed with the possible goal in +mind to write programs in "restricted C" and compile into eBPF with a optional +GCC/LLVM backend, so that it can just-in-time map to modern 64-bit CPUs with +minimal performance overhead over two steps, that is, C -> eBPF -> native code. + +Currently, the new format is being used for running user BPF programs, which +includes seccomp BPF, classic socket filters, cls_bpf traffic classifier, +team driver's classifier for its load-balancing mode, netfilter's xt_bpf +extension, PTP dissector/classifier, and much more. They are all internally +converted by the kernel into the new instruction set representation and run +in the eBPF interpreter. For in-kernel handlers, this all works transparently +by using bpf_prog_create() for setting up the filter, resp. +bpf_prog_destroy() for destroying it. The macro +BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed +code to run the filter. 'filter' is a pointer to struct bpf_prog that we +got from bpf_prog_create(), and 'ctx' the given context (e.g. +skb pointer). All constraints and restrictions from bpf_check_classic() apply +before a conversion to the new layout is being done behind the scenes! 
+ +Currently, the classic BPF format is being used for JITing on most +32-bit architectures, whereas x86-64, aarch64, s390x, powerpc64, +sparc64, arm32, riscv64, riscv32 perform JIT compilation from eBPF +instruction set. + +Some core changes of the new internal format: + +- Number of registers increase from 2 to 10: + + The old format had two registers A and X, and a hidden frame pointer. The + new layout extends this to be 10 internal registers and a read-only frame + pointer. Since 64-bit CPUs are passing arguments to functions via registers + the number of args from eBPF program to in-kernel function is restricted + to 5 and one register is used to accept return value from an in-kernel + function. Natively, x86_64 passes first 6 arguments in registers, aarch64/ + sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved + registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers. + + Therefore, eBPF calling convention is defined as: + + * R0 - return value from in-kernel function, and exit value for eBPF program + * R1 - R5 - arguments from eBPF program to in-kernel function + * R6 - R9 - callee saved registers that in-kernel function will preserve + * R10 - read-only frame pointer to access stack + + Thus, all eBPF registers map one to one to HW registers on x86_64, aarch64, + etc, and eBPF calling convention maps directly to ABIs used by the kernel on + 64-bit architectures. + + On 32-bit architectures JIT may map programs that use only 32-bit arithmetic + and may let more complex programs to be interpreted. + + R0 - R5 are scratch registers and eBPF program needs spill/fill them if + necessary across calls. Note that there is only one eBPF program (== one + eBPF main routine) and it cannot call other eBPF functions, it can only + call predefined in-kernel functions, though. 
+ +- Register width increases from 32-bit to 64-bit: + + Still, the semantics of the original 32-bit ALU operations are preserved + via 32-bit subregisters. All eBPF registers are 64-bit with 32-bit lower + subregisters that zero-extend into 64-bit if they are being written to. + That behavior maps directly to x86_64 and arm64 subregister definition, but + makes other JITs more difficult. + + 32-bit architectures run 64-bit internal BPF programs via interpreter. + Their JITs may convert BPF programs that only use 32-bit subregisters into + native instruction set and let the rest being interpreted. + + Operation is 64-bit, because on 64-bit architectures, pointers are also + 64-bit wide, and we want to pass 64-bit values in/out of kernel functions, + so 32-bit eBPF registers would otherwise require to define register-pair + ABI, thus, there won't be able to use a direct eBPF register to HW register + mapping and JIT would need to do combine/split/move operations for every + register in and out of the function, which is complex, bug prone and slow. + Another reason is the use of atomic 64-bit counters. + +- Conditional jt/jf targets replaced with jt/fall-through: + + While the original design has constructs such as ``if (cond) jump_true; + else jump_false;``, they are being replaced into alternative constructs like + ``if (cond) jump_true; /* else fall-through */``. + +- Introduces bpf_call insn and register passing convention for zero overhead + calls from/to other kernel functions: + + Before an in-kernel function call, the internal BPF program needs to + place function arguments into R1 to R5 registers to satisfy calling + convention, then the interpreter will take them from registers and pass + to in-kernel function. If R1 - R5 registers are mapped to CPU registers + that are used for argument passing on given architecture, the JIT compiler + doesn't need to emit extra moves. 
Function arguments will be in the correct + registers and BPF_CALL instruction will be JITed as single 'call' HW + instruction. This calling convention was picked to cover common call + situations without performance penalty. + + After an in-kernel function call, R1 - R5 are reset to unreadable and R0 has + a return value of the function. Since R6 - R9 are callee saved, their state + is preserved across the call. + + For example, consider three C functions:: + + u64 f1() { return (*_f2)(1); } + u64 f2(u64 a) { return f3(a + 1, a); } + u64 f3(u64 a, u64 b) { return a - b; } + + GCC can compile f1, f3 into x86_64:: + + f1: + movl $1, %edi + movq _f2(%rip), %rax + jmp *%rax + f3: + movq %rdi, %rax + subq %rsi, %rax + ret + + Function f2 in eBPF may look like:: + + f2: + bpf_mov R2, R1 + bpf_add R1, 1 + bpf_call f3 + bpf_exit + + If f2 is JITed and the pointer is stored to ``_f2``, the calls f1 -> f2 -> f3 and + returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to + be used to call into f2. + + For practical reasons all eBPF programs have only one argument 'ctx' which is + already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs + can call kernel functions with up to 5 arguments. Calls with 6 or more arguments + are currently not supported, but these restrictions can be lifted if necessary + in the future. + + On 64-bit architectures all registers map to HW registers one to one. For + example, x86_64 JIT compiler can map them as ... + + :: + + R0 - rax + R1 - rdi + R2 - rsi + R3 - rdx + R4 - rcx + R5 - r8 + R6 - rbx + R7 - r13 + R8 - r14 + R9 - r15 + R10 - rbp + + ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing + and rbx, r12 - r15 are callee saved. 
+ + Then the following internal BPF pseudo-program:: + + bpf_mov R6, R1 /* save ctx */ + bpf_mov R2, 2 + bpf_mov R3, 3 + bpf_mov R4, 4 + bpf_mov R5, 5 + bpf_call foo + bpf_mov R7, R0 /* save foo() return value */ + bpf_mov R1, R6 /* restore ctx for next call */ + bpf_mov R2, 6 + bpf_mov R3, 7 + bpf_mov R4, 8 + bpf_mov R5, 9 + bpf_call bar + bpf_add R0, R7 + bpf_exit + + After JIT to x86_64 may look like:: + + push %rbp + mov %rsp,%rbp + sub $0x228,%rsp + mov %rbx,-0x228(%rbp) + mov %r13,-0x220(%rbp) + mov %rdi,%rbx + mov $0x2,%esi + mov $0x3,%edx + mov $0x4,%ecx + mov $0x5,%r8d + callq foo + mov %rax,%r13 + mov %rbx,%rdi + mov $0x6,%esi + mov $0x7,%edx + mov $0x8,%ecx + mov $0x9,%r8d + callq bar + add %r13,%rax + mov -0x228(%rbp),%rbx + mov -0x220(%rbp),%r13 + leaveq + retq + + Which is in this example equivalent in C to:: + + u64 bpf_filter(u64 ctx) + { + return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); + } + + In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64 + arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper + registers and place their return value into ``%rax`` which is R0 in eBPF. + Prologue and epilogue are emitted by JIT and are implicit in the + interpreter. R0-R5 are scratch registers, so eBPF program needs to preserve + them across the calls as defined by calling convention. + + For example the following program is invalid:: + + bpf_mov R1, 1 + bpf_call foo + bpf_mov R0, R1 + bpf_exit + + After the call the registers R1-R5 contain junk values and cannot be read. + An in-kernel eBPF verifier is used to validate internal BPF programs. + +Also in the new design, eBPF is limited to 4096 insns, which means that any +program will terminate quickly and will only call a fixed number of kernel +functions. Original BPF and the new format are two operand instructions, +which helps to do one-to-one mapping between eBPF insn and x86 insn during JIT. 
+ +The input context pointer for invoking the interpreter function is generic, +its content is defined by a specific use case. For seccomp register R1 points +to seccomp_data, for converted BPF filters R1 points to a skb. + +A program, that is translated internally consists of the following elements:: + + op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 + +So far 87 internal BPF instructions were implemented. 8-bit 'op' opcode field +has room for new instructions. Some of them may use 16/24/32 byte encoding. New +instructions must be multiple of 8 bytes to preserve backward compatibility. + +Internal BPF is a general purpose RISC instruction set. Not every register and +every instruction are used during translation from original BPF to new format. +For example, socket filters are not using ``exclusive add`` instruction, but +tracing filters may do to maintain counters of events, for example. Register R9 +is not used by socket filters either, but more complex filters may be running +out of registers and would have to resort to spill/fill to stack. + +Internal BPF can be used as a generic assembler for last step performance +optimizations, socket filters and seccomp are using it as assembler. Tracing +filters may use it as assembler to generate code from kernel. In kernel usage +may not be bounded by security considerations, since generated internal BPF code +may be optimizing internal code path and not being exposed to the user space. +Safety of internal BPF can come from a verifier (TBD). In such use cases as +described, it may be used as safe instruction set. + +Just like the original BPF, the new format runs within a controlled environment, +is deterministic and the kernel can easily prove that. The safety of the program +can be determined in two steps: first step does depth-first-search to disallow +loops and other CFG validation; second step starts from the first insn and +descends all possible paths. 
It simulates execution of every insn and observes +the state change of registers and stack. + +eBPF opcode encoding +-------------------- + +eBPF is reusing most of the opcode encoding from classic to simplify conversion +of classic BPF to eBPF. For arithmetic and jump instructions the 8-bit 'code' +field is divided into three parts:: + + +----------------+--------+--------------------+ + | 4 bits | 1 bit | 3 bits | + | operation code | source | instruction class | + +----------------+--------+--------------------+ + (MSB) (LSB) + +Three LSB bits store instruction class which is one of: + + =================== =============== + Classic BPF classes eBPF classes + =================== =============== + BPF_LD 0x00 BPF_LD 0x00 + BPF_LDX 0x01 BPF_LDX 0x01 + BPF_ST 0x02 BPF_ST 0x02 + BPF_STX 0x03 BPF_STX 0x03 + BPF_ALU 0x04 BPF_ALU 0x04 + BPF_JMP 0x05 BPF_JMP 0x05 + BPF_RET 0x06 BPF_JMP32 0x06 + BPF_MISC 0x07 BPF_ALU64 0x07 + =================== =============== + +When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... + + :: + + BPF_K 0x00 + BPF_X 0x08 + + * in classic BPF, this means:: + + BPF_SRC(code) == BPF_X - use register X as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + + * in eBPF, this means:: + + BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + +... and four MSB bits store operation code. 
+ +If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of:: + + BPF_ADD 0x00 + BPF_SUB 0x10 + BPF_MUL 0x20 + BPF_DIV 0x30 + BPF_OR 0x40 + BPF_AND 0x50 + BPF_LSH 0x60 + BPF_RSH 0x70 + BPF_NEG 0x80 + BPF_MOD 0x90 + BPF_XOR 0xa0 + BPF_MOV 0xb0 /* eBPF only: mov reg to reg */ + BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ + BPF_END 0xd0 /* eBPF only: endianness conversion */ + +If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of:: + + BPF_JA 0x00 /* BPF_JMP only */ + BPF_JEQ 0x10 + BPF_JGT 0x20 + BPF_JGE 0x30 + BPF_JSET 0x40 + BPF_JNE 0x50 /* eBPF only: jump != */ + BPF_JSGT 0x60 /* eBPF only: signed '>' */ + BPF_JSGE 0x70 /* eBPF only: signed '>=' */ + BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ + BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ + BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ + BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ + BPF_JSLT 0xc0 /* eBPF only: signed '<' */ + BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ + +So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF +and eBPF. There are only two registers in classic BPF, so it means A += X. +In eBPF it means dst_reg = (u32) dst_reg + (u32) src_reg; similarly, +BPF_XOR | BPF_K | BPF_ALU means A ^= imm32 in classic BPF and analogous +dst_reg = (u32) dst_reg ^ (u32) imm32 in eBPF. + +Classic BPF is using BPF_MISC class to represent A = X and X = A moves. +eBPF is using BPF_MOV | BPF_X | BPF_ALU code instead. Since there are no +BPF_MISC operations in eBPF, the class 7 is used as BPF_ALU64 to mean +exactly the same operations as BPF_ALU, but with 64-bit wide operands +instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.: +dst_reg = dst_reg + src_reg + +Classic BPF wastes the whole BPF_RET class to represent a single ``ret`` +operation. Classic BPF_RET | BPF_K means copy imm32 into return register +and perform function exit.
eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT +in eBPF means function exit only. The eBPF program needs to store return +value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as +BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide +operands for the comparisons instead. + +For load and store instructions the 8-bit 'code' field is divided as:: + + +--------+--------+-------------------+ + | 3 bits | 2 bits | 3 bits | + | mode | size | instruction class | + +--------+--------+-------------------+ + (MSB) (LSB) + +Size modifier is one of ... + +:: + + BPF_W 0x00 /* word */ + BPF_H 0x08 /* half word */ + BPF_B 0x10 /* byte */ + BPF_DW 0x18 /* eBPF only, double word */ + +... which encodes size of load/store operation:: + + B - 1 byte + H - 2 byte + W - 4 byte + DW - 8 byte (eBPF only) + +Mode modifier is one of:: + + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ + BPF_ABS 0x20 + BPF_IND 0x40 + BPF_MEM 0x60 + BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ + BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ + BPF_XADD 0xc0 /* eBPF only, exclusive add */ + +eBPF has two non-generic instructions: (BPF_ABS | <size> | BPF_LD) and +(BPF_IND | <size> | BPF_LD) which are used to access packet data. + +They had to be carried over from classic to have strong performance of +socket filters running in eBPF interpreter. These instructions can only +be used when interpreter context is a pointer to ``struct sk_buff`` and +have seven implicit operands. Register R6 is an implicit input that must +contain pointer to sk_buff. Register R0 is an implicit output which contains +the data fetched from the packet. Registers R1-R5 are scratch registers +and must not be used to store the data across BPF_ABS | BPF_LD or +BPF_IND | BPF_LD instructions. + +These instructions have implicit program exit condition as well.
When +eBPF program is trying to access the data beyond the packet boundary, +the interpreter will abort the execution of the program. JIT compilers +therefore must preserve this property. src_reg and imm32 fields are +explicit inputs to these instructions. + +For example:: + + BPF_IND | BPF_W | BPF_LD means: + + R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) + and R1 - R5 were scratched. + +Unlike classic BPF instruction set, eBPF has generic load/store operations:: + + BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg + BPF_MEM | <size> | BPF_ST: *(size *) (dst_reg + off) = imm32 + BPF_MEM | <size> | BPF_LDX: dst_reg = *(size *) (src_reg + off) + BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg + BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg + +Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and +2 byte atomic increments are not supported. + +eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +32-bit immediate value into a register. + +eBPF verifier +------------- +The safety of the eBPF program is determined in two steps. + +First step does DAG check to disallow loops and other CFG validation. +In particular it will detect programs that have unreachable instructions. +(though classic BPF checker allows them) + +Second step starts from the first insn and descends all possible paths. +It simulates execution of every insn and observes the state change of +registers and stack. + +At the start of the program the register R1 contains a pointer to context +and has type PTR_TO_CTX. +If verifier sees an insn that does R2=R1, then R2 has now type +PTR_TO_CTX as well and can be used on the right hand side of expression.
+If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE, +since addition of two valid pointers makes invalid pointer. +(In 'secure' mode verifier will reject any type of pointer arithmetic to make +sure that kernel addresses don't leak to unprivileged users) + +If register was never written to, it's not readable:: + + bpf_mov R0 = R2 + bpf_exit + +will be rejected, since R2 is unreadable at the start of the program. + +After kernel function call, R1-R5 are reset to unreadable and +R0 has a return type of the function. + +Since R6-R9 are callee saved, their state is preserved across the call. + +:: + + bpf_mov R6 = 1 + bpf_call foo + bpf_mov R0 = R6 + bpf_exit + +is a correct program. If there was R1 instead of R6, it would have +been rejected. + +load/store instructions are allowed only with registers of valid types, which +are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. +For example:: + + bpf_mov R1 = 1 + bpf_mov R2 = 2 + bpf_xadd *(u32 *)(R1 + 3) += R2 + bpf_exit + +will be rejected, since R1 doesn't have a valid pointer type at the time of +execution of instruction bpf_xadd. + +At the start R1 type is PTR_TO_CTX (a pointer to generic ``struct bpf_context``) +A callback is used to customize verifier to restrict eBPF program access to only +certain fields within ctx structure with specified size and alignment. + +For example, the following insn:: + + bpf_ld R0 = *(u32 *)(R6 + 8) + +intends to load a word from address R6 + 8 and store it into R0 +If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know +that offset 8 of size 4 bytes can be accessed for reading, otherwise +the verifier will reject the program. +If R6=PTR_TO_STACK, then access should be aligned and be within +stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, +so it will fail verification, since it's out of bounds. + +The verifier will allow eBPF program to read data from stack only after +it wrote into it. 
+ +Classic BPF verifier does similar check with M[0-15] memory slots. +For example:: + + bpf_ld R0 = *(u32 *)(R10 - 4) + bpf_exit + +is invalid program. +Though R10 is correct read-only register and has type PTR_TO_STACK +and R10 - 4 is within stack bounds, there were no stores into that location. + +Pointer register spill/fill is tracked as well, since four (R6-R9) +callee saved registers may not be enough for some programs. + +Allowed function calls are customized with bpf_verifier_ops->get_func_proto() +The eBPF verifier will check that registers match argument constraints. +After the call register R0 will be set to return type of the function. + +Function calls are the main mechanism to extend functionality of eBPF programs. +Socket filters may let programs call one set of functions, whereas tracing +filters may allow completely different set. + +If a function is made accessible to an eBPF program, it needs to be thought through +from safety point of view. The verifier will guarantee that the function is +called with valid arguments. + +seccomp vs socket filters have different security restrictions for classic BPF. +Seccomp solves this by two stage verifier: classic BPF verifier is followed +by seccomp verifier. In case of eBPF one configurable verifier is shared for +all use cases. + +See details of eBPF verifier in kernel/bpf/verifier.c + +Register value tracking +----------------------- +In order to determine the safety of an eBPF program, the verifier must track +the range of possible values in each register and also in each stack slot. +This is done with ``struct bpf_reg_state``, defined in include/linux/ +bpf_verifier.h, which unifies tracking of scalar and pointer values. Each +register state has a type, which is either NOT_INIT (the register has not been +written to), SCALAR_VALUE (some value which is not usable as a pointer), or a +pointer type. The types of pointers describe their base, as follows: + + + PTR_TO_CTX + Pointer to bpf_context.
+ CONST_PTR_TO_MAP + Pointer to struct bpf_map. "Const" because arithmetic + on these pointers is forbidden. + PTR_TO_MAP_VALUE + Pointer to the value stored in a map element. + PTR_TO_MAP_VALUE_OR_NULL + Either a pointer to a map value, or NULL; map accesses + (see section 'eBPF maps', below) return this type, + which becomes a PTR_TO_MAP_VALUE when checked != NULL. + Arithmetic on these pointers is forbidden. + PTR_TO_STACK + Frame pointer. + PTR_TO_PACKET + skb->data. + PTR_TO_PACKET_END + skb->data + headlen; arithmetic forbidden. + PTR_TO_SOCKET + Pointer to struct bpf_sock_ops, implicitly refcounted. + PTR_TO_SOCKET_OR_NULL + Either a pointer to a socket, or NULL; socket lookup + returns this type, which becomes a PTR_TO_SOCKET when + checked != NULL. PTR_TO_SOCKET is reference-counted, + so programs must release the reference through the + socket release function before the end of the program. + Arithmetic on these pointers is forbidden. + +However, a pointer may be offset from this base (as a result of pointer +arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable +offset'. The former is used when an exactly-known value (e.g. an immediate +operand) is added to a pointer, while the latter is used for values which are +not exactly known. The variable offset is also used in SCALAR_VALUEs, to track +the range of possible values in the register. + +The verifier's knowledge about the variable offset consists of: + +* minimum and maximum values as unsigned +* minimum and maximum values as signed + +* knowledge of the values of individual bits, in the form of a 'tnum': a u64 + 'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; + 1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both + mask and value; no bit should ever be 1 in both. 
For example, if a byte is read + into a register from memory, the register's top 56 bits are known zero, while + the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we + then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; + 0x1ff), because of potential carries. + +Besides arithmetic, the register state can also be updated by conditional +branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch +it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false' +branch it will have a umax_value of 8. A signed compare (with BPF_JSGT or +BPF_JSGE) would instead update the signed minimum/maximum values. Information +from the signed and unsigned bounds can be combined; for instance if a value is +first tested < 8 and then tested s> 4, the verifier will conclude that the value +is also > 4 and s< 8, since the bounds prevent crossing the sign boundary. + +PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all +pointers sharing that same variable offset. This is important for packet range +checks: after adding a variable to a packet pointer register A, if you then copy +it to another register B and then add a constant 4 to A, both registers will +share the same 'id' but the A will have a fixed offset of +4. Then if A is +bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is +now known to have a safe range of at least 4 bytes. See 'Direct packet access', +below, for more on PTR_TO_PACKET ranges. + +The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of +the pointer returned from a map lookup. This means that when one copy is +checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs. +As well as range-checking, the tracked information is also used for enforcing +alignment of pointer accesses. For instance, on most systems the packet pointer +is 2 bytes after a 4-byte alignment. 
If a program adds 14 bytes to that to jump +over the Ethernet header, then reads IHL and adds (IHL * 4), the resulting +pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 +bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through +that pointer are safe. +The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common +to all copies of the pointer returned from a socket lookup. This has similar +behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but +it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly +represents a reference to the corresponding ``struct sock``. To ensure that the +reference is not leaked, it is imperative to NULL-check the reference and, in +the non-NULL case, pass the valid reference to the socket release function. + +Direct packet access +-------------------- +In cls_bpf and act_bpf programs the verifier allows direct access to the packet +data via skb->data and skb->data_end pointers. +Ex:: + + 1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ + 2: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 3: r5 = r3 + 4: r5 += 14 + 5: if r5 > r4 goto pc+16 + R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ + +this 2byte load from the packet is safe to do, since the program author +did check ``if (skb->data + 14 > skb->data_end) goto err`` at insn #5 which +means that in the fall-through case the register R3 (which points to skb->data) +has at least 14 directly accessible bytes. The verifier marks it +as R3=pkt(id=0,off=0,r=14). +id=0 means that no additional variables were added to the register. +off=0 means that no additional constants were added. +r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok. +Note that R5 is marked as R5=pkt(id=0,off=14,r=14).
It also points +to the packet data, but constant 14 was added to the register, so +it now points to ``skb->data + 14`` and accessible range is [R5, R5 + 14 - 14) +which is zero bytes. + +More complex packet access may look like:: + + + R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ + 7: r4 = *(u8 *)(r3 +12) + 8: r4 *= 14 + 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 10: r3 += r4 + 11: r2 = r1 + 12: r2 <<= 48 + 13: r2 >>= 48 + 14: r3 += r2 + 15: r2 = r3 + 16: r2 += 8 + 17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ + 18: if r2 > r1 goto pc+2 + R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp + 19: r1 = *(u8 *)(r3 +4) + +The state of the register R3 is R3=pkt(id=2,off=0,r=8) +id=2 means that two ``r3 += rX`` instructions were seen, so r3 points to some +offset within a packet and since the program author did +``if (r3 + 8 > r1) goto err`` at insn #18, the safe range is [R3, R3 + 8). +The verifier only allows 'add'/'sub' operations on packet registers. Any other +operation will set the register state to 'SCALAR_VALUE' and it won't be +available for direct packet access. + +Operation ``r3 += rX`` may overflow and become less than original skb->data, +therefore the verifier has to prevent that. So when it sees ``r3 += rX`` +instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 +against skb->data_end will not give us 'range' information, so attempts to read +through the pointer will give "invalid access to packet" error. + +Ex. after insn ``r4 = *(u8 *)(r3 +12)`` (insn #7 above) the state of r4 is +R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits +of the register are guaranteed to be zero, and nothing is known about the lower +8 bits. 
After insn ``r4 *= 14`` the state becomes +R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit +value by constant 14 will keep upper 52 bits as zero, also the least significant +bit will be zero as 14 is even. Similarly ``r2 >>= 48`` will make +R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign +extending. This logic is implemented in adjust_reg_min_max_vals() function, +which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice +versa) and adjust_scalar_min_max_vals() for operations on two scalars. + +The end result is that bpf program author can access packet directly +using normal C code as:: + + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + struct eth_hdr *eth = data; + struct iphdr *iph = data + sizeof(*eth); + struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); + + if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) + return 0; + if (eth->h_proto != htons(ETH_P_IP)) + return 0; + if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) + return 0; + if (udp->dest == 53 || udp->source == 9) + ...; + +which makes such programs easier to write compared to LD_ABS insn +and significantly faster. + +eBPF maps +--------- +'maps' is a generic storage of different types for sharing data between kernel +and userspace.
+ +The maps are accessed from user space via BPF syscall, which has commands: + +- create a map with given type and attributes + ``map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)`` + using attr->map_type, attr->key_size, attr->value_size, attr->max_entries + returns process-local file descriptor or negative error + +- lookup key in a given map + ``err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key, attr->value + returns zero and stores found elem into value or negative error + +- create or update key/value pair in a given map + ``err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key, attr->value + returns zero or negative error + +- find and delete element by key in a given map + ``err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key + +- to delete map: close(fd) + Exiting process will delete maps automatically + +userspace programs use this syscall to create/access maps that eBPF programs +are concurrently updating. + +maps can have different types: hash, array, bloom filter, radix-tree, etc. + +The map is defined by: + + - type + - max number of elements + - key size in bytes + - value size in bytes + +Pruning +------- +The verifier does not actually walk all possible paths through the program. For +each new branch to analyse, the verifier looks at all the states it's previously +been in when at this instruction. If any of them contain the current state as a +subset, the branch is 'pruned' - that is, the fact that the previous state was +accepted implies the current state would be as well. For instance, if in the +previous state, r1 held a packet-pointer, and in the current state, r1 holds a +packet-pointer with a range as long or longer and at least as strict an +alignment, then r1 is safe. 
Similarly, if r2 was NOT_INIT before then it can't +have been used by any path from that point, so any value in r2 (including +another NOT_INIT) is safe. The implementation is in the function regsafe(). +Pruning considers not only the registers but also the stack (and any spilled +registers it may hold). They must all be safe for the branch to be pruned. +This is implemented in states_equal(). + +Understanding eBPF verifier messages +------------------------------------ + +The following are a few examples of invalid eBPF programs and verifier error +messages as seen in the log: + +Program with unreachable instructions:: + + static struct bpf_insn prog[] = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }; + +Error:: + + unreachable insn 1 + +Program that reads uninitialized register:: + + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r0 = r2 + R2 !read_ok + +Program that doesn't initialize R0 before exiting:: + + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r2 = r1 + 1: (95) exit + R0 !read_ok + +Program that accesses stack out of bounds:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into function:: + + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r2 = r10 + 1: (07) r2 += -8 + 2: (b7) r1 = 0x0 + 3: (85) call 1 + invalid indirect read from stack off -8+0 size 8 + +Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), +
BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + fd 0 is not pointing to valid bpf_map + +Program that doesn't check return value of map_lookup_elem() before accessing +map element:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + 5: (7a) *(u64 *)(r0 +0) = 0 + R0 invalid mem access 'map_value_or_null' + +Program that correctly checks map_lookup_elem() returned value for NULL, but +accesses the memory with incorrect alignment:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+1 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +4) = 0 + misaligned access off 4 size 8 + +Program that correctly checks map_lookup_elem() returned value for NULL and +accesses memory with correct alignment in one side of 'if' branch, but fails +to do so in the other side of 'if' branch:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + 
BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+2 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +0) = 0 + 7: (95) exit + + from 5 to 8: R0=imm0 R10=fp + 8: (7a) *(u64 *)(r0 +0) = 1 + R0 invalid mem access 'imm' + +Program that performs a socket lookup then sets the pointer to NULL without +checking it:: + + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (b7) r0 = 0 + 9: (95) exit + Unreleased reference id=1, alloc_insn=7 + +Program that performs a socket lookup but does not NULL-check the returned +value:: + + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_EXIT_INSN(), + +Error:: + + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (95) exit + Unreleased reference id=1, alloc_insn=7 + +Testing +------- + +Next to the BPF toolchain, the kernel also ships a test module that contains +various test cases for classic and internal BPF that can be executed against +the BPF interpreter and JIT compiler. 
It can be found in lib/test_bpf.c and +enabled via Kconfig:: + + CONFIG_TEST_BPF=m + +After the module has been built and installed, the test suite can be executed +via insmod or modprobe against 'test_bpf' module. Results of the test cases +including timings in nsec can be found in the kernel log (dmesg). + +Misc +---- + +Also trinity, the Linux syscall fuzzer, has built-in support for BPF and +SECCOMP-BPF kernel fuzzing. + +Written by +---------- + +The document was written in the hope that it is found useful and in order +to give potential BPF hackers or security auditors a better overview of +the underlying architecture. + +- Jay Schulist +- Daniel Borkmann +- Alexei Starovoitov diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt deleted file mode 100644 index 2f0f8b17dade..000000000000 --- a/Documentation/networking/filter.txt +++ /dev/null @@ -1,1545 +0,0 @@ -Linux Socket Filtering aka Berkeley Packet Filter (BPF) -======================================================= - -Introduction ------------- - -Linux Socket Filtering (LSF) is derived from the Berkeley Packet Filter. -Though there are some distinct differences between the BSD and Linux -Kernel filtering, but when we speak of BPF or LSF in Linux context, we -mean the very same mechanism of filtering in the Linux kernel. - -BPF allows a user-space program to attach a filter onto any socket and -allow or disallow certain types of data to come through the socket. LSF -follows exactly the same filter code structure as BSD's BPF, so referring -to the BSD bpf.4 manpage is very helpful in creating filters. - -On Linux, BPF is much simpler than on BSD. One does not have to worry -about devices or anything like that. You simply create your filter code, -send it to the kernel via the SO_ATTACH_FILTER option and if your filter -code passes the kernel check on it, you then immediately begin filtering -data on that socket. 
- -You can also detach filters from your socket via the SO_DETACH_FILTER -option. This will probably not be used much since when you close a socket -that has a filter on it the filter is automagically removed. The other -less common case may be adding a different filter on the same socket where -you had another filter that is still running: the kernel takes care of -removing the old one and placing your new one in its place, assuming your -filter has passed the checks, otherwise if it fails the old filter will -remain on that socket. - -SO_LOCK_FILTER option allows to lock the filter attached to a socket. Once -set, a filter cannot be removed or changed. This allows one process to -setup a socket, attach a filter, lock it then drop privileges and be -assured that the filter will be kept until the socket is closed. - -The biggest user of this construct might be libpcap. Issuing a high-level -filter command like `tcpdump -i em1 port 22` passes through the libpcap -internal compiler that generates a structure that can eventually be loaded -via SO_ATTACH_FILTER to the kernel. `tcpdump -i em1 port 22 -ddd` -displays what is being placed into this structure. - -Although we were only speaking about sockets here, BPF in Linux is used -in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel -qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places -such as team driver, PTP code, etc where BPF is being used. - - [1] Documentation/userspace-api/seccomp_filter.rst - -Original BPF paper: - -Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new -architecture for user-level packet capture. In Proceedings of the -USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993 -Conference Proceedings (USENIX'93). USENIX Association, Berkeley, -CA, USA, 2-2. 
[http://www.tcpdump.org/papers/bpf-usenix93.pdf] - -Structure ---------- - -User space applications include <linux/filter.h> which contains the -following relevant structures: - -struct sock_filter { /* Filter block */ - __u16 code; /* Actual filter code */ - __u8 jt; /* Jump true */ - __u8 jf; /* Jump false */ - __u32 k; /* Generic multiuse field */ -}; - -Such a structure is assembled as an array of 4-tuples, that contains -a code, jt, jf and k value. jt and jf are jump offsets and k a generic -value to be used for a provided code. - -struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ - unsigned short len; /* Number of filter blocks */ - struct sock_filter __user *filter; -}; - -For socket filtering, a pointer to this structure (as shown in -follow-up example) is being passed to the kernel through setsockopt(2). - -Example -------- - -#include <sys/socket.h> -#include <sys/types.h> -#include <arpa/inet.h> -#include <linux/if_ether.h> -/* ... */ - -/* From the example above: tcpdump -i em1 port 22 -dd */ -struct sock_filter code[] = { - { 0x28, 0, 0, 0x0000000c }, - { 0x15, 0, 8, 0x000086dd }, - { 0x30, 0, 0, 0x00000014 }, - { 0x15, 2, 0, 0x00000084 }, - { 0x15, 1, 0, 0x00000006 }, - { 0x15, 0, 17, 0x00000011 }, - { 0x28, 0, 0, 0x00000036 }, - { 0x15, 14, 0, 0x00000016 }, - { 0x28, 0, 0, 0x00000038 }, - { 0x15, 12, 13, 0x00000016 }, - { 0x15, 0, 12, 0x00000800 }, - { 0x30, 0, 0, 0x00000017 }, - { 0x15, 2, 0, 0x00000084 }, - { 0x15, 1, 0, 0x00000006 }, - { 0x15, 0, 8, 0x00000011 }, - { 0x28, 0, 0, 0x00000014 }, - { 0x45, 6, 0, 0x00001fff }, - { 0xb1, 0, 0, 0x0000000e }, - { 0x48, 0, 0, 0x0000000e }, - { 0x15, 2, 0, 0x00000016 }, - { 0x48, 0, 0, 0x00000010 }, - { 0x15, 0, 1, 0x00000016 }, - { 0x06, 0, 0, 0x0000ffff }, - { 0x06, 0, 0, 0x00000000 }, -}; - -struct sock_fprog bpf = { - .len = ARRAY_SIZE(code), - .filter = code, -}; - -sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); -if (sock < 0) - /* ... bail out ... */ - -ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)); -if (ret < 0) - /* ... bail out ...
*/ - -/* ... */ -close(sock); - -The above example code attaches a socket filter for a PF_PACKET socket -in order to let all IPv4/IPv6 packets with port 22 pass. The rest will -be dropped for this socket. - -The setsockopt(2) call to SO_DETACH_FILTER doesn't need any arguments -and SO_LOCK_FILTER for preventing the filter to be detached, takes an -integer value with 0 or 1. - -Note that socket filters are not restricted to PF_PACKET sockets only, -but can also be used on other socket families. - -Summary of system calls: - - * setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val)); - * setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)); - * setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &val, sizeof(val)); - -Normally, most use cases for socket filtering on packet sockets will be -covered by libpcap in high-level syntax, so as an application developer -you should stick to that. libpcap wraps its own layer around all that. - -Unless i) using/linking to libpcap is not an option, ii) the required BPF -filters use Linux extensions that are not supported by libpcap's compiler, -iii) a filter might be more complex and not cleanly implementable with -libpcap's compiler, or iv) particular filter codes should be optimized -differently than libpcap's internal compiler does; then in such cases -writing such a filter "by hand" can be of an alternative. For example, -xt_bpf and cls_bpf users might have requirements that could result in -more complex filter code, or one that cannot be expressed with libpcap -(e.g. different return codes for various code paths). Moreover, BPF JIT -implementors may wish to manually write test cases and thus need low-level -access to BPF code as well. - -BPF engine and instruction set ------------------------------- - -Under tools/bpf/ there's a small helper tool called bpf_asm which can -be used to write low-level filters for example scenarios mentioned in the -previous section. 
Asm-like syntax mentioned here has been implemented in -bpf_asm and will be used for further explanations (instead of dealing with -less readable opcodes directly, principles are the same). The syntax is -closely modelled after Steven McCanne's and Van Jacobson's BPF paper. - -The BPF architecture consists of the following basic elements: - - Element Description - - A 32 bit wide accumulator - X 32 bit wide X register - M[] 16 x 32 bit wide misc registers aka "scratch memory - store", addressable from 0 to 15 - -A program, that is translated by bpf_asm into "opcodes" is an array that -consists of the following elements (as already mentioned): - - op:16, jt:8, jf:8, k:32 - -The element op is a 16 bit wide opcode that has a particular instruction -encoded. jt and jf are two 8 bit wide jump targets, one for condition -"jump if true", the other one "jump if false". Eventually, element k -contains a miscellaneous argument that can be interpreted in different -ways depending on the given instruction in op. - -The instruction set consists of load, store, branch, alu, miscellaneous -and return instructions that are also represented in bpf_asm syntax. This -table lists all bpf_asm instructions available resp. 
what their underlying -opcodes as defined in linux/filter.h stand for: - - Instruction Addressing mode Description - - ld 1, 2, 3, 4, 12 Load word into A - ldi 4 Load word into A - ldh 1, 2 Load half-word into A - ldb 1, 2 Load byte into A - ldx 3, 4, 5, 12 Load word into X - ldxi 4 Load word into X - ldxb 5 Load byte into X - - st 3 Store A into M[] - stx 3 Store X into M[] - - jmp 6 Jump to label - ja 6 Jump to label - jeq 7, 8, 9, 10 Jump on A == <x> - jneq 9, 10 Jump on A != <x> - jne 9, 10 Jump on A != <x> - jlt 9, 10 Jump on A < <x> - jle 9, 10 Jump on A <= <x> - jgt 7, 8, 9, 10 Jump on A > <x> - jge 7, 8, 9, 10 Jump on A >= <x> - jset 7, 8, 9, 10 Jump on A & <x> - - add 0, 4 A + <x> - sub 0, 4 A - <x> - mul 0, 4 A * <x> - div 0, 4 A / <x> - mod 0, 4 A % <x> - neg !A - and 0, 4 A & <x> - or 0, 4 A | <x> - xor 0, 4 A ^ <x> - lsh 0, 4 A << <x> - rsh 0, 4 A >> <x> - - tax Copy A into X - txa Copy X into A - - ret 4, 11 Return - -The next table shows addressing formats from the 2nd column: - - Addressing mode Syntax Description - - 0 x/%x Register X - 1 [k] BHW at byte offset k in the packet - 2 [x + k] BHW at the offset X + k in the packet - 3 M[k] Word at offset k in M[] - 4 #k Literal value stored in k - 5 4*([k]&0xf) Lower nibble * 4 at byte offset k in the packet - 6 L Jump label L - 7 #k,Lt,Lf Jump to Lt if true, otherwise jump to Lf - 8 x/%x,Lt,Lf Jump to Lt if true, otherwise jump to Lf - 9 #k,Lt Jump to Lt if predicate is true - 10 x/%x,Lt Jump to Lt if predicate is true - 11 a/%a Accumulator A - 12 extension BPF extension - -The Linux kernel also has a couple of BPF extensions that are used along -with the class of load instructions by "overloading" the k argument with -a negative offset + a particular extension offset. The result of such BPF -extensions is loaded into A. 
- -Possible BPF extensions are shown in the following table: - - Extension Description - - len skb->len - proto skb->protocol - type skb->pkt_type - poff Payload start offset - ifidx skb->dev->ifindex - nla Netlink attribute of type X with offset A - nlan Nested Netlink attribute of type X with offset A - mark skb->mark - queue skb->queue_mapping - hatype skb->dev->type - rxhash skb->hash - cpu raw_smp_processor_id() - vlan_tci skb_vlan_tag_get(skb) - vlan_avail skb_vlan_tag_present(skb) - vlan_tpid skb->vlan_proto - rand prandom_u32() - -These extensions can also be prefixed with '#'. -Examples for low-level BPF: - -** ARP packets: - - ldh [12] - jne #0x806, drop - ret #-1 - drop: ret #0 - -** IPv4 TCP packets: - - ldh [12] - jne #0x800, drop - ldb [23] - jneq #6, drop - ret #-1 - drop: ret #0 - -** (Accelerated) VLAN w/ id 10: - - ld vlan_tci - jneq #10, drop - ret #-1 - drop: ret #0 - -** icmp random packet sampling, 1 in 4 - ldh [12] - jne #0x800, drop - ldb [23] - jneq #1, drop - # get a random uint32 number - ld rand - mod #4 - jneq #1, drop - ret #-1 - drop: ret #0 - -** SECCOMP filter example: - - ld [4] /* offsetof(struct seccomp_data, arch) */ - jne #0xc000003e, bad /* AUDIT_ARCH_X86_64 */ - ld [0] /* offsetof(struct seccomp_data, nr) */ - jeq #15, good /* __NR_rt_sigreturn */ - jeq #231, good /* __NR_exit_group */ - jeq #60, good /* __NR_exit */ - jeq #0, good /* __NR_read */ - jeq #1, good /* __NR_write */ - jeq #5, good /* __NR_fstat */ - jeq #9, good /* __NR_mmap */ - jeq #14, good /* __NR_rt_sigprocmask */ - jeq #13, good /* __NR_rt_sigaction */ - jeq #35, good /* __NR_nanosleep */ - bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ - good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ - -The above example code can be placed into a file (here called "foo"), and -then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf -and cls_bpf understands and can directly be loaded with. 
Example with above -ARP code: - -$ ./bpf_asm foo -4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0, - -In copy and paste C-like output: - -$ ./bpf_asm -c foo -{ 0x28, 0, 0, 0x0000000c }, -{ 0x15, 0, 1, 0x00000806 }, -{ 0x06, 0, 0, 0xffffffff }, -{ 0x06, 0, 0, 0000000000 }, - -In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF -filters that might not be obvious at first, it's good to test filters before -attaching to a live system. For that purpose, there's a small tool called -bpf_dbg under tools/bpf/ in the kernel source directory. This debugger allows -for testing BPF filters against given pcap files, single stepping through the -BPF code on the pcap's packets and to do BPF machine register dumps. - -Starting bpf_dbg is trivial and just requires issuing: - -# ./bpf_dbg - -In case input and output do not equal stdin/stdout, bpf_dbg takes an -alternative stdin source as a first argument, and an alternative stdout -sink as a second one, e.g. `./bpf_dbg test_in.txt test_out.txt`. - -Other than that, a particular libreadline configuration can be set via -file "~/.bpf_dbg_init" and the command history is stored in the file -"~/.bpf_dbg_history". - -Interaction in bpf_dbg happens through a shell that also has auto-completion -support (follow-up example commands starting with '>' denote bpf_dbg shell). -The usual workflow would be to ... - -> load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 - Loads a BPF filter from standard output of bpf_asm, or transformed via - e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT - debugging (next section), this command creates a temporary socket and - loads the BPF code into the kernel. Thus, this will also be useful for - JIT developers. - -> load pcap foo.pcap - Loads standard tcpdump pcap file. - -> run [<n>] -bpf passes:1 fails:9 - Runs through all packets from a pcap to account how many passes and fails - the filter will generate. 
A limit of packets to traverse can be given. - -> disassemble -l0: ldh [12] -l1: jeq #0x800, l2, l5 -l2: ldb [23] -l3: jeq #0x1, l4, l5 -l4: ret #0xffff -l5: ret #0 - Prints out BPF code disassembly. - -> dump -/* { op, jt, jf, k }, */ -{ 0x28, 0, 0, 0x0000000c }, -{ 0x15, 0, 3, 0x00000800 }, -{ 0x30, 0, 0, 0x00000017 }, -{ 0x15, 0, 1, 0x00000001 }, -{ 0x06, 0, 0, 0x0000ffff }, -{ 0x06, 0, 0, 0000000000 }, - Prints out C-style BPF code dump. - -> breakpoint 0 -breakpoint at: l0: ldh [12] -> breakpoint 1 -breakpoint at: l1: jeq #0x800, l2, l5 - ... - Sets breakpoints at particular BPF instructions. Issuing a `run` command - will walk through the pcap file continuing from the current packet and - break when a breakpoint is being hit (another `run` will continue from - the currently active breakpoint executing next instructions): - - > run - -- register dump -- - pc: [0] <-- program counter - code: [40] jt[0] jf[0] k[12] <-- plain BPF code of current instruction - curr: l0: ldh [12] <-- disassembly of current instruction - A: [00000000][0] <-- content of A (hex, decimal) - X: [00000000][0] <-- content of X (hex, decimal) - M[0,15]: [00000000][0] <-- folded content of M (hex, decimal) - -- packet dump -- <-- Current packet from pcap (hex) - len: 42 - 0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 - 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 - 32: 00 00 00 00 00 00 0a 3b 01 01 - (breakpoint) - > - -> breakpoint -breakpoints: 0 1 - Prints currently set breakpoints. - -> step [-<n>, +<n>] - Performs single stepping through the BPF program from the current pc - offset. Thus, on each step invocation, above register dump is issued. - This can go forwards and backwards in time, a plain `step` will break - on the next BPF instruction, thus +1. (No `run` needs to be issued here.) - -> select <n> - Selects a given packet from the pcap file to continue from. Thus, on - the next `run` or `step`, the BPF program is being evaluated against - the user pre-selected packet. 
Numbering starts just as in Wireshark - with index 1. - -> quit -# - Exits bpf_dbg. - -JIT compiler ------------- - -The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, -PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through -CONFIG_BPF_JIT. The JIT compiler is transparently invoked for each -attached filter from user space or for internal kernel users if it has -been previously enabled by root: - - echo 1 > /proc/sys/net/core/bpf_jit_enable - -For JIT developers, doing audits etc, each compile run can output the generated -opcode image into the kernel log via: - - echo 2 > /proc/sys/net/core/bpf_jit_enable - -Example output from dmesg: - -[ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f -[ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68 -[ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00 -[ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00 -[ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 -[ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 - -When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and -setting any other value than that will result in failure. This is even the case for -setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log -is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the -generally recommended approach instead. 
- -In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for -generating disassembly out of the kernel log's hexdump: - -# ./bpf_jit_disasm -70 bytes emitted from JIT compiler (pass:3, flen:6) -ffffffffa0069c8f + : - 0: push %rbp - 1: mov %rsp,%rbp - 4: sub $0x60,%rsp - 8: mov %rbx,-0x8(%rbp) - c: mov 0x68(%rdi),%r9d - 10: sub 0x6c(%rdi),%r9d - 14: mov 0xd8(%rdi),%r8 - 1b: mov $0xc,%esi - 20: callq 0xffffffffe0ff9442 - 25: cmp $0x800,%eax - 2a: jne 0x0000000000000042 - 2c: mov $0x17,%esi - 31: callq 0xffffffffe0ff945e - 36: cmp $0x1,%eax - 39: jne 0x0000000000000042 - 3b: mov $0xffff,%eax - 40: jmp 0x0000000000000044 - 42: xor %eax,%eax - 44: leaveq - 45: retq - -Issuing option `-o` will "annotate" opcodes to resulting assembler -instructions, which can be very useful for JIT developers: - -# ./bpf_jit_disasm -o -70 bytes emitted from JIT compiler (pass:3, flen:6) -ffffffffa0069c8f + : - 0: push %rbp - 55 - 1: mov %rsp,%rbp - 48 89 e5 - 4: sub $0x60,%rsp - 48 83 ec 60 - 8: mov %rbx,-0x8(%rbp) - 48 89 5d f8 - c: mov 0x68(%rdi),%r9d - 44 8b 4f 68 - 10: sub 0x6c(%rdi),%r9d - 44 2b 4f 6c - 14: mov 0xd8(%rdi),%r8 - 4c 8b 87 d8 00 00 00 - 1b: mov $0xc,%esi - be 0c 00 00 00 - 20: callq 0xffffffffe0ff9442 - e8 1d 94 ff e0 - 25: cmp $0x800,%eax - 3d 00 08 00 00 - 2a: jne 0x0000000000000042 - 75 16 - 2c: mov $0x17,%esi - be 17 00 00 00 - 31: callq 0xffffffffe0ff945e - e8 28 94 ff e0 - 36: cmp $0x1,%eax - 83 f8 01 - 39: jne 0x0000000000000042 - 75 07 - 3b: mov $0xffff,%eax - b8 ff ff 00 00 - 40: jmp 0x0000000000000044 - eb 02 - 42: xor %eax,%eax - 31 c0 - 44: leaveq - c9 - 45: retq - c3 - -For BPF JIT developers, bpf_jit_disasm, bpf_asm and bpf_dbg provides a useful -toolchain for developing and testing the kernel's JIT compiler. - -BPF kernel internals --------------------- -Internally, for the kernel interpreter, a different instruction set -format with similar underlying principles from BPF described in previous -paragraphs is being used. 
However, the instruction set format is modelled -closer to the underlying architecture to mimic native instruction sets, so -that a better performance can be achieved (more details later). This new -ISA is called 'eBPF' or 'internal BPF' interchangeably. (Note: eBPF which -originates from [e]xtended BPF is not the same as BPF extensions! While -eBPF is an ISA, BPF extensions date back to classic BPF's 'overloading' -of BPF_LD | BPF_{B,H,W} | BPF_ABS instruction.) - -It is designed to be JITed with one to one mapping, which can also open up -the possibility for GCC/LLVM compilers to generate optimized eBPF code through -an eBPF backend that performs almost as fast as natively compiled code. - -The new instruction set was originally designed with the possible goal in -mind to write programs in "restricted C" and compile into eBPF with a optional -GCC/LLVM backend, so that it can just-in-time map to modern 64-bit CPUs with -minimal performance overhead over two steps, that is, C -> eBPF -> native code. - -Currently, the new format is being used for running user BPF programs, which -includes seccomp BPF, classic socket filters, cls_bpf traffic classifier, -team driver's classifier for its load-balancing mode, netfilter's xt_bpf -extension, PTP dissector/classifier, and much more. They are all internally -converted by the kernel into the new instruction set representation and run -in the eBPF interpreter. For in-kernel handlers, this all works transparently -by using bpf_prog_create() for setting up the filter, resp. -bpf_prog_destroy() for destroying it. The macro -BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed -code to run the filter. 'filter' is a pointer to struct bpf_prog that we -got from bpf_prog_create(), and 'ctx' the given context (e.g. -skb pointer). All constraints and restrictions from bpf_check_classic() apply -before a conversion to the new layout is being done behind the scenes! 
- -Currently, the classic BPF format is being used for JITing on most -32-bit architectures, whereas x86-64, aarch64, s390x, powerpc64, -sparc64, arm32, riscv64, riscv32 perform JIT compilation from eBPF -instruction set. - -Some core changes of the new internal format: - -- Number of registers increase from 2 to 10: - - The old format had two registers A and X, and a hidden frame pointer. The - new layout extends this to be 10 internal registers and a read-only frame - pointer. Since 64-bit CPUs are passing arguments to functions via registers - the number of args from eBPF program to in-kernel function is restricted - to 5 and one register is used to accept return value from an in-kernel - function. Natively, x86_64 passes first 6 arguments in registers, aarch64/ - sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved - registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers. - - Therefore, eBPF calling convention is defined as: - - * R0 - return value from in-kernel function, and exit value for eBPF program - * R1 - R5 - arguments from eBPF program to in-kernel function - * R6 - R9 - callee saved registers that in-kernel function will preserve - * R10 - read-only frame pointer to access stack - - Thus, all eBPF registers map one to one to HW registers on x86_64, aarch64, - etc, and eBPF calling convention maps directly to ABIs used by the kernel on - 64-bit architectures. - - On 32-bit architectures JIT may map programs that use only 32-bit arithmetic - and may let more complex programs to be interpreted. - - R0 - R5 are scratch registers and eBPF program needs spill/fill them if - necessary across calls. Note that there is only one eBPF program (== one - eBPF main routine) and it cannot call other eBPF functions, it can only - call predefined in-kernel functions, though. 
- -- Register width increases from 32-bit to 64-bit: - - Still, the semantics of the original 32-bit ALU operations are preserved - via 32-bit subregisters. All eBPF registers are 64-bit with 32-bit lower - subregisters that zero-extend into 64-bit if they are being written to. - That behavior maps directly to x86_64 and arm64 subregister definition, but - makes other JITs more difficult. - - 32-bit architectures run 64-bit internal BPF programs via interpreter. - Their JITs may convert BPF programs that only use 32-bit subregisters into - native instruction set and let the rest being interpreted. - - Operation is 64-bit, because on 64-bit architectures, pointers are also - 64-bit wide, and we want to pass 64-bit values in/out of kernel functions, - so 32-bit eBPF registers would otherwise require to define register-pair - ABI, thus, there won't be able to use a direct eBPF register to HW register - mapping and JIT would need to do combine/split/move operations for every - register in and out of the function, which is complex, bug prone and slow. - Another reason is the use of atomic 64-bit counters. - -- Conditional jt/jf targets replaced with jt/fall-through: - - While the original design has constructs such as "if (cond) jump_true; - else jump_false;", they are being replaced into alternative constructs like - "if (cond) jump_true; /* else fall-through */". - -- Introduces bpf_call insn and register passing convention for zero overhead - calls from/to other kernel functions: - - Before an in-kernel function call, the internal BPF program needs to - place function arguments into R1 to R5 registers to satisfy calling - convention, then the interpreter will take them from registers and pass - to in-kernel function. If R1 - R5 registers are mapped to CPU registers - that are used for argument passing on given architecture, the JIT compiler - doesn't need to emit extra moves. 
Function arguments will be in the correct - registers and BPF_CALL instruction will be JITed as single 'call' HW - instruction. This calling convention was picked to cover common call - situations without performance penalty. - - After an in-kernel function call, R1 - R5 are reset to unreadable and R0 has - a return value of the function. Since R6 - R9 are callee saved, their state - is preserved across the call. - - For example, consider three C functions: - - u64 f1() { return (*_f2)(1); } - u64 f2(u64 a) { return f3(a + 1, a); } - u64 f3(u64 a, u64 b) { return a - b; } - - GCC can compile f1, f3 into x86_64: - - f1: - movl $1, %edi - movq _f2(%rip), %rax - jmp *%rax - f3: - movq %rdi, %rax - subq %rsi, %rax - ret - - Function f2 in eBPF may look like: - - f2: - bpf_mov R2, R1 - bpf_add R1, 1 - bpf_call f3 - bpf_exit - - If f2 is JITed and the pointer is stored to '_f2', the calls f1 -> f2 -> f3 and - returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to - be used to call into f2. - - For practical reasons all eBPF programs have only one argument 'ctx' which is - already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs - can call kernel functions with up to 5 arguments. Calls with 6 or more arguments - are currently not supported, but these restrictions can be lifted if necessary - in the future. - - On 64-bit architectures all registers map to HW registers one to one. For - example, x86_64 JIT compiler can map them as ... - - R0 - rax - R1 - rdi - R2 - rsi - R3 - rdx - R4 - rcx - R5 - r8 - R6 - rbx - R7 - r13 - R8 - r14 - R9 - r15 - R10 - rbp - - ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing - and rbx, r12 - r15 are callee saved. 
- - Then the following internal BPF pseudo-program: - - bpf_mov R6, R1 /* save ctx */ - bpf_mov R2, 2 - bpf_mov R3, 3 - bpf_mov R4, 4 - bpf_mov R5, 5 - bpf_call foo - bpf_mov R7, R0 /* save foo() return value */ - bpf_mov R1, R6 /* restore ctx for next call */ - bpf_mov R2, 6 - bpf_mov R3, 7 - bpf_mov R4, 8 - bpf_mov R5, 9 - bpf_call bar - bpf_add R0, R7 - bpf_exit - - After JIT to x86_64 may look like: - - push %rbp - mov %rsp,%rbp - sub $0x228,%rsp - mov %rbx,-0x228(%rbp) - mov %r13,-0x220(%rbp) - mov %rdi,%rbx - mov $0x2,%esi - mov $0x3,%edx - mov $0x4,%ecx - mov $0x5,%r8d - callq foo - mov %rax,%r13 - mov %rbx,%rdi - mov $0x6,%esi - mov $0x7,%edx - mov $0x8,%ecx - mov $0x9,%r8d - callq bar - add %r13,%rax - mov -0x228(%rbp),%rbx - mov -0x220(%rbp),%r13 - leaveq - retq - - Which is in this example equivalent in C to: - - u64 bpf_filter(u64 ctx) - { - return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); - } - - In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64 - arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper - registers and place their return value into '%rax' which is R0 in eBPF. - Prologue and epilogue are emitted by JIT and are implicit in the - interpreter. R0-R5 are scratch registers, so eBPF program needs to preserve - them across the calls as defined by calling convention. - - For example the following program is invalid: - - bpf_mov R1, 1 - bpf_call foo - bpf_mov R0, R1 - bpf_exit - - After the call the registers R1-R5 contain junk values and cannot be read. - An in-kernel eBPF verifier is used to validate internal BPF programs. - -Also in the new design, eBPF is limited to 4096 insns, which means that any -program will terminate quickly and will only call a fixed number of kernel -functions. Original BPF and the new format are two operand instructions, -which helps to do one-to-one mapping between eBPF insn and x86 insn during JIT. 
- -The input context pointer for invoking the interpreter function is generic, -its content is defined by a specific use case. For seccomp register R1 points -to seccomp_data, for converted BPF filters R1 points to a skb. - -A program, that is translated internally consists of the following elements: - - op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 - -So far 87 internal BPF instructions were implemented. 8-bit 'op' opcode field -has room for new instructions. Some of them may use 16/24/32 byte encoding. New -instructions must be multiple of 8 bytes to preserve backward compatibility. - -Internal BPF is a general purpose RISC instruction set. Not every register and -every instruction are used during translation from original BPF to new format. -For example, socket filters are not using 'exclusive add' instruction, but -tracing filters may do to maintain counters of events, for example. Register R9 -is not used by socket filters either, but more complex filters may be running -out of registers and would have to resort to spill/fill to stack. - -Internal BPF can be used as a generic assembler for last step performance -optimizations, socket filters and seccomp are using it as assembler. Tracing -filters may use it as assembler to generate code from kernel. In kernel usage -may not be bounded by security considerations, since generated internal BPF code -may be optimizing internal code path and not being exposed to the user space. -Safety of internal BPF can come from a verifier (TBD). In such use cases as -described, it may be used as safe instruction set. - -Just like the original BPF, the new format runs within a controlled environment, -is deterministic and the kernel can easily prove that. The safety of the program -can be determined in two steps: first step does depth-first-search to disallow -loops and other CFG validation; second step starts from the first insn and -descends all possible paths. 
It simulates execution of every insn and observes -the state change of registers and stack. - -eBPF opcode encoding --------------------- - -eBPF is reusing most of the opcode encoding from classic to simplify conversion -of classic BPF to eBPF. For arithmetic and jump instructions the 8-bit 'code' -field is divided into three parts: - - +----------------+--------+--------------------+ - | 4 bits | 1 bit | 3 bits | - | operation code | source | instruction class | - +----------------+--------+--------------------+ - (MSB) (LSB) - -Three LSB bits store instruction class which is one of: - - Classic BPF classes: eBPF classes: - - BPF_LD 0x00 BPF_LD 0x00 - BPF_LDX 0x01 BPF_LDX 0x01 - BPF_ST 0x02 BPF_ST 0x02 - BPF_STX 0x03 BPF_STX 0x03 - BPF_ALU 0x04 BPF_ALU 0x04 - BPF_JMP 0x05 BPF_JMP 0x05 - BPF_RET 0x06 BPF_JMP32 0x06 - BPF_MISC 0x07 BPF_ALU64 0x07 - -When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... - - BPF_K 0x00 - BPF_X 0x08 - - * in classic BPF, this means: - - BPF_SRC(code) == BPF_X - use register X as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand - - * in eBPF, this means: - - BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand - -... and four MSB bits store operation code. 
- -If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of: - - BPF_ADD 0x00 - BPF_SUB 0x10 - BPF_MUL 0x20 - BPF_DIV 0x30 - BPF_OR 0x40 - BPF_AND 0x50 - BPF_LSH 0x60 - BPF_RSH 0x70 - BPF_NEG 0x80 - BPF_MOD 0x90 - BPF_XOR 0xa0 - BPF_MOV 0xb0 /* eBPF only: mov reg to reg */ - BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ - BPF_END 0xd0 /* eBPF only: endianness conversion */ - -If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of: - - BPF_JA 0x00 /* BPF_JMP only */ - BPF_JEQ 0x10 - BPF_JGT 0x20 - BPF_JGE 0x30 - BPF_JSET 0x40 - BPF_JNE 0x50 /* eBPF only: jump != */ - BPF_JSGT 0x60 /* eBPF only: signed '>' */ - BPF_JSGE 0x70 /* eBPF only: signed '>=' */ - BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ - BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ - BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ - BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ - BPF_JSLT 0xc0 /* eBPF only: signed '<' */ - BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ - -So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF -and eBPF. There are only two registers in classic BPF, so it means A += X. -In eBPF it means dst_reg = (u32) dst_reg + (u32) src_reg; similarly, -BPF_XOR | BPF_K | BPF_ALU means A ^= imm32 in classic BPF and analogous -dst_reg = (u32) dst_reg ^ (u32) imm32 in eBPF. - -Classic BPF is using BPF_MISC class to represent A = X and X = A moves. -eBPF is using BPF_MOV | BPF_X | BPF_ALU code instead. Since there are no -BPF_MISC operations in eBPF, the class 7 is used as BPF_ALU64 to mean -exactly the same operations as BPF_ALU, but with 64-bit wide operands -instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.: -dst_reg = dst_reg + src_reg - -Classic BPF wastes the whole BPF_RET class to represent a single 'ret' -operation. Classic BPF_RET | BPF_K means copy imm32 into return register -and perform function exit. 
eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT -in eBPF means function exit only. The eBPF program needs to store return -value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as -BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide -operands for the comparisons instead. - -For load and store instructions the 8-bit 'code' field is divided as: - - +--------+--------+-------------------+ - | 3 bits | 2 bits | 3 bits | - | mode | size | instruction class | - +--------+--------+-------------------+ - (MSB) (LSB) - -Size modifier is one of ... - - BPF_W 0x00 /* word */ - BPF_H 0x08 /* half word */ - BPF_B 0x10 /* byte */ - BPF_DW 0x18 /* eBPF only, double word */ - -... which encodes size of load/store operation: - - B - 1 byte - H - 2 byte - W - 4 byte - DW - 8 byte (eBPF only) - -Mode modifier is one of: - - BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 - BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ - BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ - BPF_XADD 0xc0 /* eBPF only, exclusive add */ - -eBPF has two non-generic instructions: (BPF_ABS | <size> | BPF_LD) and -(BPF_IND | <size> | BPF_LD) which are used to access packet data. - -They had to be carried over from classic to have strong performance of -socket filters running in eBPF interpreter. These instructions can only -be used when interpreter context is a pointer to 'struct sk_buff' and -have seven implicit operands. Register R6 is an implicit input that must -contain pointer to sk_buff. Register R0 is an implicit output which contains -the data fetched from the packet. Registers R1-R5 are scratch registers -and must not be used to store the data across BPF_ABS | BPF_LD or -BPF_IND | BPF_LD instructions. - -These instructions have implicit program exit condition as well. 
When -eBPF program is trying to access the data beyond the packet boundary, -the interpreter will abort the execution of the program. JIT compilers -therefore must preserve this property. src_reg and imm32 fields are -explicit inputs to these instructions. - -For example: - - BPF_IND | BPF_W | BPF_LD means: - - R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) - and R1 - R5 were scratched. - -Unlike classic BPF instruction set, eBPF has generic load/store operations: - -BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg -BPF_MEM | <size> | BPF_ST: *(size *) (dst_reg + off) = imm32 -BPF_MEM | <size> | BPF_LDX: dst_reg = *(size *) (src_reg + off) -BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg -BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg - -Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and -2 byte atomic increments are not supported. - -eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists -of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single -instruction that loads 64-bit immediate value into a dst_reg. -Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads -32-bit immediate value into a register. - -eBPF verifier -------------- -The safety of the eBPF program is determined in two steps. - -First step does DAG check to disallow loops and other CFG validation. -In particular it will detect programs that have unreachable instructions. -(though classic BPF checker allows them) - -Second step starts from the first insn and descends all possible paths. -It simulates execution of every insn and observes the state change of -registers and stack. - -At the start of the program the register R1 contains a pointer to context -and has type PTR_TO_CTX. -If verifier sees an insn that does R2=R1, then R2 has now type -PTR_TO_CTX as well and can be used on the right hand side of expression. 
-If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE, -since addition of two valid pointers makes invalid pointer. -(In 'secure' mode verifier will reject any type of pointer arithmetic to make -sure that kernel addresses don't leak to unprivileged users) - -If register was never written to, it's not readable: - bpf_mov R0 = R2 - bpf_exit -will be rejected, since R2 is unreadable at the start of the program. - -After kernel function call, R1-R5 are reset to unreadable and -R0 has a return type of the function. - -Since R6-R9 are callee saved, their state is preserved across the call. - bpf_mov R6 = 1 - bpf_call foo - bpf_mov R0 = R6 - bpf_exit -is a correct program. If there was R1 instead of R6, it would have -been rejected. - -load/store instructions are allowed only with registers of valid types, which -are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. -For example: - bpf_mov R1 = 1 - bpf_mov R2 = 2 - bpf_xadd *(u32 *)(R1 + 3) += R2 - bpf_exit -will be rejected, since R1 doesn't have a valid pointer type at the time of -execution of instruction bpf_xadd. - -At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context') -A callback is used to customize verifier to restrict eBPF program access to only -certain fields within ctx structure with specified size and alignment. - -For example, the following insn: - bpf_ld R0 = *(u32 *)(R6 + 8) -intends to load a word from address R6 + 8 and store it into R0 -If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know -that offset 8 of size 4 bytes can be accessed for reading, otherwise -the verifier will reject the program. -If R6=PTR_TO_STACK, then access should be aligned and be within -stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, -so it will fail verification, since it's out of bounds. - -The verifier will allow eBPF program to read data from stack only after -it wrote into it. 
-Classic BPF verifier does similar check with M[0-15] memory slots. -For example: - bpf_ld R0 = *(u32 *)(R10 - 4) - bpf_exit -is invalid program. -Though R10 is correct read-only register and has type PTR_TO_STACK -and R10 - 4 is within stack bounds, there were no stores into that location. - -Pointer register spill/fill is tracked as well, since four (R6-R9) -callee saved registers may not be enough for some programs. - -Allowed function calls are customized with bpf_verifier_ops->get_func_proto() -The eBPF verifier will check that registers match argument constraints. -After the call register R0 will be set to return type of the function. - -Function calls are the main mechanism to extend functionality of eBPF programs. -Socket filters may let programs call one set of functions, whereas tracing -filters may allow a completely different set. - -If a function is made accessible to an eBPF program, it needs to be thought through -from safety point of view. The verifier will guarantee that the function is -called with valid arguments. - -seccomp vs socket filters have different security restrictions for classic BPF. -Seccomp solves this by two stage verifier: classic BPF verifier is followed -by seccomp verifier. In case of eBPF one configurable verifier is shared for -all use cases. - -See details of eBPF verifier in kernel/bpf/verifier.c - -Register value tracking ------------------------ -In order to determine the safety of an eBPF program, the verifier must track -the range of possible values in each register and also in each stack slot. -This is done with 'struct bpf_reg_state', defined in include/linux/ -bpf_verifier.h, which unifies tracking of scalar and pointer values. Each -register state has a type, which is either NOT_INIT (the register has not been -written to), SCALAR_VALUE (some value which is not usable as a pointer), or a -pointer type. The types of pointers describe their base, as follows: - PTR_TO_CTX Pointer to bpf_context.
- CONST_PTR_TO_MAP Pointer to struct bpf_map. "Const" because arithmetic - on these pointers is forbidden. - PTR_TO_MAP_VALUE Pointer to the value stored in a map element. - PTR_TO_MAP_VALUE_OR_NULL - Either a pointer to a map value, or NULL; map accesses - (see section 'eBPF maps', below) return this type, - which becomes a PTR_TO_MAP_VALUE when checked != NULL. - Arithmetic on these pointers is forbidden. - PTR_TO_STACK Frame pointer. - PTR_TO_PACKET skb->data. - PTR_TO_PACKET_END skb->data + headlen; arithmetic forbidden. - PTR_TO_SOCKET Pointer to struct bpf_sock_ops, implicitly refcounted. - PTR_TO_SOCKET_OR_NULL - Either a pointer to a socket, or NULL; socket lookup - returns this type, which becomes a PTR_TO_SOCKET when - checked != NULL. PTR_TO_SOCKET is reference-counted, - so programs must release the reference through the - socket release function before the end of the program. - Arithmetic on these pointers is forbidden. -However, a pointer may be offset from this base (as a result of pointer -arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable -offset'. The former is used when an exactly-known value (e.g. an immediate -operand) is added to a pointer, while the latter is used for values which are -not exactly known. The variable offset is also used in SCALAR_VALUEs, to track -the range of possible values in the register. -The verifier's knowledge about the variable offset consists of: -* minimum and maximum values as unsigned -* minimum and maximum values as signed -* knowledge of the values of individual bits, in the form of a 'tnum': a u64 -'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; -1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both -mask and value; no bit should ever be 1 in both. 
For example, if a byte is read -into a register from memory, the register's top 56 bits are known zero, while -the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we -then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; -0x1ff), because of potential carries. - -Besides arithmetic, the register state can also be updated by conditional -branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch -it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false' -branch it will have a umax_value of 8. A signed compare (with BPF_JSGT or -BPF_JSGE) would instead update the signed minimum/maximum values. Information -from the signed and unsigned bounds can be combined; for instance if a value is -first tested < 8 and then tested s> 4, the verifier will conclude that the value -is also > 4 and s< 8, since the bounds prevent crossing the sign boundary. - -PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all -pointers sharing that same variable offset. This is important for packet range -checks: after adding a variable to a packet pointer register A, if you then copy -it to another register B and then add a constant 4 to A, both registers will -share the same 'id' but the A will have a fixed offset of +4. Then if A is -bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is -now known to have a safe range of at least 4 bytes. See 'Direct packet access', -below, for more on PTR_TO_PACKET ranges. - -The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of -the pointer returned from a map lookup. This means that when one copy is -checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs. -As well as range-checking, the tracked information is also used for enforcing -alignment of pointer accesses. For instance, on most systems the packet pointer -is 2 bytes after a 4-byte alignment. 
If a program adds 14 bytes to that to jump -over the Ethernet header, then reads IHL and adds (IHL * 4), the resulting -pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 -bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through -that pointer are safe. -The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common -to all copies of the pointer returned from a socket lookup. This has similar -behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but -it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly -represents a reference to the corresponding 'struct sock'. To ensure that the -reference is not leaked, it is imperative to NULL-check the reference and, in -the non-NULL case, pass the valid reference to the socket release function. - -Direct packet access --------------------- -In cls_bpf and act_bpf programs the verifier allows direct access to the packet -data via skb->data and skb->data_end pointers. -Ex: -1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ -2: r3 = *(u32 *)(r1 +76) /* load skb->data */ -3: r5 = r3 -4: r5 += 14 -5: if r5 > r4 goto pc+16 -R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp -6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ - -this 2byte load from the packet is safe to do, since the program author -did check 'if (skb->data + 14 > skb->data_end) goto err' at insn #5 which -means that in the fall-through case the register R3 (which points to skb->data) -has at least 14 directly accessible bytes. The verifier marks it -as R3=pkt(id=0,off=0,r=14). -id=0 means that no additional variables were added to the register. -off=0 means that no additional constants were added. -r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok. -Note that R5 is marked as R5=pkt(id=0,off=14,r=14).
It also points -to the packet data, but constant 14 was added to the register, so -it now points to 'skb->data + 14' and accessible range is [R5, R5 + 14 - 14) -which is zero bytes. - -More complex packet access may look like: - R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp - 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ - 7: r4 = *(u8 *)(r3 +12) - 8: r4 *= 14 - 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ -10: r3 += r4 -11: r2 = r1 -12: r2 <<= 48 -13: r2 >>= 48 -14: r3 += r2 -15: r2 = r3 -16: r2 += 8 -17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ -18: if r2 > r1 goto pc+2 - R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp -19: r1 = *(u8 *)(r3 +4) -The state of the register R3 is R3=pkt(id=2,off=0,r=8) -id=2 means that two 'r3 += rX' instructions were seen, so r3 points to some -offset within a packet and since the program author did -'if (r3 + 8 > r1) goto err' at insn #18, the safe range is [R3, R3 + 8). -The verifier only allows 'add'/'sub' operations on packet registers. Any other -operation will set the register state to 'SCALAR_VALUE' and it won't be -available for direct packet access. -Operation 'r3 += rX' may overflow and become less than original skb->data, -therefore the verifier has to prevent that. So when it sees 'r3 += rX' -instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 -against skb->data_end will not give us 'range' information, so attempts to read -through the pointer will give "invalid access to packet" error. -Ex. after insn 'r4 = *(u8 *)(r3 +12)' (insn #7 above) the state of r4 is -R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits -of the register are guaranteed to be zero, and nothing is known about the lower -8 bits. 
After insn 'r4 *= 14' the state becomes -R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit -value by constant 14 will keep upper 52 bits as zero, also the least significant -bit will be zero as 14 is even. Similarly 'r2 >>= 48' will make -R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign -extending. This logic is implemented in adjust_reg_min_max_vals() function, -which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice -versa) and adjust_scalar_min_max_vals() for operations on two scalars. - -The end result is that bpf program author can access packet directly -using normal C code as: - void *data = (void *)(long)skb->data; - void *data_end = (void *)(long)skb->data_end; - struct eth_hdr *eth = data; - struct iphdr *iph = data + sizeof(*eth); - struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); - - if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) - return 0; - if (eth->h_proto != htons(ETH_P_IP)) - return 0; - if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) - return 0; - if (udp->dest == 53 || udp->source == 9) - ...; -which makes such programs easier to write comparing to LD_ABS insn -and significantly faster. - -eBPF maps ---------- -'maps' is a generic storage of different types for sharing data between kernel -and userspace. 
- -The maps are accessed from user space via BPF syscall, which has commands: -- create a map with given type and attributes - map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) - using attr->map_type, attr->key_size, attr->value_size, attr->max_entries - returns process-local file descriptor or negative error - -- lookup key in a given map - err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) - using attr->map_fd, attr->key, attr->value - returns zero and stores found elem into value or negative error - -- create or update key/value pair in a given map - err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) - using attr->map_fd, attr->key, attr->value - returns zero or negative error - -- find and delete element by key in a given map - err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) - using attr->map_fd, attr->key - -- to delete map: close(fd) - Exiting process will delete maps automatically - -userspace programs use this syscall to create/access maps that eBPF programs -are concurrently updating. - -maps can have different types: hash, array, bloom filter, radix-tree, etc. - -The map is defined by: - . type - . max number of elements - . key size in bytes - . value size in bytes - -Pruning -------- -The verifier does not actually walk all possible paths through the program. For -each new branch to analyse, the verifier looks at all the states it's previously -been in when at this instruction. If any of them contain the current state as a -subset, the branch is 'pruned' - that is, the fact that the previous state was -accepted implies the current state would be as well. For instance, if in the -previous state, r1 held a packet-pointer, and in the current state, r1 holds a -packet-pointer with a range as long or longer and at least as strict an -alignment, then r1 is safe. 
Similarly, if r2 was NOT_INIT before then it can't -have been used by any path from that point, so any value in r2 (including -another NOT_INIT) is safe. The implementation is in the function regsafe(). -Pruning considers not only the registers but also the stack (and any spilled -registers it may hold). They must all be safe for the branch to be pruned. -This is implemented in states_equal(). - -Understanding eBPF verifier messages ------------------------------------- - -The following are few examples of invalid eBPF programs and verifier error -messages as seen in the log: - -Program with unreachable instructions: -static struct bpf_insn prog[] = { - BPF_EXIT_INSN(), - BPF_EXIT_INSN(), -}; -Error: - unreachable insn 1 - -Program that reads uninitialized register: - BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), - BPF_EXIT_INSN(), -Error: - 0: (bf) r0 = r2 - R2 !read_ok - -Program that doesn't initialize R0 before exiting: - BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), - BPF_EXIT_INSN(), -Error: - 0: (bf) r2 = r1 - 1: (95) exit - R0 !read_ok - -Program that accesses stack out of bounds: - BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 +8) = 0 - invalid stack off=8 size=8 - -Program that doesn't initialize stack before passing its address into function: - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), -Error: - 0: (bf) r2 = r10 - 1: (07) r2 += -8 - 2: (b7) r1 = 0x0 - 3: (85) call 1 - invalid indirect read from stack off -8+0 size 8 - -Program that uses invalid map_fd=0 while calling to map_lookup_elem() function: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 
1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 0x0 - 4: (85) call 1 - fd 0 is not pointing to valid bpf_map - -Program that doesn't check return value of map_lookup_elem() before accessing -map element: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 0x0 - 4: (85) call 1 - 5: (7a) *(u64 *)(r0 +0) = 0 - R0 invalid mem access 'map_value_or_null' - -Program that correctly checks map_lookup_elem() returned value for NULL, but -accesses the memory with incorrect alignment: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 1 - 4: (85) call 1 - 5: (15) if r0 == 0x0 goto pc+1 - R0=map_ptr R10=fp - 6: (7a) *(u64 *)(r0 +4) = 0 - misaligned access off 4 size 8 - -Program that correctly checks map_lookup_elem() returned value for NULL and -accesses memory with correct alignment in one side of 'if' branch, but fails -to do so in the other side of 'if' branch: - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - BPF_EXIT_INSN(), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: 
(07) r2 += -8 - 3: (b7) r1 = 1 - 4: (85) call 1 - 5: (15) if r0 == 0x0 goto pc+2 - R0=map_ptr R10=fp - 6: (7a) *(u64 *)(r0 +0) = 0 - 7: (95) exit - - from 5 to 8: R0=imm0 R10=fp - 8: (7a) *(u64 *)(r0 +0) = 1 - R0 invalid mem access 'imm' - -Program that performs a socket lookup then sets the pointer to NULL without -checking it: -value: - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_4, 0), - BPF_MOV64_IMM(BPF_REG_5, 0), - BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), -Error: - 0: (b7) r2 = 0 - 1: (63) *(u32 *)(r10 -8) = r2 - 2: (bf) r2 = r10 - 3: (07) r2 += -8 - 4: (b7) r3 = 4 - 5: (b7) r4 = 0 - 6: (b7) r5 = 0 - 7: (85) call bpf_sk_lookup_tcp#65 - 8: (b7) r0 = 0 - 9: (95) exit - Unreleased reference id=1, alloc_insn=7 - -Program that performs a socket lookup but does not NULL-check the returned -value: - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_4, 0), - BPF_MOV64_IMM(BPF_REG_5, 0), - BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), - BPF_EXIT_INSN(), -Error: - 0: (b7) r2 = 0 - 1: (63) *(u32 *)(r10 -8) = r2 - 2: (bf) r2 = r10 - 3: (07) r2 += -8 - 4: (b7) r3 = 4 - 5: (b7) r4 = 0 - 6: (b7) r5 = 0 - 7: (85) call bpf_sk_lookup_tcp#65 - 8: (95) exit - Unreleased reference id=1, alloc_insn=7 - -Testing -------- - -Next to the BPF toolchain, the kernel also ships a test module that contains -various test cases for classic and internal BPF that can be executed against -the BPF interpreter and JIT compiler. 
It can be found in lib/test_bpf.c and -enabled via Kconfig: - - CONFIG_TEST_BPF=m - -After the module has been built and installed, the test suite can be executed -via insmod or modprobe against 'test_bpf' module. Results of the test cases -including timings in nsec can be found in the kernel log (dmesg). - -Misc ----- - -Also trinity, the Linux syscall fuzzer, has built-in support for BPF and -SECCOMP-BPF kernel fuzzing. - -Written by ----------- - -The document was written in the hope that it is found useful and in order -to give potential BPF hackers or security auditors a better overview of -the underlying architecture. - -Jay Schulist -Daniel Borkmann -Alexei Starovoitov diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 807abe25ae4b..144ed838c1a9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -56,6 +56,7 @@ Contents: driver eql fib_trie + filter .. only:: subproject and html diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 999eb41da81d..494614573c67 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -1051,7 +1051,7 @@ for more information on hardware timestamps. 
------------------------------------------------------------------------------- - Packet sockets work well together with Linux socket filters, thus you also - might want to have a look at Documentation/networking/filter.txt + might want to have a look at Documentation/networking/filter.rst -------------------------------------------------------------------------------- + THANKS diff --git a/MAINTAINERS b/MAINTAINERS index 7323bfc1720f..4ec6d2741d36 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3192,7 +3192,7 @@ Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git F: Documentation/bpf/ -F: Documentation/networking/filter.txt +F: Documentation/networking/filter.rst F: arch/*/net/* F: include/linux/bpf* F: include/linux/filter.h diff --git a/tools/bpf/bpf_asm.c b/tools/bpf/bpf_asm.c index e5f95e3eede3..0063c3c029e7 100644 --- a/tools/bpf/bpf_asm.c +++ b/tools/bpf/bpf_asm.c @@ -11,7 +11,7 @@ * * How to get into it: * - * 1) read Documentation/networking/filter.txt + * 1) read Documentation/networking/filter.rst * 2) Run `bpf_asm [-c] ` to translate into binary * blob that is loadable with xt_bpf, cls_bpf et al. Note: -c will * pretty print a C-like construct. diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index 9d3766e653a9..a0ebcdf59c31 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -13,7 +13,7 @@ * for making a verdict when multiple simple BPF programs are combined * into one in order to prevent parsing same headers multiple times. * - * More on how to debug BPF opcodes see Documentation/networking/filter.txt + * More on how to debug BPF opcodes see Documentation/networking/filter.rst * which is the main document on BPF. 
Mini howto for getting started: * * 1) `./bpf_dbg` to enter the shell (shell cmds denoted with '>'): -- cgit v1.2.3 From 9b329d0dbe413bf46eb5010edd06b3076960a60a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 28 Apr 2020 15:30:48 -0700 Subject: selftests/bpf: fix test_sysctl_prog with alu32 Similar to commit b7a0d65d80a0 ("bpf, testing: Workaround a verifier failure for test_progs") fix test_sysctl_prog.c as well. Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_sysctl_prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 2d0b0b82a78a..50525235380e 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -45,7 +45,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx) unsigned long tcp_mem[3] = {0, 0, 0}; char value[MAX_VALUE_STR_LEN]; unsigned char i, off = 0; - int ret; + volatile int ret; if (ctx->write) return 0; -- cgit v1.2.3 From f2e10bff16a0fdd41ba278c84da9813700e356af Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:08 -0700 Subject: bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link Add ability to fetch bpf_link details through BPF_OBJ_GET_INFO_BY_FD command. Also enhance show_fdinfo to potentially include bpf_link type-specific information (similarly to obj_info). Also introduce enum bpf_link_type stored in bpf_link itself and expose it in UAPI. bpf_link_tracing also now will store and return bpf_attach_type. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-5-andriin@fb.com --- include/linux/bpf-cgroup.h | 2 - include/linux/bpf.h | 8 ++- include/linux/bpf_types.h | 6 ++ include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/btf.c | 2 + kernel/bpf/cgroup.c | 43 +++++++++++- kernel/bpf/syscall.c | 155 ++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 + tools/include/uapi/linux/bpf.h | 31 +++++++++ 9 files changed, 253 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a9cb9a5bf8e9..272626cc3fc9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -57,8 +57,6 @@ struct bpf_cgroup_link { enum bpf_attach_type type; }; -extern const struct bpf_link_ops bpf_cgroup_link_lops; - struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 875d1f0af803..c07b1d2f3824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1026,9 +1026,11 @@ extern const struct file_operations bpf_prog_fops; extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; @@ -1086,6 +1088,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog); struct bpf_link { atomic64_t refcnt; u32 id; + enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; @@ -1103,9 +1106,12 @@ struct bpf_link_ops { void (*dealloc)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); + void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); + int 
(*fill_link_info)(const struct bpf_link *link, + struct bpf_link_info *info); }; -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ba0c2d56f8a3..8345cdf553b8 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,3 +118,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif + +BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) +#ifdef CONFIG_CGROUP_BPF +BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) +#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7e6541fceade..0eccafae55bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -222,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. 
@@ -3612,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..a2cfba89a8e1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1bdf37fca879..5c0e964105ac 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -833,10 +833,48 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int 
bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -858,7 +896,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1c213a730502..d23c04cbe14f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -51,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -1548,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -2183,10 +2187,11 @@ static int 
bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; link->id = 0; link->ops = ops; link->prog = prog; @@ -2266,27 +2271,23 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } -#ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "", +#include +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE +#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -2294,10 +2295,12 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2403,6 +2406,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) 
@@ -2418,9 +2422,33 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) @@ -2460,7 +2488,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -2502,9 +2532,56 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = 
info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -2570,7 +2647,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; err = bpf_link_prime(&link->link, &link_primer); @@ -3366,6 +3444,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + 
put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3390,6 +3504,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file->private_data, + attr, uattr); else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 91728e0f27eb..2b337e32aa94 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4a6c47f3febe..0eccafae55bb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -113,6 +113,8 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, }; enum bpf_map_type { @@ -220,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. 
@@ -523,6 +534,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; @@ -3609,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). -- cgit v1.2.3 From 0dbc866832a0fbf9f2b98d412da44c5cfd1b7756 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:09 -0700 Subject: libbpf: Add low-level APIs for new bpf_link commands Add low-level API calls for bpf_link_get_next_id() and bpf_link_get_fd_by_id(). 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-6-andriin@fb.com --- tools/lib/bpf/bpf.c | 19 +++++++++++++++++-- tools/lib/bpf/bpf.h | 4 +++- tools/lib/bpf/libbpf.map | 6 ++++++ 3 files changed, 26 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5cc1b0785d18..8f2f0958d446 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -721,6 +721,11 @@ int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id) return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID); } +int bpf_link_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_LINK_GET_NEXT_ID); +} + int bpf_prog_get_fd_by_id(__u32 id) { union bpf_attr attr; @@ -751,13 +756,23 @@ int bpf_btf_get_fd_by_id(__u32 id) return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr)); } -int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) +int bpf_link_get_fd_by_id(__u32 id) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.link_id = id; + + return sys_bpf(BPF_LINK_GET_FD_BY_ID, &attr, sizeof(attr)); +} + +int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) { union bpf_attr attr; int err; memset(&attr, 0, sizeof(attr)); - attr.info.bpf_fd = prog_fd; + attr.info.bpf_fd = bpf_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 46d47afdd887..335b457b3a25 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -216,10 +216,12 @@ LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data, LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int 
bpf_prog_get_fd_by_id(__u32 id); LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); -LIBBPF_API int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); +LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index bb8831605b25..7cd49aa38005 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -254,3 +254,9 @@ LIBBPF_0.0.8 { bpf_program__set_lsm; bpf_set_link_xdp_fd_opts; } LIBBPF_0.0.7; + +LIBBPF_0.0.9 { + global: + bpf_link_get_fd_by_id; + bpf_link_get_next_id; +} LIBBPF_0.0.8; -- cgit v1.2.3 From 2c2837b09e9ab4874353186599609fa2e1ccabce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:10 -0700 Subject: selftests/bpf: Test bpf_link's get_next_id, get_fd_by_id, and get_obj_info Extend bpf_obj_id selftest to verify bpf_link's observability APIs. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-7-andriin@fb.com --- .../testing/selftests/bpf/prog_tests/bpf_obj_id.c | 110 +++++++++++++++++++-- tools/testing/selftests/bpf/progs/test_obj_id.c | 14 +-- 2 files changed, 104 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c index f10029821e16..7afa4160416f 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c @@ -1,26 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 #include +#define nr_iters 2 + void test_bpf_obj_id(void) { const __u64 array_magic_value = 0xfaceb00c; const __u32 array_key = 0; - const int nr_iters = 2; const char *file = "./test_obj_id.o"; const char *expected_prog_name = "test_obj_id"; const char *expected_map_name = "test_map_id"; const __u64 nsec_per_sec = 1000000000; - struct bpf_object *objs[nr_iters]; + struct bpf_object *objs[nr_iters] = {}; + struct bpf_link *links[nr_iters] = {}; + struct bpf_program *prog; int prog_fds[nr_iters], map_fds[nr_iters]; /* +1 to test for the info_len returned by kernel */ struct bpf_prog_info prog_infos[nr_iters + 1]; struct bpf_map_info map_infos[nr_iters + 1]; + struct bpf_link_info link_infos[nr_iters + 1]; /* Each prog only uses one map. +1 to test nr_map_ids * returned by kernel. 
*/ __u32 map_ids[nr_iters + 1]; - char jited_insns[128], xlated_insns[128], zeros[128]; + char jited_insns[128], xlated_insns[128], zeros[128], tp_name[128]; __u32 i, next_id, info_len, nr_id_found, duration = 0; struct timespec real_time_ts, boot_time_ts; int err = 0; @@ -36,14 +40,15 @@ void test_bpf_obj_id(void) CHECK(err >= 0 || errno != ENOENT, "get-fd-by-notexist-map-id", "err %d errno %d\n", err, errno); - for (i = 0; i < nr_iters; i++) - objs[i] = NULL; + err = bpf_link_get_fd_by_id(0); + CHECK(err >= 0 || errno != ENOENT, + "get-fd-by-notexist-link-id", "err %d errno %d\n", err, errno); /* Check bpf_obj_get_info_by_fd() */ bzero(zeros, sizeof(zeros)); for (i = 0; i < nr_iters; i++) { now = time(NULL); - err = bpf_prog_load(file, BPF_PROG_TYPE_SOCKET_FILTER, + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &objs[i], &prog_fds[i]); /* test_obj_id.o is a dumb prog. It should never fail * to load. @@ -60,6 +65,17 @@ void test_bpf_obj_id(void) if (CHECK_FAIL(err)) goto done; + prog = bpf_object__find_program_by_title(objs[i], + "raw_tp/sys_enter"); + if (CHECK_FAIL(!prog)) + goto done; + links[i] = bpf_program__attach(prog); + err = libbpf_get_error(links[i]); + if (CHECK(err, "prog_attach", "prog #%d, err %d\n", i, err)) { + links[i] = NULL; + goto done; + } + /* Check getting map info */ info_len = sizeof(struct bpf_map_info) * 2; bzero(&map_infos[i], info_len); @@ -107,7 +123,7 @@ void test_bpf_obj_id(void) load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + (prog_infos[i].load_time / nsec_per_sec); if (CHECK(err || - prog_infos[i].type != BPF_PROG_TYPE_SOCKET_FILTER || + prog_infos[i].type != BPF_PROG_TYPE_RAW_TRACEPOINT || info_len != sizeof(struct bpf_prog_info) || (env.jit_enabled && !prog_infos[i].jited_prog_len) || (env.jit_enabled && @@ -120,7 +136,11 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids != map_infos[i].id || strcmp((char *)prog_infos[i].name, expected_prog_name), "get-prog-info(fd)", - "err %d errno %d i %d 
type %d(%d) info_len %u(%zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n", + "err %d errno %d i %d type %d(%d) info_len %u(%zu) " + "jit_enabled %d jited_prog_len %u xlated_prog_len %u " + "jited_prog %d xlated_prog %d load_time %lu(%lu) " + "uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) " + "name %s(%s)\n", err, errno, i, prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER, info_len, sizeof(struct bpf_prog_info), @@ -135,6 +155,33 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids, map_infos[i].id, prog_infos[i].name, expected_prog_name)) goto done; + + /* Check getting link info */ + info_len = sizeof(struct bpf_link_info) * 2; + bzero(&link_infos[i], info_len); + link_infos[i].raw_tracepoint.tp_name = (__u64)&tp_name; + link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name); + err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]), + &link_infos[i], &info_len); + if (CHECK(err || + link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT || + link_infos[i].prog_id != prog_infos[i].id || + link_infos[i].raw_tracepoint.tp_name != (__u64)&tp_name || + strcmp((char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter") || + info_len != sizeof(struct bpf_link_info), + "get-link-info(fd)", + "err %d errno %d info_len %u(%zu) type %d(%d) id %d " + "prog_id %d (%d) tp_name %s(%s)\n", + err, errno, + info_len, sizeof(struct bpf_link_info), + link_infos[i].type, BPF_LINK_TYPE_RAW_TRACEPOINT, + link_infos[i].id, + link_infos[i].prog_id, prog_infos[i].id, + (char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter")) + goto done; + } /* Check bpf_prog_get_next_id() */ @@ -247,7 +294,52 @@ void test_bpf_obj_id(void) "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + /* Check bpf_link_get_next_id() */ + nr_id_found = 0; + next_id = 0; + while (!bpf_link_get_next_id(next_id, &next_id)) { + struct bpf_link_info link_info; + int link_fd, cmp_res; + + info_len = 
sizeof(link_info); + memset(&link_info, 0, info_len); + + link_fd = bpf_link_get_fd_by_id(next_id); + if (link_fd < 0 && errno == ENOENT) + /* The bpf_link is in the dead row */ + continue; + if (CHECK(link_fd < 0, "get-link-fd(next_id)", + "link_fd %d next_id %u errno %d\n", + link_fd, next_id, errno)) + break; + + for (i = 0; i < nr_iters; i++) + if (link_infos[i].id == next_id) + break; + + if (i == nr_iters) + continue; + + nr_id_found++; + + err = bpf_obj_get_info_by_fd(link_fd, &link_info, &info_len); + cmp_res = memcmp(&link_info, &link_infos[i], + offsetof(struct bpf_link_info, raw_tracepoint)); + CHECK(err || info_len != sizeof(link_info) || cmp_res, + "check get-link-info(next_id->fd)", + "err %d errno %d info_len %u(%zu) memcmp %d\n", + err, errno, info_len, sizeof(struct bpf_link_info), + cmp_res); + + close(link_fd); + } + CHECK(nr_id_found != nr_iters, + "check total link id found by get_next_id", + "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + done: - for (i = 0; i < nr_iters; i++) + for (i = 0; i < nr_iters; i++) { + bpf_link__destroy(links[i]); bpf_object__close(objs[i]); + } } diff --git a/tools/testing/selftests/bpf/progs/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c index 98b9de2fafd0..ded71b3ff6b4 100644 --- a/tools/testing/selftests/bpf/progs/test_obj_id.c +++ b/tools/testing/selftests/bpf/progs/test_obj_id.c @@ -3,16 +3,8 @@ */ #include #include -#include #include -/* It is a dumb bpf program such that it must have no - * issue to be loaded since testing the verifier is - * not the focus here. 
- */ - -int _version SEC("version") = 1; - struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); @@ -20,13 +12,13 @@ struct { __type(value, __u64); } test_map_id SEC(".maps"); -SEC("test_obj_id_dummy") -int test_obj_id(struct __sk_buff *skb) +SEC("raw_tp/sys_enter") +int test_obj_id(void *ctx) { __u32 key = 0; __u64 *value; value = bpf_map_lookup_elem(&test_map_id, &key); - return TC_ACT_OK; + return 0; } -- cgit v1.2.3 From 50325b1761e31ad17d252e795af72a9af8c5a7d7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:11 -0700 Subject: bpftool: Expose attach_type-to-string array to non-cgroup code Move attach_type_strings into main.h for access in non-cgroup code. bpf_attach_type is used for non-cgroup attach types quite widely now. So also complete missing string translations for non-cgroup attach types. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-8-andriin@fb.com --- tools/bpf/bpftool/cgroup.c | 48 +++++++++++++++------------------------------- tools/bpf/bpftool/main.h | 32 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 62c6a1d7cd18..1693c802bb20 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -31,42 +31,20 @@ static unsigned int query_flags; -static const char * const attach_type_strings[] = { - [BPF_CGROUP_INET_INGRESS] = "ingress", - [BPF_CGROUP_INET_EGRESS] = "egress", - [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", - [BPF_CGROUP_SOCK_OPS] = "sock_ops", - [BPF_CGROUP_DEVICE] = "device", - [BPF_CGROUP_INET4_BIND] = "bind4", - [BPF_CGROUP_INET6_BIND] = "bind6", - [BPF_CGROUP_INET4_CONNECT] = "connect4", - [BPF_CGROUP_INET6_CONNECT] = "connect6", - [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", - [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", - [BPF_CGROUP_UDP4_SENDMSG] = 
"sendmsg4", - [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", - [BPF_CGROUP_SYSCTL] = "sysctl", - [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", - [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", - [BPF_CGROUP_GETSOCKOPT] = "getsockopt", - [BPF_CGROUP_SETSOCKOPT] = "setsockopt", - [__MAX_BPF_ATTACH_TYPE] = NULL, -}; - static enum bpf_attach_type parse_attach_type(const char *str) { enum bpf_attach_type type; for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { - if (attach_type_strings[type] && - is_prefix(str, attach_type_strings[type])) + if (attach_type_name[type] && + is_prefix(str, attach_type_name[type])) return type; } return __MAX_BPF_ATTACH_TYPE; } -static int show_bpf_prog(int id, const char *attach_type_str, +static int show_bpf_prog(int id, enum bpf_attach_type attach_type, const char *attach_flags_str, int level) { @@ -86,18 +64,22 @@ static int show_bpf_prog(int id, const char *attach_type_str, if (json_output) { jsonw_start_object(json_wtr); jsonw_uint_field(json_wtr, "id", info.id); - jsonw_string_field(json_wtr, "attach_type", - attach_type_str); + if (attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(json_wtr, "attach_type", + attach_type_name[attach_type]); + else + jsonw_uint_field(json_wtr, "attach_type", attach_type); jsonw_string_field(json_wtr, "attach_flags", attach_flags_str); jsonw_string_field(json_wtr, "name", info.name); jsonw_end_object(json_wtr); } else { - printf("%s%-8u %-15s %-15s %-15s\n", level ? " " : "", - info.id, - attach_type_str, - attach_flags_str, - info.name); + printf("%s%-8u ", level ? 
" " : "", info.id); + if (attach_type < ARRAY_SIZE(attach_type_name)) + printf("%-15s", attach_type_name[attach_type]); + else + printf("type %-10u", attach_type); + printf(" %-15s %-15s\n", attach_flags_str, info.name); } close(prog_fd); @@ -171,7 +153,7 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, } for (iter = 0; iter < prog_cnt; iter++) - show_bpf_prog(prog_ids[iter], attach_type_strings[type], + show_bpf_prog(prog_ids[iter], type, attach_flags_str, level); return 0; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 86f14ce26fd7..99d84bd1d5b2 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -83,6 +83,38 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_EXT] = "ext", }; +static const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { + [BPF_CGROUP_INET_INGRESS] = "ingress", + [BPF_CGROUP_INET_EGRESS] = "egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", + [BPF_CGROUP_SOCK_OPS] = "sock_ops", + [BPF_CGROUP_DEVICE] = "device", + [BPF_CGROUP_INET4_BIND] = "bind4", + [BPF_CGROUP_INET6_BIND] = "bind6", + [BPF_CGROUP_INET4_CONNECT] = "connect4", + [BPF_CGROUP_INET6_CONNECT] = "connect6", + [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", + [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", + [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", + [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", + [BPF_CGROUP_SYSCTL] = "sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", + [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", + [BPF_CGROUP_GETSOCKOPT] = "getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "setsockopt", + + [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", + [BPF_LIRC_MODE2] = "lirc_mode2", + [BPF_FLOW_DISSECTOR] = "flow_dissector", + [BPF_TRACE_RAW_TP] = "raw_tp", + [BPF_TRACE_FENTRY] = "fentry", + [BPF_TRACE_FEXIT] = "fexit", + [BPF_MODIFY_RETURN] = "mod_ret", + [BPF_LSM_MAC] = "lsm_mac", +}; + extern 
const char * const map_type_name[]; extern const size_t map_type_name_size; -- cgit v1.2.3 From c5481f9a954f27b8730c1dfeebbc9b3b5b2b2481 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:12 -0700 Subject: bpftool: Add bpf_link show and pin support Add `bpftool link show` and `bpftool link pin` commands. Example plain output for `link show` (with showing pinned paths): [vmuser@archvm bpf]$ sudo ~/local/linux/tools/bpf/bpftool/bpftool -f link 1: tracing prog 12 prog_type tracing attach_type fentry pinned /sys/fs/bpf/my_test_link pinned /sys/fs/bpf/my_test_link2 2: tracing prog 13 prog_type tracing attach_type fentry 3: tracing prog 14 prog_type tracing attach_type fentry 4: tracing prog 15 prog_type tracing attach_type fentry 5: tracing prog 16 prog_type tracing attach_type fentry 6: tracing prog 17 prog_type tracing attach_type fentry 7: raw_tracepoint prog 21 tp 'sys_enter' 8: cgroup prog 25 cgroup_id 584 attach_type egress 9: cgroup prog 25 cgroup_id 599 attach_type egress 10: cgroup prog 25 cgroup_id 614 attach_type egress 11: cgroup prog 25 cgroup_id 629 attach_type egress Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-9-andriin@fb.com --- tools/bpf/bpftool/common.c | 2 + tools/bpf/bpftool/link.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ tools/bpf/bpftool/main.c | 6 +- tools/bpf/bpftool/main.h | 5 + 4 files changed, 345 insertions(+), 1 deletion(-) create mode 100644 tools/bpf/bpftool/link.c (limited to 'tools') diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index f2223dbdfb0a..c47bdc65de8e 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -262,6 +262,8 @@ int get_fd_type(int fd) return BPF_OBJ_MAP; else if (strstr(buf, "bpf-prog")) return BPF_OBJ_PROG; + else if (strstr(buf, "bpf-link")) + return BPF_OBJ_LINK; return BPF_OBJ_UNKNOWN; } diff --git a/tools/bpf/bpftool/link.c 
b/tools/bpf/bpftool/link.c new file mode 100644 index 000000000000..adc7dc431ed8 --- /dev/null +++ b/tools/bpf/bpftool/link.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2020 Facebook */ + +#include +#include +#include +#include + +#include + +#include "json_writer.h" +#include "main.h" + +static const char * const link_type_name[] = { + [BPF_LINK_TYPE_UNSPEC] = "unspec", + [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_LINK_TYPE_TRACING] = "tracing", + [BPF_LINK_TYPE_CGROUP] = "cgroup", +}; + +static int link_parse_fd(int *argc, char ***argv) +{ + if (is_prefix(**argv, "id")) { + unsigned int id; + char *endptr; + + NEXT_ARGP(); + + id = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as ID", **argv); + return -1; + } + NEXT_ARGP(); + + return bpf_link_get_fd_by_id(id); + } else if (is_prefix(**argv, "pinned")) { + char *path; + + NEXT_ARGP(); + + path = **argv; + NEXT_ARGP(); + + return open_obj_pinned_any(path, BPF_OBJ_LINK); + } + + p_err("expected 'id' or 'pinned', got: '%s'?", **argv); + return -1; +} + +static void +show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr) +{ + jsonw_uint_field(wtr, "id", info->id); + if (info->type < ARRAY_SIZE(link_type_name)) + jsonw_string_field(wtr, "type", link_type_name[info->type]); + else + jsonw_uint_field(wtr, "type", info->type); + + jsonw_uint_field(json_wtr, "prog_id", info->prog_id); +} + +static int get_prog_info(int prog_id, struct bpf_prog_info *info) +{ + __u32 len = sizeof(*info); + int err, prog_fd; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) + return prog_fd; + + memset(info, 0, sizeof(*info)); + err = bpf_obj_get_info_by_fd(prog_fd, info, &len); + if (err) + p_err("can't get prog info: %s", strerror(errno)); + close(prog_fd); + return err; +} + +static int show_link_close_json(int fd, struct bpf_link_info *info) +{ + struct bpf_prog_info prog_info; + int err; + + 
jsonw_start_object(json_wtr); + + show_link_header_json(info, json_wtr); + + switch (info->type) { + case BPF_LINK_TYPE_RAW_TRACEPOINT: + jsonw_string_field(json_wtr, "tp_name", + (const char *)info->raw_tracepoint.tp_name); + break; + case BPF_LINK_TYPE_TRACING: + err = get_prog_info(info->prog_id, &prog_info); + if (err) + return err; + + if (prog_info.type < ARRAY_SIZE(prog_type_name)) + jsonw_string_field(json_wtr, "prog_type", + prog_type_name[prog_info.type]); + else + jsonw_uint_field(json_wtr, "prog_type", + prog_info.type); + + if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(json_wtr, "attach_type", + attach_type_name[info->tracing.attach_type]); + else + jsonw_uint_field(json_wtr, "attach_type", + info->tracing.attach_type); + break; + case BPF_LINK_TYPE_CGROUP: + jsonw_lluint_field(json_wtr, "cgroup_id", + info->cgroup.cgroup_id); + if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(json_wtr, "attach_type", + attach_type_name[info->cgroup.attach_type]); + else + jsonw_uint_field(json_wtr, "attach_type", + info->cgroup.attach_type); + break; + default: + break; + } + + if (!hash_empty(link_table.table)) { + struct pinned_obj *obj; + + jsonw_name(json_wtr, "pinned"); + jsonw_start_array(json_wtr); + hash_for_each_possible(link_table.table, obj, hash, info->id) { + if (obj->id == info->id) + jsonw_string(json_wtr, obj->path); + } + jsonw_end_array(json_wtr); + } + jsonw_end_object(json_wtr); + + return 0; +} + +static void show_link_header_plain(struct bpf_link_info *info) +{ + printf("%u: ", info->id); + if (info->type < ARRAY_SIZE(link_type_name)) + printf("%s ", link_type_name[info->type]); + else + printf("type %u ", info->type); + + printf("prog %u ", info->prog_id); +} + +static int show_link_close_plain(int fd, struct bpf_link_info *info) +{ + struct bpf_prog_info prog_info; + int err; + + show_link_header_plain(info); + + switch (info->type) { + case BPF_LINK_TYPE_RAW_TRACEPOINT: + 
printf("\n\ttp '%s' ", + (const char *)info->raw_tracepoint.tp_name); + break; + case BPF_LINK_TYPE_TRACING: + err = get_prog_info(info->prog_id, &prog_info); + if (err) + return err; + + if (prog_info.type < ARRAY_SIZE(prog_type_name)) + printf("\n\tprog_type %s ", + prog_type_name[prog_info.type]); + else + printf("\n\tprog_type %u ", prog_info.type); + + if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) + printf("attach_type %s ", + attach_type_name[info->tracing.attach_type]); + else + printf("attach_type %u ", info->tracing.attach_type); + break; + case BPF_LINK_TYPE_CGROUP: + printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id); + if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) + printf("attach_type %s ", + attach_type_name[info->cgroup.attach_type]); + else + printf("attach_type %u ", info->cgroup.attach_type); + break; + default: + break; + } + + if (!hash_empty(link_table.table)) { + struct pinned_obj *obj; + + hash_for_each_possible(link_table.table, obj, hash, info->id) { + if (obj->id == info->id) + printf("\n\tpinned %s", obj->path); + } + } + + printf("\n"); + + return 0; +} + +static int do_show_link(int fd) +{ + struct bpf_link_info info; + __u32 len = sizeof(info); + char raw_tp_name[256]; + int err; + + memset(&info, 0, sizeof(info)); +again: + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get link info: %s", + strerror(errno)); + close(fd); + return err; + } + if (info.type == BPF_LINK_TYPE_RAW_TRACEPOINT && + !info.raw_tracepoint.tp_name) { + info.raw_tracepoint.tp_name = (unsigned long)&raw_tp_name; + info.raw_tracepoint.tp_name_len = sizeof(raw_tp_name); + goto again; + } + + if (json_output) + show_link_close_json(fd, &info); + else + show_link_close_plain(fd, &info); + + close(fd); + return 0; +} + +static int do_show(int argc, char **argv) +{ + __u32 id = 0; + int err, fd; + + if (show_pinned) + build_pinned_obj_table(&link_table, BPF_OBJ_LINK); + + if (argc == 2) { + fd = 
link_parse_fd(&argc, &argv); + if (fd < 0) + return fd; + return do_show_link(fd); + } + + if (argc) + return BAD_ARG(); + + if (json_output) + jsonw_start_array(json_wtr); + while (true) { + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + p_err("can't get next link: %s%s", strerror(errno), + errno == EINVAL ? " -- kernel too old?" : ""); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get link by id (%u): %s", + id, strerror(errno)); + break; + } + + err = do_show_link(fd); + if (err) + break; + } + if (json_output) + jsonw_end_array(json_wtr); + + return errno == ENOENT ? 0 : -1; +} + +static int do_pin(int argc, char **argv) +{ + int err; + + err = do_pin_any(argc, argv, link_parse_fd); + if (!err && json_output) + jsonw_null(json_wtr); + return err; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } [LINK]\n" + " %1$s %2$s pin LINK FILE\n" + " %1$s %2$s help\n" + "\n" + " " HELP_SPEC_LINK "\n" + " " HELP_SPEC_PROGRAM "\n" + " " HELP_SPEC_OPTIONS "\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { "pin", do_pin }, + { 0 } +}; + +int do_link(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 466c269eabdd..1413a154806e 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -30,6 +30,7 @@ bool verifier_logs; bool relaxed_maps; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; +struct pinned_obj_table link_table; static void __noreturn clean_and_exit(int i) { @@ -58,7 +59,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf | net 
| feature | btf | gen | struct_ops }\n" + " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -215,6 +216,7 @@ static const struct cmd cmds[] = { { "batch", do_batch }, { "prog", do_prog }, { "map", do_map }, + { "link", do_link }, { "cgroup", do_cgroup }, { "perf", do_perf }, { "net", do_net }, @@ -364,6 +366,7 @@ int main(int argc, char **argv) hash_init(prog_table.table); hash_init(map_table.table); + hash_init(link_table.table); opterr = 0; while ((opt = getopt_long(argc, argv, "Vhpjfmnd", @@ -422,6 +425,7 @@ int main(int argc, char **argv) if (show_pinned) { delete_pinned_obj_table(&prog_table); delete_pinned_obj_table(&map_table); + delete_pinned_obj_table(&link_table); } return ret; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 99d84bd1d5b2..9b1fb81a8331 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -50,6 +50,8 @@ "\t {-m|--mapcompat} | {-n|--nomount} }" #define HELP_SPEC_MAP \ "MAP := { id MAP_ID | pinned FILE | name MAP_NAME }" +#define HELP_SPEC_LINK \ + "LINK := { id LINK_ID | pinned FILE }" static const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", @@ -122,6 +124,7 @@ enum bpf_obj_type { BPF_OBJ_UNKNOWN, BPF_OBJ_PROG, BPF_OBJ_MAP, + BPF_OBJ_LINK, }; extern const char *bin_name; @@ -134,6 +137,7 @@ extern bool verifier_logs; extern bool relaxed_maps; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; +extern struct pinned_obj_table link_table; void __printf(1, 2) p_err(const char *fmt, ...); void __printf(1, 2) p_info(const char *fmt, ...); @@ -185,6 +189,7 @@ int do_pin_fd(int fd, const char *name); int do_prog(int argc, char **arg); int do_map(int argc, char **arg); +int do_link(int argc, char **arg); int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); -- cgit v1.2.3 From 
7464d013ccd4db8544df5eddb05ddd509b9c46e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:13 -0700 Subject: bpftool: Add bpftool-link manpage Add bpftool-link manpage with information and examples of link-related commands. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-10-andriin@fb.com --- tools/bpf/bpftool/Documentation/bpftool-link.rst | 118 +++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 tools/bpf/bpftool/Documentation/bpftool-link.rst (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst new file mode 100644 index 000000000000..ee6500d6e6e4 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -0,0 +1,118 @@ +================ +bpftool-link +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF links +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **link** *COMMAND* + + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + + *COMMANDS* := { **show** | **list** | **pin** | **help** } + +LINK COMMANDS +============= + +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link help** +| +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } + + +DESCRIPTION +=========== + **bpftool link { show | list }** [*LINK*] + Show information about active links. If *LINK* is + specified show information only about given link, + otherwise list all links currently active on the system. + + Output will start with link ID followed by link type and + zero or more named attributes, some of which depend on type + of link. 
+ + **bpftool link pin** *LINK* *FILE* + Pin link *LINK* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. + + **bpftool link help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -V, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + + -f, --bpffs + When showing BPF links, show file names of pinned + links. + + -n, --nomount + Do not automatically attempt to mount any virtual file system + (such as tracefs or BPF virtual file system) when necessary. + + -d, --debug + Print all logs available, even debug-level information. This + includes logs from libbpf. + +EXAMPLES +======== +**# bpftool link show** + +:: + + 10: cgroup prog 25 + cgroup_id 614 attach_type egress + +**# bpftool --json --pretty link show** + +:: + + [{ + "type": "cgroup", + "prog_id": 25, + "cgroup_id": 614, + "attach_type": "egress" + } + ] + +| +| **# bpftool link pin id 10 /sys/fs/bpf/link** +| **# ls -l /sys/fs/bpf/** + +:: + + -rw------- 1 root root 0 Apr 23 21:39 link + + +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) -- cgit v1.2.3 From 5d085ad2e68cceec8332b23ea8f630a28b506366 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:14 -0700 Subject: bpftool: Add link bash completions Extend bpftool's bash-completion script to handle new link command and its sub-commands.
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200429001614.1544-11-andriin@fb.com --- tools/bpf/bpftool/bash-completion/bpftool | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 45ee99b159e2..c033c3329f73 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -98,6 +98,12 @@ _bpftool_get_btf_ids() command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_link_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp link 2>&1 | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + _bpftool_get_obj_map_names() { local obj @@ -1082,6 +1088,39 @@ _bpftool() ;; esac ;; + link) + case $command in + show|list|pin) + case $prev in + id) + _bpftool_get_link_ids + return 0 + ;; + esac + ;; + esac + + local LINK_TYPE='id pinned' + case $command in + show|list) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) + return 0 + ;; + pin) + if [[ $prev == "$command" ]]; then + COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) + else + _filedir + fi + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool -- cgit v1.2.3 From 41017e56af6cf99122c86655f60fe4e1b75ecf48 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:27:37 -0700 Subject: libbpf: Refactor BTF-defined map definition parsing logic Factor out BTF map definition logic into stand-alone routine for easier reuse for map-in-map case. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429002739.48006-2-andriin@fb.com --- tools/lib/bpf/libbpf.c | 195 ++++++++++++++++++++++++++----------------------- 1 file changed, 103 insertions(+), 92 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8e1dc6980fac..7d10436d7b58 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1914,109 +1914,54 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return 0; } -static int bpf_object__init_user_btf_map(struct bpf_object *obj, - const struct btf_type *sec, - int var_idx, int sec_idx, - const Elf_Data *data, bool strict, - const char *pin_root_path) + +static int parse_btf_map_def(struct bpf_object *obj, + struct bpf_map *map, + const struct btf_type *def, + bool strict, + const char *pin_root_path) { - const struct btf_type *var, *def, *t; - const struct btf_var_secinfo *vi; - const struct btf_var *var_extra; + const struct btf_type *t; const struct btf_member *m; - const char *map_name; - struct bpf_map *map; int vlen, i; - vi = btf_var_secinfos(sec) + var_idx; - var = btf__type_by_id(obj->btf, vi->type); - var_extra = btf_var(var); - map_name = btf__name_by_offset(obj->btf, var->name_off); - vlen = btf_vlen(var); - - if (map_name == NULL || map_name[0] == '\0') { - pr_warn("map #%d: empty name.\n", var_idx); - return -EINVAL; - } - if ((__u64)vi->offset + vi->size > data->d_size) { - pr_warn("map '%s' BTF data is corrupted.\n", map_name); - return -EINVAL; - } - if (!btf_is_var(var)) { - pr_warn("map '%s': unexpected var kind %u.\n", - map_name, btf_kind(var)); - return -EINVAL; - } - if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED && - var_extra->linkage != BTF_VAR_STATIC) { - pr_warn("map '%s': unsupported var linkage %u.\n", - map_name, var_extra->linkage); - return -EOPNOTSUPP; - } - - def = skip_mods_and_typedefs(obj->btf, var->type, NULL); - if (!btf_is_struct(def)) 
{ - pr_warn("map '%s': unexpected def kind %u.\n", - map_name, btf_kind(var)); - return -EINVAL; - } - if (def->size > vi->size) { - pr_warn("map '%s': invalid def size.\n", map_name); - return -EINVAL; - } - - map = bpf_object__add_map(obj); - if (IS_ERR(map)) - return PTR_ERR(map); - map->name = strdup(map_name); - if (!map->name) { - pr_warn("map '%s': failed to alloc map name.\n", map_name); - return -ENOMEM; - } - map->libbpf_type = LIBBPF_MAP_UNSPEC; - map->def.type = BPF_MAP_TYPE_UNSPEC; - map->sec_idx = sec_idx; - map->sec_offset = vi->offset; - pr_debug("map '%s': at sec_idx %d, offset %zu.\n", - map_name, map->sec_idx, map->sec_offset); - vlen = btf_vlen(def); m = btf_members(def); for (i = 0; i < vlen; i++, m++) { const char *name = btf__name_by_offset(obj->btf, m->name_off); if (!name) { - pr_warn("map '%s': invalid field #%d.\n", map_name, i); + pr_warn("map '%s': invalid field #%d.\n", map->name, i); return -EINVAL; } if (strcmp(name, "type") == 0) { - if (!get_map_field_int(map_name, obj->btf, m, + if (!get_map_field_int(map->name, obj->btf, m, &map->def.type)) return -EINVAL; pr_debug("map '%s': found type = %u.\n", - map_name, map->def.type); + map->name, map->def.type); } else if (strcmp(name, "max_entries") == 0) { - if (!get_map_field_int(map_name, obj->btf, m, + if (!get_map_field_int(map->name, obj->btf, m, &map->def.max_entries)) return -EINVAL; pr_debug("map '%s': found max_entries = %u.\n", - map_name, map->def.max_entries); + map->name, map->def.max_entries); } else if (strcmp(name, "map_flags") == 0) { - if (!get_map_field_int(map_name, obj->btf, m, + if (!get_map_field_int(map->name, obj->btf, m, &map->def.map_flags)) return -EINVAL; pr_debug("map '%s': found map_flags = %u.\n", - map_name, map->def.map_flags); + map->name, map->def.map_flags); } else if (strcmp(name, "key_size") == 0) { __u32 sz; - if (!get_map_field_int(map_name, obj->btf, m, &sz)) + if (!get_map_field_int(map->name, obj->btf, m, &sz)) return -EINVAL; pr_debug("map 
'%s': found key_size = %u.\n", - map_name, sz); + map->name, sz); if (map->def.key_size && map->def.key_size != sz) { pr_warn("map '%s': conflicting key size %u != %u.\n", - map_name, map->def.key_size, sz); + map->name, map->def.key_size, sz); return -EINVAL; } map->def.key_size = sz; @@ -2026,25 +1971,25 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, t = btf__type_by_id(obj->btf, m->type); if (!t) { pr_warn("map '%s': key type [%d] not found.\n", - map_name, m->type); + map->name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': key spec is not PTR: %u.\n", - map_name, btf_kind(t)); + map->name, btf_kind(t)); return -EINVAL; } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); return sz; } pr_debug("map '%s': found key [%u], sz = %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); if (map->def.key_size && map->def.key_size != sz) { pr_warn("map '%s': conflicting key size %u != %zd.\n", - map_name, map->def.key_size, (ssize_t)sz); + map->name, map->def.key_size, (ssize_t)sz); return -EINVAL; } map->def.key_size = sz; @@ -2052,13 +1997,13 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, } else if (strcmp(name, "value_size") == 0) { __u32 sz; - if (!get_map_field_int(map_name, obj->btf, m, &sz)) + if (!get_map_field_int(map->name, obj->btf, m, &sz)) return -EINVAL; pr_debug("map '%s': found value_size = %u.\n", - map_name, sz); + map->name, sz); if (map->def.value_size && map->def.value_size != sz) { pr_warn("map '%s': conflicting value size %u != %u.\n", - map_name, map->def.value_size, sz); + map->name, map->def.value_size, sz); return -EINVAL; } map->def.value_size = sz; @@ -2068,25 +2013,25 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, t = btf__type_by_id(obj->btf, m->type); if (!t) { pr_warn("map '%s': value 
type [%d] not found.\n", - map_name, m->type); + map->name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': value spec is not PTR: %u.\n", - map_name, btf_kind(t)); + map->name, btf_kind(t)); return -EINVAL; } sz = btf__resolve_size(obj->btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); return sz; } pr_debug("map '%s': found value [%u], sz = %zd.\n", - map_name, t->type, (ssize_t)sz); + map->name, t->type, (ssize_t)sz); if (map->def.value_size && map->def.value_size != sz) { pr_warn("map '%s': conflicting value size %u != %zd.\n", - map_name, map->def.value_size, (ssize_t)sz); + map->name, map->def.value_size, (ssize_t)sz); return -EINVAL; } map->def.value_size = sz; @@ -2095,44 +2040,110 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, __u32 val; int err; - if (!get_map_field_int(map_name, obj->btf, m, &val)) + if (!get_map_field_int(map->name, obj->btf, m, &val)) return -EINVAL; pr_debug("map '%s': found pinning = %u.\n", - map_name, val); + map->name, val); if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { pr_warn("map '%s': invalid pinning value %u.\n", - map_name, val); + map->name, val); return -EINVAL; } if (val == LIBBPF_PIN_BY_NAME) { err = build_map_pin_path(map, pin_root_path); if (err) { pr_warn("map '%s': couldn't build pin path.\n", - map_name); + map->name); return err; } } } else { if (strict) { pr_warn("map '%s': unknown field '%s'.\n", - map_name, name); + map->name, name); return -ENOTSUP; } pr_debug("map '%s': ignoring unknown field '%s'.\n", - map_name, name); + map->name, name); } } if (map->def.type == BPF_MAP_TYPE_UNSPEC) { - pr_warn("map '%s': map type isn't specified.\n", map_name); + pr_warn("map '%s': map type isn't specified.\n", map->name); return -EINVAL; } return 0; } +static int bpf_object__init_user_btf_map(struct bpf_object *obj, + const struct btf_type *sec, + int 
var_idx, int sec_idx, + const Elf_Data *data, bool strict, + const char *pin_root_path) +{ + const struct btf_type *var, *def; + const struct btf_var_secinfo *vi; + const struct btf_var *var_extra; + const char *map_name; + struct bpf_map *map; + + vi = btf_var_secinfos(sec) + var_idx; + var = btf__type_by_id(obj->btf, vi->type); + var_extra = btf_var(var); + map_name = btf__name_by_offset(obj->btf, var->name_off); + + if (map_name == NULL || map_name[0] == '\0') { + pr_warn("map #%d: empty name.\n", var_idx); + return -EINVAL; + } + if ((__u64)vi->offset + vi->size > data->d_size) { + pr_warn("map '%s' BTF data is corrupted.\n", map_name); + return -EINVAL; + } + if (!btf_is_var(var)) { + pr_warn("map '%s': unexpected var kind %u.\n", + map_name, btf_kind(var)); + return -EINVAL; + } + if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED && + var_extra->linkage != BTF_VAR_STATIC) { + pr_warn("map '%s': unsupported var linkage %u.\n", + map_name, var_extra->linkage); + return -EOPNOTSUPP; + } + + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (!btf_is_struct(def)) { + pr_warn("map '%s': unexpected def kind %u.\n", + map_name, btf_kind(var)); + return -EINVAL; + } + if (def->size > vi->size) { + pr_warn("map '%s': invalid def size.\n", map_name); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + map->name = strdup(map_name); + if (!map->name) { + pr_warn("map '%s': failed to alloc map name.\n", map_name); + return -ENOMEM; + } + map->libbpf_type = LIBBPF_MAP_UNSPEC; + map->def.type = BPF_MAP_TYPE_UNSPEC; + map->sec_idx = sec_idx; + map->sec_offset = vi->offset; + pr_debug("map '%s': at sec_idx %d, offset %zu.\n", + map_name, map->sec_idx, map->sec_offset); + + return parse_btf_map_def(obj, map, def, strict, pin_root_path); +} + static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, const char *pin_root_path) { -- cgit v1.2.3 From 2d39d7c56f115148b05d1d8c6b8698a5730c8b53 Mon Sep 
17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:27:38 -0700 Subject: libbpf: Refactor map creation logic and fix cleanup leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor out map creation and destruction logic to simplify code and especially error handling. Also fix map FD leak in case of partially successful map creation during bpf_object load operation. Fixes: 57a00f41644f ("libbpf: Add auto-pinning of maps when loading BPF objects") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200429002739.48006-3-andriin@fb.com --- tools/lib/bpf/libbpf.c | 226 ++++++++++++++++++++++++++----------------------- 1 file changed, 121 insertions(+), 105 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7d10436d7b58..9c845cf4cfcf 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3493,107 +3493,111 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) return 0; } +static void bpf_map__destroy(struct bpf_map *map); + +static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) +{ + struct bpf_create_map_attr create_attr; + struct bpf_map_def *def = &map->def; + + memset(&create_attr, 0, sizeof(create_attr)); + + if (obj->caps.name) + create_attr.name = map->name; + create_attr.map_ifindex = map->map_ifindex; + create_attr.map_type = def->type; + create_attr.map_flags = def->map_flags; + create_attr.key_size = def->key_size; + create_attr.value_size = def->value_size; + + if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && !def->max_entries) { + int nr_cpus; + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + pr_warn("map '%s': failed to determine number of system CPUs: %d\n", + map->name, nr_cpus); + return nr_cpus; + } + pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus); + 
create_attr.max_entries = nr_cpus; + } else { + create_attr.max_entries = def->max_entries; + } + + if (bpf_map__is_struct_ops(map)) + create_attr.btf_vmlinux_value_type_id = + map->btf_vmlinux_value_type_id; + + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + if (obj->btf && !bpf_map_find_btf_info(obj, map)) { + create_attr.btf_fd = btf__fd(obj->btf); + create_attr.btf_key_type_id = map->btf_key_type_id; + create_attr.btf_value_type_id = map->btf_value_type_id; + } + + map->fd = bpf_create_map_xattr(&create_attr); + if (map->fd < 0 && (create_attr.btf_key_type_id || + create_attr.btf_value_type_id)) { + char *cp, errmsg[STRERR_BUFSIZE]; + int err = -errno; + + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", + map->name, cp, err); + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + map->fd = bpf_create_map_xattr(&create_attr); + } + + if (map->fd < 0) + return -errno; + + return 0; +} + static int bpf_object__create_maps(struct bpf_object *obj) { - struct bpf_create_map_attr create_attr = {}; - int nr_cpus = 0; - unsigned int i; + struct bpf_map *map; + char *cp, errmsg[STRERR_BUFSIZE]; + unsigned int i, j; int err; for (i = 0; i < obj->nr_maps; i++) { - struct bpf_map *map = &obj->maps[i]; - struct bpf_map_def *def = &map->def; - char *cp, errmsg[STRERR_BUFSIZE]; - int *pfd = &map->fd; + map = &obj->maps[i]; if (map->pin_path) { err = bpf_object__reuse_map(map); if (err) { - pr_warn("error reusing pinned map %s\n", + pr_warn("map '%s': error reusing pinned map\n", map->name); - return err; + goto err_out; } } if (map->fd >= 0) { - pr_debug("skip map create (preset) %s: fd=%d\n", + pr_debug("map '%s': skipping creation (preset fd=%d)\n", map->name, map->fd); continue; } - if (obj->caps.name) - create_attr.name = map->name; - 
create_attr.map_ifindex = map->map_ifindex; - create_attr.map_type = def->type; - create_attr.map_flags = def->map_flags; - create_attr.key_size = def->key_size; - create_attr.value_size = def->value_size; - if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && - !def->max_entries) { - if (!nr_cpus) - nr_cpus = libbpf_num_possible_cpus(); - if (nr_cpus < 0) { - pr_warn("failed to determine number of system CPUs: %d\n", - nr_cpus); - err = nr_cpus; - goto err_out; - } - pr_debug("map '%s': setting size to %d\n", - map->name, nr_cpus); - create_attr.max_entries = nr_cpus; - } else { - create_attr.max_entries = def->max_entries; - } - create_attr.btf_fd = 0; - create_attr.btf_key_type_id = 0; - create_attr.btf_value_type_id = 0; - if (bpf_map_type__is_map_in_map(def->type) && - map->inner_map_fd >= 0) - create_attr.inner_map_fd = map->inner_map_fd; - if (bpf_map__is_struct_ops(map)) - create_attr.btf_vmlinux_value_type_id = - map->btf_vmlinux_value_type_id; - - if (obj->btf && !bpf_map_find_btf_info(obj, map)) { - create_attr.btf_fd = btf__fd(obj->btf); - create_attr.btf_key_type_id = map->btf_key_type_id; - create_attr.btf_value_type_id = map->btf_value_type_id; - } - - *pfd = bpf_create_map_xattr(&create_attr); - if (*pfd < 0 && (create_attr.btf_key_type_id || - create_attr.btf_value_type_id)) { - err = -errno; - cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). 
Retrying without BTF.\n", - map->name, cp, err); - create_attr.btf_fd = 0; - create_attr.btf_key_type_id = 0; - create_attr.btf_value_type_id = 0; - map->btf_key_type_id = 0; - map->btf_value_type_id = 0; - *pfd = bpf_create_map_xattr(&create_attr); - } - - if (*pfd < 0) { - size_t j; + err = bpf_object__create_map(obj, map); + if (err) + goto err_out; - err = -errno; -err_out: - cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("failed to create map (name: '%s'): %s(%d)\n", - map->name, cp, err); - pr_perm_msg(err); - for (j = 0; j < i; j++) - zclose(obj->maps[j].fd); - return err; - } + pr_debug("map '%s': created successfully, fd=%d\n", map->name, + map->fd); if (bpf_map__is_internal(map)) { err = bpf_object__populate_internal_map(obj, map); if (err < 0) { - zclose(*pfd); + zclose(map->fd); goto err_out; } } @@ -3601,16 +3605,23 @@ err_out: if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { - pr_warn("failed to auto-pin map name '%s' at '%s'\n", - map->name, map->pin_path); - return err; + pr_warn("map '%s': failed to auto-pin at '%s': %d\n", + map->name, map->pin_path, err); + zclose(map->fd); + goto err_out; } } - - pr_debug("created map %s: fd=%d\n", map->name, *pfd); } return 0; + +err_out: + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("map '%s': failed to create: %s(%d)\n", map->name, cp, err); + pr_perm_msg(err); + for (j = 0; j < i; j++) + zclose(obj->maps[j].fd); + return err; } static int @@ -5966,6 +5977,32 @@ int bpf_object__pin(struct bpf_object *obj, const char *path) return 0; } +static void bpf_map__destroy(struct bpf_map *map) +{ + if (map->clear_priv) + map->clear_priv(map, map->priv); + map->priv = NULL; + map->clear_priv = NULL; + + if (map->mmaped) { + munmap(map->mmaped, bpf_map_mmap_sz(map)); + map->mmaped = NULL; + } + + if (map->st_ops) { + zfree(&map->st_ops->data); + zfree(&map->st_ops->progs); + zfree(&map->st_ops->kern_func_off); + zfree(&map->st_ops); + } + + 
zfree(&map->name); + zfree(&map->pin_path); + + if (map->fd >= 0) + zclose(map->fd); +} + void bpf_object__close(struct bpf_object *obj) { size_t i; @@ -5981,29 +6018,8 @@ void bpf_object__close(struct bpf_object *obj) btf__free(obj->btf); btf_ext__free(obj->btf_ext); - for (i = 0; i < obj->nr_maps; i++) { - struct bpf_map *map = &obj->maps[i]; - - if (map->clear_priv) - map->clear_priv(map, map->priv); - map->priv = NULL; - map->clear_priv = NULL; - - if (map->mmaped) { - munmap(map->mmaped, bpf_map_mmap_sz(map)); - map->mmaped = NULL; - } - - if (map->st_ops) { - zfree(&map->st_ops->data); - zfree(&map->st_ops->progs); - zfree(&map->st_ops->kern_func_off); - zfree(&map->st_ops); - } - - zfree(&map->name); - zfree(&map->pin_path); - } + for (i = 0; i < obj->nr_maps; i++) + bpf_map__destroy(&obj->maps[i]); zfree(&obj->kconfig); zfree(&obj->externs); -- cgit v1.2.3 From 646f02ffdd49c466cb81642c2b013beb80092d01 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:27:39 -0700 Subject: libbpf: Add BTF-defined map-in-map support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As discussed at LPC 2019 ([0]), this patch brings (a quite belated) support for declarative BTF-defined map-in-map support in libbpf. It allows to define ARRAY_OF_MAPS and HASH_OF_MAPS BPF maps without any user-space initialization code involved. Additionally, it allows to initialize outer map's slots with references to respective inner maps at load time, also completely declaratively. Despite a weak type system of C, the way BTF-defined map-in-map definition works, it's actually quite hard to accidentally initialize outer map with incompatible inner maps. This being C, of course, it's still possible, but even that would be caught at load time and error returned with helpful debug log pointing exactly to the slot that failed to be initialized. 
As an example, here's a rather advanced HASH_OF_MAPS declaration and initialization, filling slots #0 and #4 with two inner maps: #include <bpf/bpf_helpers.h> struct inner_map { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); __type(key, int); __type(value, int); } inner_map1 SEC(".maps"), inner_map2 SEC(".maps"); struct outer_hash { __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); __uint(max_entries, 5); __uint(key_size, sizeof(int)); __array(values, struct inner_map); } outer_hash SEC(".maps") = { .values = { [0] = &inner_map2, [4] = &inner_map1, }, }; Here's the relevant part of the libbpf debug log, showing pretty clearly what's going on with map-in-map initialization: libbpf: .maps relo #0: for 6 value 0 rel.r_offset 96 name 260 ('inner_map1') libbpf: .maps relo #0: map 'outer_arr' slot [0] points to map 'inner_map1' libbpf: .maps relo #1: for 7 value 32 rel.r_offset 112 name 249 ('inner_map2') libbpf: .maps relo #1: map 'outer_arr' slot [2] points to map 'inner_map2' libbpf: .maps relo #2: for 7 value 32 rel.r_offset 144 name 249 ('inner_map2') libbpf: .maps relo #2: map 'outer_hash' slot [0] points to map 'inner_map2' libbpf: .maps relo #3: for 6 value 0 rel.r_offset 176 name 260 ('inner_map1') libbpf: .maps relo #3: map 'outer_hash' slot [4] points to map 'inner_map1' libbpf: map 'inner_map1': created successfully, fd=4 libbpf: map 'inner_map2': created successfully, fd=5 libbpf: map 'outer_hash': created successfully, fd=7 libbpf: map 'outer_hash': slot [0] set to map 'inner_map2' fd=5 libbpf: map 'outer_hash': slot [4] set to map 'inner_map1' fd=4 Notice from the log above that fd=6 (not logged explicitly) is used for the inner "prototype" map, necessary for creation of the outer map. It is destroyed immediately after the outer map is created. See also the included selftest, with comments explaining further details of usage.
Additionally, similar initialization syntax and libbpf functionality can be used to initialize BPF_PROG_ARRAY with references to BPF sub-programs. This can be done in follow-up patches if there is demand for it. [0] https://linuxplumbersconf.org/event/4/contributions/448/ Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200429002739.48006-4-andriin@fb.com --- tools/lib/bpf/bpf_helpers.h | 1 + tools/lib/bpf/libbpf.c | 281 +++++++++++++++++++-- .../selftests/bpf/prog_tests/btf_map_in_map.c | 49 ++++ .../selftests/bpf/progs/test_btf_map_in_map.c | 76 ++++++ 4 files changed, 384 insertions(+), 23 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c create mode 100644 tools/testing/selftests/bpf/progs/test_btf_map_in_map.c (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 60aad054eea1..da00b87aa199 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -12,6 +12,7 @@ #define __uint(name, val) int (*name)[val] #define __type(name, val) typeof(val) *name +#define __array(name, val) typeof(val) *name[] /* Helper macro to print out debug messages */ #define bpf_printk(fmt, ...)
\ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9c845cf4cfcf..445ee903f9cd 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -310,6 +310,7 @@ struct bpf_map { int map_ifindex; int inner_map_fd; struct bpf_map_def def; + __u32 btf_var_idx; __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 btf_vmlinux_value_type_id; @@ -318,6 +319,9 @@ struct bpf_map { enum libbpf_map_type libbpf_type; void *mmaped; struct bpf_struct_ops *st_ops; + struct bpf_map *inner_map; + void **init_slots; + int init_slots_sz; char *pin_path; bool pinned; bool reused; @@ -389,6 +393,7 @@ struct bpf_object { int nr_reloc_sects; int maps_shndx; int btf_maps_shndx; + __u32 btf_maps_sec_btf_id; int text_shndx; int symbols_shndx; int data_shndx; @@ -1918,7 +1923,7 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) static int parse_btf_map_def(struct bpf_object *obj, struct bpf_map *map, const struct btf_type *def, - bool strict, + bool strict, bool is_inner, const char *pin_root_path) { const struct btf_type *t; @@ -2036,10 +2041,79 @@ static int parse_btf_map_def(struct bpf_object *obj, } map->def.value_size = sz; map->btf_value_type_id = t->type; + } + else if (strcmp(name, "values") == 0) { + int err; + + if (is_inner) { + pr_warn("map '%s': multi-level inner maps not supported.\n", + map->name); + return -ENOTSUP; + } + if (i != vlen - 1) { + pr_warn("map '%s': '%s' member should be last.\n", + map->name, name); + return -EINVAL; + } + if (!bpf_map_type__is_map_in_map(map->def.type)) { + pr_warn("map '%s': should be map-in-map.\n", + map->name); + return -ENOTSUP; + } + if (map->def.value_size && map->def.value_size != 4) { + pr_warn("map '%s': conflicting value size %u != 4.\n", + map->name, map->def.value_size); + return -EINVAL; + } + map->def.value_size = 4; + t = btf__type_by_id(obj->btf, m->type); + if (!t) { + pr_warn("map '%s': map-in-map inner type [%d] not found.\n", + map->name, m->type); + return -EINVAL; + } + if 
(!btf_is_array(t) || btf_array(t)->nelems) { + pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n", + map->name); + return -EINVAL; + } + t = skip_mods_and_typedefs(obj->btf, btf_array(t)->type, + NULL); + if (!btf_is_ptr(t)) { + pr_warn("map '%s': map-in-map inner def is of unexpected kind %u.\n", + map->name, btf_kind(t)); + return -EINVAL; + } + t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + if (!btf_is_struct(t)) { + pr_warn("map '%s': map-in-map inner def is of unexpected kind %u.\n", + map->name, btf_kind(t)); + return -EINVAL; + } + + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->sec_idx = obj->efile.btf_maps_shndx; + map->inner_map->name = malloc(strlen(map->name) + + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map->name); + + err = parse_btf_map_def(obj, map->inner_map, t, strict, + true /* is_inner */, NULL); + if (err) + return err; } else if (strcmp(name, "pinning") == 0) { __u32 val; int err; + if (is_inner) { + pr_debug("map '%s': inner def can't be pinned.\n", + map->name); + return -EINVAL; + } if (!get_map_field_int(map->name, obj->btf, m, &val)) return -EINVAL; pr_debug("map '%s': found pinning = %u.\n", @@ -2138,10 +2212,11 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, map->def.type = BPF_MAP_TYPE_UNSPEC; map->sec_idx = sec_idx; map->sec_offset = vi->offset; + map->btf_var_idx = var_idx; pr_debug("map '%s': at sec_idx %d, offset %zu.\n", map_name, map->sec_idx, map->sec_offset); - return parse_btf_map_def(obj, map, def, strict, pin_root_path); + return parse_btf_map_def(obj, map, def, strict, false, pin_root_path); } static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, @@ -2174,6 +2249,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, name = btf__name_by_offset(obj->btf, t->name_off); if (strcmp(name, 
MAPS_ELF_SEC) == 0) { sec = t; + obj->efile.btf_maps_sec_btf_id = i; break; } } @@ -2560,7 +2636,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) /* Only do relo for section with exec instructions */ if (!section_have_execinstr(obj, sec) && - strcmp(name, ".rel" STRUCT_OPS_SEC)) { + strcmp(name, ".rel" STRUCT_OPS_SEC) && + strcmp(name, ".rel" MAPS_ELF_SEC)) { pr_debug("skip relo %s(%d) for section(%d)\n", name, idx, sec); continue; @@ -3538,6 +3615,22 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) create_attr.btf_value_type_id = map->btf_value_type_id; } + if (bpf_map_type__is_map_in_map(def->type)) { + if (map->inner_map) { + int err; + + err = bpf_object__create_map(obj, map->inner_map); + if (err) { + pr_warn("map '%s': failed to create inner map: %d\n", + map->name, err); + return err; + } + map->inner_map_fd = bpf_map__fd(map->inner_map); + } + if (map->inner_map_fd >= 0) + create_attr.inner_map_fd = map->inner_map_fd; + } + map->fd = bpf_create_map_xattr(&create_attr); if (map->fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { @@ -3558,6 +3651,11 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map) if (map->fd < 0) return -errno; + if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + return 0; } @@ -3602,6 +3700,31 @@ bpf_object__create_maps(struct bpf_object *obj) } } + if (map->init_slots_sz) { + for (j = 0; j < map->init_slots_sz; j++) { + const struct bpf_map *targ_map; + int fd; + + if (!map->init_slots[j]) + continue; + + targ_map = map->init_slots[j]; + fd = bpf_map__fd(targ_map); + err = bpf_map_update_elem(map->fd, &j, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", + map->name, j, targ_map->name, + fd, err); + goto err_out; + } + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", + map->name, j, 
targ_map->name, fd); + } + zfree(&map->init_slots); + map->init_slots_sz = 0; + } + if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { @@ -4873,9 +4996,118 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) return 0; } -static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj, - GElf_Shdr *shdr, - Elf_Data *data); +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + GElf_Shdr *shdr, Elf_Data *data); + +static int bpf_object__collect_map_relos(struct bpf_object *obj, + GElf_Shdr *shdr, Elf_Data *data) +{ + int i, j, nrels, new_sz, ptr_sz = sizeof(void *); + const struct btf_type *sec, *var, *def; + const struct btf_var_secinfo *vi; + const struct btf_member *member; + struct bpf_map *map, *targ_map; + const char *name, *mname; + Elf_Data *symbols; + unsigned int moff; + GElf_Sym sym; + GElf_Rel rel; + void *tmp; + + if (!obj->efile.btf_maps_sec_btf_id || !obj->btf) + return -EINVAL; + sec = btf__type_by_id(obj->btf, obj->efile.btf_maps_sec_btf_id); + if (!sec) + return -EINVAL; + + symbols = obj->efile.symbols; + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + if (!gelf_getrel(data, i, &rel)) { + pr_warn(".maps relo #%d: failed to get ELF relo\n", i); + return -LIBBPF_ERRNO__FORMAT; + } + if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) { + pr_warn(".maps relo #%d: symbol %zx not found\n", + i, (size_t)GELF_R_SYM(rel.r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, + sym.st_name) ? 
: ""; + if (sym.st_shndx != obj->efile.btf_maps_shndx) { + pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + + pr_debug(".maps relo #%d: for %zd value %zd rel.r_offset %zu name %d ('%s')\n", + i, (ssize_t)(rel.r_info >> 32), (size_t)sym.st_value, + (size_t)rel.r_offset, sym.st_name, name); + + for (j = 0; j < obj->nr_maps; j++) { + map = &obj->maps[j]; + if (map->sec_idx != obj->efile.btf_maps_shndx) + continue; + + vi = btf_var_secinfos(sec) + map->btf_var_idx; + if (vi->offset <= rel.r_offset && + rel.r_offset + sizeof(void *) <= vi->offset + vi->size) + break; + } + if (j == obj->nr_maps) { + pr_warn(".maps relo #%d: cannot find map '%s' at rel.r_offset %zu\n", + i, name, (size_t)rel.r_offset); + return -EINVAL; + } + + if (!bpf_map_type__is_map_in_map(map->def.type)) + return -EINVAL; + if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS && + map->def.key_size != sizeof(int)) { + pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n", + i, map->name, sizeof(int)); + return -EINVAL; + } + + targ_map = bpf_object__find_map_by_name(obj, name); + if (!targ_map) + return -ESRCH; + + var = btf__type_by_id(obj->btf, vi->type); + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (btf_vlen(def) == 0) + return -EINVAL; + member = btf_members(def) + btf_vlen(def) - 1; + mname = btf__name_by_offset(obj->btf, member->name_off); + if (strcmp(mname, "values")) + return -EINVAL; + + moff = btf_member_bit_offset(def, btf_vlen(def) - 1) / 8; + if (rel.r_offset - vi->offset < moff) + return -EINVAL; + + moff = rel.r_offset - vi->offset - moff; + if (moff % ptr_sz) + return -EINVAL; + moff /= ptr_sz; + if (moff >= map->init_slots_sz) { + new_sz = moff + 1; + tmp = realloc(map->init_slots, new_sz * ptr_sz); + if (!tmp) + return -ENOMEM; + map->init_slots = tmp; + memset(map->init_slots + map->init_slots_sz, 0, + (new_sz - map->init_slots_sz) * ptr_sz); + map->init_slots_sz = new_sz; + } + 
map->init_slots[moff] = targ_map; + + pr_debug(".maps relo #%d: map '%s' slot [%d] points to map '%s'\n", + i, map->name, moff, name); + } + + return 0; +} static int bpf_object__collect_reloc(struct bpf_object *obj) { @@ -4898,21 +5130,17 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) } if (idx == obj->efile.st_ops_shndx) { - err = bpf_object__collect_struct_ops_map_reloc(obj, - shdr, - data); - if (err) - return err; - continue; - } - - prog = bpf_object__find_prog_by_idx(obj, idx); - if (!prog) { - pr_warn("relocation failed: no section(%d)\n", idx); - return -LIBBPF_ERRNO__RELOC; + err = bpf_object__collect_st_ops_relos(obj, shdr, data); + } else if (idx == obj->efile.btf_maps_shndx) { + err = bpf_object__collect_map_relos(obj, shdr, data); + } else { + prog = bpf_object__find_prog_by_idx(obj, idx); + if (!prog) { + pr_warn("relocation failed: no prog in section(%d)\n", idx); + return -LIBBPF_ERRNO__RELOC; + } + err = bpf_program__collect_reloc(prog, shdr, data, obj); } - - err = bpf_program__collect_reloc(prog, shdr, data, obj); if (err) return err; } @@ -5984,6 +6212,14 @@ static void bpf_map__destroy(struct bpf_map *map) map->priv = NULL; map->clear_priv = NULL; + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + if (map->mmaped) { munmap(map->mmaped, bpf_map_mmap_sz(map)); map->mmaped = NULL; @@ -6543,9 +6779,8 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, } /* Collect the reloc from ELF and populate the st_ops->progs[] */ -static int bpf_object__collect_struct_ops_map_reloc(struct bpf_object *obj, - GElf_Shdr *shdr, - Elf_Data *data) +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + GElf_Shdr *shdr, Elf_Data *data) { const struct btf_member *member; struct bpf_struct_ops *st_ops; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c 
b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c new file mode 100644 index 000000000000..f7ee8fa377ad --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include + +#include "test_btf_map_in_map.skel.h" + +void test_btf_map_in_map(void) +{ + int duration = 0, err, key = 0, val; + struct test_btf_map_in_map* skel; + + skel = test_btf_map_in_map__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) + return; + + err = test_btf_map_in_map__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* inner1 = input, inner2 = input + 1 */ + val = bpf_map__fd(skel->maps.inner_map1); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 1; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); + + /* inner1 = input + 1, inner2 = input */ + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map1); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 3; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); + +cleanup: + test_btf_map_in_map__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c 
b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c new file mode 100644 index 000000000000..e5093796be97 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#include +#include + +struct inner_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} inner_map1 SEC(".maps"), + inner_map2 SEC(".maps"); + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 3); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + /* it's possible to use anonymous struct as inner map definition here */ + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + /* changing max_entries to 2 will fail during load + * due to incompatibility with inner_map definition */ + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + }); +} outer_arr SEC(".maps") = { + /* (void *) cast is necessary because we didn't use `struct inner_map` + * in __inner(values, ...) + * Actually, a conscious effort is required to screw up initialization + * of inner map slots, which is a great thing! + */ + .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, +}; + +struct outer_hash { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 5); + __uint(key_size, sizeof(int)); + /* Here everything works flawlessly due to reuse of struct inner_map + * and compiler will complain at the attempt to use non-inner_map + * references below. This is great experience. 
+ */ + __array(values, struct inner_map); +} outer_hash SEC(".maps") = { + .values = { + [0] = &inner_map2, + [4] = &inner_map1, + }, +}; + +int input = 0; + +SEC("raw_tp/sys_enter") +int handle__sys_enter(void *ctx) +{ + struct inner_map *inner_map; + int key = 0, val; + + inner_map = bpf_map_lookup_elem(&outer_arr, &key); + if (!inner_map) + return 1; + val = input; + bpf_map_update_elem(inner_map, &key, &val, 0); + + inner_map = bpf_map_lookup_elem(&outer_hash, &key); + if (!inner_map) + return 1; + val = input + 1; + bpf_map_update_elem(inner_map, &key, &val, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 76148faa161e7cfb2d7719f35b37d7db4f3f8596 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:01 -0700 Subject: selftests/bpf: Ensure test flavors use correct skeletons Ensure that test runner flavors include their own skeletons from the <flavor>/ directory. Previously, skeletons generated for no-flavor test_progs were used. Apart from fixing correctness, this also makes it possible to compile only flavors individually: $ make clean && make test_progs-no_alu32 ... now succeeds ... Fixes: 74b5a5968fe8 ("selftests/bpf: Replace test_progs and test_maps w/ general rule") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-2-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4e654d41c7af..01c95f8278c7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -324,7 +324,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + cd $$(@D) && $$(CC) -I.
$$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ -- cgit v1.2.3 From 02995dd4bb02a5359a08e44abb3c18c2f456bd19 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:02 -0700 Subject: selftests/bpf: Add SAN_CFLAGS param to selftests build to allow sanitizers Add ability to specify extra compiler flags with SAN_CFLAGS for compilation of all user-space C files. This allows building all of the selftest programs with, e.g., custom sanitizer flags, without requiring support for such sanitizers from anyone compiling selftests/bpf. As an example, to compile everything with AddressSanitizer, one would do: $ make clean && make SAN_CFLAGS="-fsanitize=address" For AddressSanitizer to work, one needs appropriate libasan shared library installed in the system, with version of libasan matching what GCC links against. E.g., GCC8 needs libasan5, while GCC7 uses libasan4. For CentOS 7, to build everything successfully one would need to: $ sudo yum install devtoolset-8-gcc devtoolset-libasan-devel $ scl enable devtoolset-8 bash # set up environment For Arch Linux to run selftests, one would need to install gcc-libs package to get libasan.so.5: $ sudo pacman -S gcc-libs N.B. EXTRA_CFLAGS name wasn't used, because it's also used by libbpf's Makefile and this causes a few issues: 1. default "-g -Wall" flags are overridden; 2. compiling shared library with AddressSanitizer generates a bunch of symbols like: "_GLOBAL__sub_D_00099_0_btf_dump.c", "_GLOBAL__sub_D_00099_0_bpf.c", etc, which screws up versioned symbols check.
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Julia Kartseva Link: https://lore.kernel.org/bpf/20200429012111.277390-3-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 01c95f8278c7..887f06a514ee 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -20,9 +20,10 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) \ - -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ - -I$(APIDIR) \ +SAN_CFLAGS ?= +CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread -- cgit v1.2.3 From 42fce2cfb405e613f0355c4f92429d651bf0a5b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:03 -0700 Subject: selftests/bpf: Convert test_hashmap into test_progs test Fold stand-alone test_hashmap test into test_progs. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-4-andriin@fb.com --- tools/testing/selftests/bpf/.gitignore | 2 - tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/prog_tests/hashmap.c | 380 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_hashmap.c | 382 ----------------------- 4 files changed, 381 insertions(+), 385 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/hashmap.c delete mode 100644 tools/testing/selftests/bpf/test_hashmap.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c30079c86998..16b9774d8b68 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -30,8 +30,6 @@ test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user test_sysctl -test_hashmap -test_btf_dump test_current_pid_tgid_new_ns xdping test_cpp diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 887f06a514ee..10f12a5aac20 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,7 +33,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \ test_progs-no_alu32 \ test_current_pid_tgid_new_ns diff --git a/tools/testing/selftests/bpf/prog_tests/hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c new file mode 100644 index 000000000000..428d488830c6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Tests for libbpf's hashmap. 
+ * + * Copyright (c) 2019 Facebook + */ +#include "test_progs.h" +#include "bpf/hashmap.h" + +static int duration = 0; + +static size_t hash_fn(const void *k, void *ctx) +{ + return (long)k; +} + +static bool equal_fn(const void *a, const void *b, void *ctx) +{ + return (long)a == (long)b; +} + +static inline size_t next_pow_2(size_t n) +{ + size_t r = 1; + + while (r < n) + r <<= 1; + return r; +} + +static inline size_t exp_cap(size_t sz) +{ + size_t r = next_pow_2(sz); + + if (sz * 4 / 3 > r) + r <<= 1; + return r; +} + +#define ELEM_CNT 62 + +static void test_hashmap_generic(void) +{ + struct hashmap_entry *entry, *tmp; + int err, bkt, found_cnt, i; + long long found_msk; + struct hashmap *map; + + map = hashmap__new(hash_fn, equal_fn, NULL); + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; + + for (i = 0; i < ELEM_CNT; i++) { + const void *oldk, *k = (const void *)(long)i; + void *oldv, *v = (void *)(long)(1024 + i); + + err = hashmap__update(map, k, v, &oldk, &oldv); + if (CHECK(err != -ENOENT, "hashmap__update", + "unexpected result: %d\n", err)) + goto cleanup; + + if (i % 2) { + err = hashmap__add(map, k, v); + } else { + err = hashmap__set(map, k, v, &oldk, &oldv); + if (CHECK(oldk != NULL || oldv != NULL, "check_kv", + "unexpected k/v: %p=%p\n", oldk, oldv)) + goto cleanup; + } + + if (CHECK(err, "elem_add", "failed to add k/v %ld = %ld: %d\n", + (long)k, (long)v, err)) + goto cleanup; + + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", + "failed to find key %ld\n", (long)k)) + goto cleanup; + if (CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; + } + + if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", + "invalid map size: %zu\n", hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap_cap", + "unexpected map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + + found_msk = 0; + 
hashmap__for_each_entry(map, entry, bkt) { + long k = (long)entry->key; + long v = (long)entry->value; + + found_msk |= 1ULL << k; + if (CHECK(v - k != 1024, "check_kv", + "invalid k/v pair: %ld = %ld\n", k, v)) + goto cleanup; + } + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", + "not all keys iterated: %llx\n", found_msk)) + goto cleanup; + + for (i = 0; i < ELEM_CNT; i++) { + const void *oldk, *k = (const void *)(long)i; + void *oldv, *v = (void *)(long)(256 + i); + + err = hashmap__add(map, k, v); + if (CHECK(err != -EEXIST, "hashmap__add", + "unexpected add result: %d\n", err)) + goto cleanup; + + if (i % 2) + err = hashmap__update(map, k, v, &oldk, &oldv); + else + err = hashmap__set(map, k, v, &oldk, &oldv); + + if (CHECK(err, "elem_upd", + "failed to update k/v %ld = %ld: %d\n", + (long)k, (long)v, err)) + goto cleanup; + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", + "failed to find key %ld\n", (long)k)) + goto cleanup; + if (CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; + } + + if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", + "invalid updated map size: %zu\n", hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", + "unexpected map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + + found_msk = 0; + hashmap__for_each_entry_safe(map, entry, tmp, bkt) { + long k = (long)entry->key; + long v = (long)entry->value; + + found_msk |= 1ULL << k; + if (CHECK(v - k != 256, "elem_check", + "invalid updated k/v pair: %ld = %ld\n", k, v)) + goto cleanup; + } + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", + "not all keys iterated after update: %llx\n", found_msk)) + goto cleanup; + + found_cnt = 0; + hashmap__for_each_key_entry(map, entry, (void *)0) { + found_cnt++; + } + if (CHECK(!found_cnt, "found_cnt", + "didn't find any entries for key 0\n")) + goto cleanup; + + found_msk = 0; + found_cnt = 
0; + hashmap__for_each_key_entry_safe(map, entry, tmp, (void *)0) { + const void *oldk, *k; + void *oldv, *v; + + k = entry->key; + v = entry->value; + + found_cnt++; + found_msk |= 1ULL << (long)k; + + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "failed to delete k/v %ld = %ld\n", + (long)k, (long)v)) + goto cleanup; + if (CHECK(oldk != k || oldv != v, "check_old", + "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n", + (long)k, (long)v, (long)oldk, (long)oldv)) + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "unexpectedly deleted k/v %ld = %ld\n", + (long)oldk, (long)oldv)) + goto cleanup; + } + + if (CHECK(!found_cnt || !found_msk, "found_entries", + "didn't delete any key entries\n")) + goto cleanup; + if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, "elem_cnt", + "invalid updated map size (already deleted: %d): %zu\n", + found_cnt, hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", + "unexpected map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + + hashmap__for_each_entry_safe(map, entry, tmp, bkt) { + const void *oldk, *k; + void *oldv, *v; + + k = entry->key; + v = entry->value; + + found_cnt++; + found_msk |= 1ULL << (long)k; + + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "failed to delete k/v %ld = %ld\n", + (long)k, (long)v)) + goto cleanup; + if (CHECK(oldk != k || oldv != v, "elem_check", + "invalid old k/v: expect %ld = %ld, got %ld = %ld\n", + (long)k, (long)v, (long)oldk, (long)oldv)) + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", + "unexpectedly deleted k/v %ld = %ld\n", + (long)k, (long)v)) + goto cleanup; + } + + if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1, + "found_cnt", + "not all keys were deleted: found_cnt:%d, found_msk:%llx\n", + found_cnt, found_msk)) + goto cleanup; + if (CHECK(hashmap__size(map) != 0, 
"hashmap__size", + "invalid updated map size (already deleted: %d): %zu\n", + found_cnt, hashmap__size(map))) + goto cleanup; + + found_cnt = 0; + hashmap__for_each_entry(map, entry, bkt) { + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; + } + + hashmap__clear(map); + hashmap__for_each_entry(map, entry, bkt) { + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; + } + +cleanup: + hashmap__free(map); +} + +static size_t collision_hash_fn(const void *k, void *ctx) +{ + return 0; +} + +static void test_hashmap_multimap(void) +{ + void *k1 = (void *)0, *k2 = (void *)1; + struct hashmap_entry *entry; + struct hashmap *map; + long found_msk; + int err, bkt; + + /* force collisions */ + map = hashmap__new(collision_hash_fn, equal_fn, NULL); + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; + + /* set up multimap: + * [0] -> 1, 2, 4; + * [1] -> 8, 16, 32; + */ + err = hashmap__append(map, k1, (void *)1); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k1, (void *)2); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k1, (void *)4); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + + err = hashmap__append(map, k2, (void *)8); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k2, (void *)16); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + err = hashmap__append(map, k2, (void *)32); + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; + + if (CHECK(hashmap__size(map) != 6, "hashmap_size", + "invalid map size: %zu\n", hashmap__size(map))) + goto cleanup; + + /* verify global iteration still works and sees all 
values */ + found_msk = 0; + hashmap__for_each_entry(map, entry, bkt) { + found_msk |= (long)entry->value; + } + if (CHECK(found_msk != (1 << 6) - 1, "found_msk", + "not all keys iterated: %lx\n", found_msk)) + goto cleanup; + + /* iterate values for key 1 */ + found_msk = 0; + hashmap__for_each_key_entry(map, entry, k1) { + found_msk |= (long)entry->value; + } + if (CHECK(found_msk != (1 | 2 | 4), "found_msk", + "invalid k1 values: %lx\n", found_msk)) + goto cleanup; + + /* iterate values for key 2 */ + found_msk = 0; + hashmap__for_each_key_entry(map, entry, k2) { + found_msk |= (long)entry->value; + } + if (CHECK(found_msk != (8 | 16 | 32), "found_msk", + "invalid k2 values: %lx\n", found_msk)) + goto cleanup; + +cleanup: + hashmap__free(map); +} + +static void test_hashmap_empty() +{ + struct hashmap_entry *entry; + int bkt; + struct hashmap *map; + void *k = (void *)0; + + /* force collisions */ + map = hashmap__new(hash_fn, equal_fn, NULL); + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + goto cleanup; + + if (CHECK(hashmap__size(map) != 0, "hashmap__size", + "invalid map size: %zu\n", hashmap__size(map))) + goto cleanup; + if (CHECK(hashmap__capacity(map) != 0, "hashmap__capacity", + "invalid map capacity: %zu\n", hashmap__capacity(map))) + goto cleanup; + if (CHECK(hashmap__find(map, k, NULL), "elem_find", + "unexpected find\n")) + goto cleanup; + if (CHECK(hashmap__delete(map, k, NULL, NULL), "elem_del", + "unexpected delete\n")) + goto cleanup; + + hashmap__for_each_entry(map, entry, bkt) { + CHECK(false, "elem_found", "unexpected iterated entry\n"); + goto cleanup; + } + hashmap__for_each_key_entry(map, entry, k) { + CHECK(false, "key_found", "unexpected key entry\n"); + goto cleanup; + } + +cleanup: + hashmap__free(map); +} + +void test_hashmap() +{ + if (test__start_subtest("generic")) + test_hashmap_generic(); + if (test__start_subtest("multimap")) + test_hashmap_multimap(); + if (test__start_subtest("empty")) 
+ test_hashmap_empty(); +} diff --git a/tools/testing/selftests/bpf/test_hashmap.c b/tools/testing/selftests/bpf/test_hashmap.c deleted file mode 100644 index c490e012c23f..000000000000 --- a/tools/testing/selftests/bpf/test_hashmap.c +++ /dev/null @@ -1,382 +0,0 @@ -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) - -/* - * Tests for libbpf's hashmap. - * - * Copyright (c) 2019 Facebook - */ -#include -#include -#include -#include "bpf/hashmap.h" - -#define CHECK(condition, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ - fprintf(stderr, format); \ - } \ - __ret; \ -}) - -size_t hash_fn(const void *k, void *ctx) -{ - return (long)k; -} - -bool equal_fn(const void *a, const void *b, void *ctx) -{ - return (long)a == (long)b; -} - -static inline size_t next_pow_2(size_t n) -{ - size_t r = 1; - - while (r < n) - r <<= 1; - return r; -} - -static inline size_t exp_cap(size_t sz) -{ - size_t r = next_pow_2(sz); - - if (sz * 4 / 3 > r) - r <<= 1; - return r; -} - -#define ELEM_CNT 62 - -int test_hashmap_generic(void) -{ - struct hashmap_entry *entry, *tmp; - int err, bkt, found_cnt, i; - long long found_msk; - struct hashmap *map; - - fprintf(stderr, "%s: ", __func__); - - map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - - for (i = 0; i < ELEM_CNT; i++) { - const void *oldk, *k = (const void *)(long)i; - void *oldv, *v = (void *)(long)(1024 + i); - - err = hashmap__update(map, k, v, &oldk, &oldv); - if (CHECK(err != -ENOENT, "unexpected result: %d\n", err)) - return 1; - - if (i % 2) { - err = hashmap__add(map, k, v); - } else { - err = hashmap__set(map, k, v, &oldk, &oldv); - if (CHECK(oldk != NULL || oldv != NULL, - "unexpected k/v: %p=%p\n", oldk, oldv)) - return 1; - } - - if (CHECK(err, "failed to add k/v %ld = %ld: %d\n", - (long)k, (long)v, err)) - return 1; - - if (CHECK(!hashmap__find(map, k, &oldv), - 
"failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; - } - - if (CHECK(hashmap__size(map) != ELEM_CNT, - "invalid map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), - "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; - - found_msk = 0; - hashmap__for_each_entry(map, entry, bkt) { - long k = (long)entry->key; - long v = (long)entry->value; - - found_msk |= 1ULL << k; - if (CHECK(v - k != 1024, "invalid k/v pair: %ld = %ld\n", k, v)) - return 1; - } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, - "not all keys iterated: %llx\n", found_msk)) - return 1; - - for (i = 0; i < ELEM_CNT; i++) { - const void *oldk, *k = (const void *)(long)i; - void *oldv, *v = (void *)(long)(256 + i); - - err = hashmap__add(map, k, v); - if (CHECK(err != -EEXIST, "unexpected add result: %d\n", err)) - return 1; - - if (i % 2) - err = hashmap__update(map, k, v, &oldk, &oldv); - else - err = hashmap__set(map, k, v, &oldk, &oldv); - - if (CHECK(err, "failed to update k/v %ld = %ld: %d\n", - (long)k, (long)v, err)) - return 1; - if (CHECK(!hashmap__find(map, k, &oldv), - "failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; - } - - if (CHECK(hashmap__size(map) != ELEM_CNT, - "invalid updated map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), - "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; - - found_msk = 0; - hashmap__for_each_entry_safe(map, entry, tmp, bkt) { - long k = (long)entry->key; - long v = (long)entry->value; - - found_msk |= 1ULL << k; - if (CHECK(v - k != 256, - "invalid updated k/v pair: %ld = %ld\n", k, v)) - return 1; - } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, - "not all keys iterated after update: %llx\n", found_msk)) - return 1; - - 
found_cnt = 0; - hashmap__for_each_key_entry(map, entry, (void *)0) { - found_cnt++; - } - if (CHECK(!found_cnt, "didn't find any entries for key 0\n")) - return 1; - - found_msk = 0; - found_cnt = 0; - hashmap__for_each_key_entry_safe(map, entry, tmp, (void *)0) { - const void *oldk, *k; - void *oldv, *v; - - k = entry->key; - v = entry->value; - - found_cnt++; - found_msk |= 1ULL << (long)k; - - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), - "failed to delete k/v %ld = %ld\n", - (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, - "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n", - (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), - "unexpectedly deleted k/v %ld = %ld\n", - (long)oldk, (long)oldv)) - return 1; - } - - if (CHECK(!found_cnt || !found_msk, - "didn't delete any key entries\n")) - return 1; - if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, - "invalid updated map size (already deleted: %d): %zu\n", - found_cnt, hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), - "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; - - hashmap__for_each_entry_safe(map, entry, tmp, bkt) { - const void *oldk, *k; - void *oldv, *v; - - k = entry->key; - v = entry->value; - - found_cnt++; - found_msk |= 1ULL << (long)k; - - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), - "failed to delete k/v %ld = %ld\n", - (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, - "invalid old k/v: expect %ld = %ld, got %ld = %ld\n", - (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), - "unexpectedly deleted k/v %ld = %ld\n", - (long)k, (long)v)) - return 1; - } - - if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1, - "not all keys were deleted: found_cnt:%d, found_msk:%llx\n", - found_cnt, found_msk)) - return 1; - if 
(CHECK(hashmap__size(map) != 0, - "invalid updated map size (already deleted: %d): %zu\n", - found_cnt, hashmap__size(map))) - return 1; - - found_cnt = 0; - hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; - } - - hashmap__free(map); - hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; - } - - fprintf(stderr, "OK\n"); - return 0; -} - -size_t collision_hash_fn(const void *k, void *ctx) -{ - return 0; -} - -int test_hashmap_multimap(void) -{ - void *k1 = (void *)0, *k2 = (void *)1; - struct hashmap_entry *entry; - struct hashmap *map; - long found_msk; - int err, bkt; - - fprintf(stderr, "%s: ", __func__); - - /* force collisions */ - map = hashmap__new(collision_hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - - - /* set up multimap: - * [0] -> 1, 2, 4; - * [1] -> 8, 16, 32; - */ - err = hashmap__append(map, k1, (void *)1); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k1, (void *)2); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k1, (void *)4); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - - err = hashmap__append(map, k2, (void *)8); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k2, (void *)16); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - err = hashmap__append(map, k2, (void *)32); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; - - if (CHECK(hashmap__size(map) != 6, - "invalid map size: %zu\n", hashmap__size(map))) - return 1; - - /* verify global iteration still works and sees all values */ - found_msk = 0; - hashmap__for_each_entry(map, entry, bkt) { - found_msk |= (long)entry->value; - } - if (CHECK(found_msk 
!= (1 << 6) - 1, - "not all keys iterated: %lx\n", found_msk)) - return 1; - - /* iterate values for key 1 */ - found_msk = 0; - hashmap__for_each_key_entry(map, entry, k1) { - found_msk |= (long)entry->value; - } - if (CHECK(found_msk != (1 | 2 | 4), - "invalid k1 values: %lx\n", found_msk)) - return 1; - - /* iterate values for key 2 */ - found_msk = 0; - hashmap__for_each_key_entry(map, entry, k2) { - found_msk |= (long)entry->value; - } - if (CHECK(found_msk != (8 | 16 | 32), - "invalid k2 values: %lx\n", found_msk)) - return 1; - - fprintf(stderr, "OK\n"); - return 0; -} - -int test_hashmap_empty() -{ - struct hashmap_entry *entry; - int bkt; - struct hashmap *map; - void *k = (void *)0; - - fprintf(stderr, "%s: ", __func__); - - /* force collisions */ - map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - - if (CHECK(hashmap__size(map) != 0, - "invalid map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != 0, - "invalid map capacity: %zu\n", hashmap__capacity(map))) - return 1; - if (CHECK(hashmap__find(map, k, NULL), "unexpected find\n")) - return 1; - if (CHECK(hashmap__delete(map, k, NULL, NULL), "unexpected delete\n")) - return 1; - - hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected iterated entry\n"); - return 1; - } - hashmap__for_each_key_entry(map, entry, k) { - CHECK(false, "unexpected key entry\n"); - return 1; - } - - fprintf(stderr, "OK\n"); - return 0; -} - -int main(int argc, char **argv) -{ - bool failed = false; - - if (test_hashmap_generic()) - failed = true; - if (test_hashmap_multimap()) - failed = true; - if (test_hashmap_empty()) - failed = true; - - return failed; -} -- cgit v1.2.3 From 229bf8bf4d910510bc1a2fd0b89bd467cd71050d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:04 -0700 Subject: libbpf: Fix memory leak and possible double-free in hashmap__clear Fix memory leak in 
hashmap__clear() not freeing hashmap_entry structs for each of the remaining entries. Also NULL-out bucket list to prevent possible double-free between hashmap__clear() and hashmap__free(). Running test_progs-asan flavor clearly showed this problem. Reported-by: Alston Tang Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-5-andriin@fb.com --- tools/lib/bpf/hashmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c index 54c30c802070..cffb96202e0d 100644 --- a/tools/lib/bpf/hashmap.c +++ b/tools/lib/bpf/hashmap.c @@ -59,7 +59,14 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, void hashmap__clear(struct hashmap *map) { + struct hashmap_entry *cur, *tmp; + int bkt; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + free(cur); + } free(map->buckets); + map->buckets = NULL; map->cap = map->cap_bits = map->sz = 0; } -- cgit v1.2.3 From f25d5416d64c796aa639136eb0b076c8bd579b54 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:05 -0700 Subject: selftests/bpf: Fix memory leak in test selector Free test selector substrings, which were strdup()'ed. 
Fixes: b65053cd94f4 ("selftests/bpf: Add whitelist/blacklist of test names to test_progs") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-6-andriin@fb.com --- tools/testing/selftests/bpf/test_progs.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index b521e0a512b6..86d0020c9eec 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -420,6 +420,18 @@ static int libbpf_print_fn(enum libbpf_print_level level, return 0; } +static void free_str_set(const struct str_set *set) +{ + int i; + + if (!set) + return; + + for (i = 0; i < set->cnt; i++) + free((void *)set->strs[i]); + free(set->strs); +} + static int parse_str_list(const char *s, struct str_set *set) { char *input, *state = NULL, *next, **tmp, **strs = NULL; @@ -756,11 +768,11 @@ int main(int argc, char **argv) fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); - free(env.test_selector.blacklist.strs); - free(env.test_selector.whitelist.strs); + free_str_set(&env.test_selector.blacklist); + free_str_set(&env.test_selector.whitelist); free(env.test_selector.num_set); - free(env.subtest_selector.blacklist.strs); - free(env.subtest_selector.whitelist.strs); + free_str_set(&env.subtest_selector.blacklist); + free_str_set(&env.subtest_selector.whitelist); free(env.subtest_selector.num_set); return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS; -- cgit v1.2.3 From 9f56bb531a809ecaa7f0ddca61d2cf3adc1cb81a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:06 -0700 Subject: selftests/bpf: Fix memory leak in extract_build_id() getline() allocates string, which has to be freed. 
Fixes: 81f77fd0deeb ("bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Song Liu Link: https://lore.kernel.org/bpf/20200429012111.277390-7-andriin@fb.com --- tools/testing/selftests/bpf/test_progs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 86d0020c9eec..93970ec1c9e9 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -351,6 +351,7 @@ int extract_build_id(char *build_id, size_t size) len = size; memcpy(build_id, line, len); build_id[len] = '\0'; + free(line); return 0; err: fclose(fp); -- cgit v1.2.3 From 13c908495e5d51718a6da84ae925fa2aac056380 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:07 -0700 Subject: selftests/bpf: Fix invalid memory reads in core_relo selftest Another one found by AddressSanitizer. input_len is bigger than actually initialized data size. 
Fixes: c7566a69695c ("selftests/bpf: Add field existence CO-RE relocs tests") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-8-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 31e177adbdf1..084ed26a7d78 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -392,7 +392,7 @@ static struct core_reloc_test_case test_cases[] = { .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___minimal) { .a = 42, }, - .input_len = sizeof(struct core_reloc_existence), + .input_len = sizeof(struct core_reloc_existence___minimal), .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) { .a_exists = 1, .b_exists = 0, -- cgit v1.2.3 From 3521ffa2ee9a48c3236c93f54ae11c074490ebce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:08 -0700 Subject: libbpf: Fix huge memory leak in libbpf_find_vmlinux_btf_id() BTF object wasn't freed. 
Fixes: a6ed02cac690 ("libbpf: Load btf_vmlinux only once per object.") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: KP Singh Link: https://lore.kernel.org/bpf/20200429012111.277390-9-andriin@fb.com --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 445ee903f9cd..d86ff8214b96 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6934,6 +6934,7 @@ int libbpf_find_vmlinux_btf_id(const char *name, enum bpf_attach_type attach_type) { struct btf *btf; + int err; btf = libbpf_find_kernel_btf(); if (IS_ERR(btf)) { @@ -6941,7 +6942,9 @@ int libbpf_find_vmlinux_btf_id(const char *name, return -EINVAL; } - return __find_vmlinux_btf_id(btf, name, attach_type); + err = __find_vmlinux_btf_id(btf, name, attach_type); + btf__free(btf); + return err; } static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) -- cgit v1.2.3 From 36d0b6159f6a6f51f600bf1777702f7036fb9839 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:09 -0700 Subject: selftests/bpf: Disable ASAN instrumentation for mmap()'ed memory read AddressSanitizer assumes that all memory dereferences are done against memory allocated by sanitizer's malloc()/free() code and not touched by anyone else. Seems like this doesn't hold for perf buffer memory. Disable instrumentation on perf buffer callback function. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429012111.277390-10-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/perf_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 1450ea2dd4cc..a122ce3b360e 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -6,6 +6,11 @@ #include #include "bpf/libbpf_internal.h" +/* AddressSanitizer sometimes crashes due to data dereference below, due to + * this being mmap()'ed memory. Disable instrumentation with + * no_sanitize_address attribute + */ +__attribute__((no_sanitize_address)) static void on_sample(void *ctx, int cpu, void *data, __u32 size) { int cpu_data = *(int *)data, duration = 0; -- cgit v1.2.3 From 8d30e80a049ad699264e4a12911e349f93c7279a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:10 -0700 Subject: selftests/bpf: Fix bpf_link leak in ns_current_pid_tgid selftest If condition is inverted, but it's also just not necessary. 
Fixes: 1c1052e0140a ("tools/testing/selftests/bpf: Add self-tests for new helper bpf_get_ns_current_pid_tgid.") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Carlos Neira Link: https://lore.kernel.org/bpf/20200429012111.277390-11-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 542240e16564..e74dc501b27f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -80,9 +80,6 @@ void test_ns_current_pid_tgid(void) "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) goto cleanup; cleanup: - if (!link) { - bpf_link__destroy(link); - link = NULL; - } + bpf_link__destroy(link); bpf_object__close(obj); } -- cgit v1.2.3 From e4e8f4d047fdcf7ac7d944e266e85d8041f16cd6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 18:21:11 -0700 Subject: selftests/bpf: Add runqslower binary to .gitignore With recent changes, runqslower is being copied into selftests/bpf root directory. So add it into .gitignore. 
Fixes: b26d1e2b6028 ("selftests/bpf: Copy runqslower to OUTPUT directory") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Veronika Kabatova Link: https://lore.kernel.org/bpf/20200429012111.277390-12-andriin@fb.com --- tools/testing/selftests/bpf/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 16b9774d8b68..3ff031972975 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -37,4 +37,4 @@ test_cpp /no_alu32 /bpf_gcc /tools - +/runqslower -- cgit v1.2.3 From e3450b79dfe47632ffa65042c6d5a6b48263da4e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Apr 2020 15:45:04 +0100 Subject: tools: bpftool: For "feature probe" define "full_mode" bool as global The "full_mode" variable used for switching between full or partial feature probing (i.e. with or without probing helpers that will log warnings in kernel logs) was piped from the main do_probe() function down to probe_helpers_for_progtype(), where it is needed. Define it as a global variable: the calls will be more readable, and if other similar flags were to be used in the future, we could use global variables as well instead of extending again the list of arguments with new flags. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429144506.8999-2-quentin@isovalent.com --- tools/bpf/bpftool/feature.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 88718ee6a438..59e4cb44efbc 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -35,6 +35,8 @@ static const char * const helper_name[] = { #undef BPF_HELPER_MAKE_ENTRY +static bool full_mode; + /* Miscellaneous utility functions */ static bool check_procfs(void) @@ -540,8 +542,7 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, static void probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, - const char *define_prefix, bool full_mode, - __u32 ifindex) + const char *define_prefix, __u32 ifindex) { const char *ptype_name = prog_type_name[prog_type]; char feat_name[128]; @@ -678,8 +679,7 @@ static void section_map_types(const char *define_prefix, __u32 ifindex) } static void -section_helpers(bool *supported_types, const char *define_prefix, - bool full_mode, __u32 ifindex) +section_helpers(bool *supported_types, const char *define_prefix, __u32 ifindex) { unsigned int i; @@ -704,8 +704,8 @@ section_helpers(bool *supported_types, const char *define_prefix, define_prefix, define_prefix, define_prefix, define_prefix); for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) - probe_helpers_for_progtype(i, supported_types[i], - define_prefix, full_mode, ifindex); + probe_helpers_for_progtype(i, supported_types[i], define_prefix, + ifindex); print_end_section(); } @@ -725,7 +725,6 @@ static int do_probe(int argc, char **argv) enum probe_component target = COMPONENT_UNSPEC; const char *define_prefix = NULL; bool supported_types[128] = {}; - bool full_mode = false; __u32 ifindex = 0; char *ifname; @@ -803,7 +802,7 @@ static 
int do_probe(int argc, char **argv) goto exit_close_json; section_program_types(supported_types, define_prefix, ifindex); section_map_types(define_prefix, ifindex); - section_helpers(supported_types, define_prefix, full_mode, ifindex); + section_helpers(supported_types, define_prefix, ifindex); section_misc(define_prefix, ifindex); exit_close_json: -- cgit v1.2.3 From cf9bf714523dbbc97953be6de6ca14d57d4f8a21 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Apr 2020 15:45:05 +0100 Subject: tools: bpftool: Allow unprivileged users to probe features There is demand for a way to identify what BPF helper functions are available to unprivileged users. To do so, allow unprivileged users to run "bpftool feature probe" to list BPF-related features. This will only show features accessible to those users, and may not reflect the full list of features available (to administrators) on the system. To avoid the case where bpftool is inadvertently run as non-root and would list only a subset of the features supported by the system when it would be expected to list all of them, running as unprivileged is gated behind the "unprivileged" keyword passed to the command line. When used by a privileged user, this keyword allows to drop the CAP_SYS_ADMIN and to list the features available to unprivileged users. Note that this adds a dependency on libcap for compiling bpftool. Note that there is no particular reason why the probes were restricted to root, other than the fact I did not need them for unprivileged and did not bother with the additional checks at the time probes were added. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429144506.8999-3-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-feature.rst | 10 +- tools/bpf/bpftool/Makefile | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 2 +- tools/bpf/bpftool/feature.c | 102 ++++++++++++++++++--- 4 files changed, 100 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index b04156cfd7a3..ca085944e4cf 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -19,7 +19,7 @@ SYNOPSIS FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } @@ -49,6 +49,14 @@ DESCRIPTION Keyword **kernel** can be omitted. If no probe target is specified, probing the kernel is the default behaviour. + When the **unprivileged** keyword is used, bpftool will dump + only the features available to a user who does not have the + **CAP_SYS_ADMIN** capability set. The features available in + that case usually represent a small subset of the parameters + supported by the system. Unprivileged users MUST use the + **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. + **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] Probe network device for supported eBPF features and dump results to the console. 
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index f584d1fdfc64..89d7962a4a44 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -55,7 +55,7 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif -LIBS = $(LIBBPF) -lelf -lz +LIBS = $(LIBBPF) -lelf -lz -lcap INSTALL ?= install RM ?= rm -f diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index c033c3329f73..fc989ead7313 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1079,7 +1079,7 @@ _bpftool() COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) ) fi _bpftool_one_of_list 'kernel dev' - _bpftool_once_attr 'full' + _bpftool_once_attr 'full unprivileged' return 0 ;; *) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 59e4cb44efbc..952f4b1987c0 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,7 @@ static const char * const helper_name[] = { #undef BPF_HELPER_MAKE_ENTRY static bool full_mode; +static bool run_as_unprivileged; /* Miscellaneous utility functions */ @@ -473,6 +475,11 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, } res = bpf_probe_prog_type(prog_type, ifindex); + /* Probe may succeed even if program load fails, for unprivileged users + * check that we did not fail because of insufficient permissions + */ + if (run_as_unprivileged && errno == EPERM) + res = false; supported_types[prog_type] |= res; @@ -501,6 +508,10 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix, res = bpf_probe_map_type(map_type, ifindex); + /* Probe result depends on the success of map creation, no additional + * check required for unprivileged users + */ + maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; if (strlen(map_type_name[map_type]) > maxlen) { p_info("map type name too long"); @@ 
-520,12 +531,17 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, const char *define_prefix, unsigned int id, const char *ptype_name, __u32 ifindex) { - bool res; + bool res = false; - if (!supported_type) - res = false; - else + if (supported_type) { res = bpf_probe_helper(id, prog_type, ifindex); + /* Probe may succeed even if program load fails, for + * unprivileged users check that we did not fail because of + * insufficient permissions + */ + if (run_as_unprivileged && errno == EPERM) + res = false; + } if (json_output) { if (res) @@ -720,6 +736,65 @@ static void section_misc(const char *define_prefix, __u32 ifindex) print_end_section(); } +static int handle_perms(void) +{ + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; + bool has_sys_admin_cap = false; + cap_flag_value_t val; + int res = -1; + cap_t caps; + + caps = cap_get_proc(); + if (!caps) { + p_err("failed to get capabilities for process: %s", + strerror(errno)); + return -1; + } + + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val)) { + p_err("bug: failed to retrieve CAP_SYS_ADMIN status"); + goto exit_free; + } + if (val == CAP_SET) + has_sys_admin_cap = true; + + if (!run_as_unprivileged && !has_sys_admin_cap) { + p_err("full feature probing requires CAP_SYS_ADMIN, run as root or use 'unprivileged'"); + goto exit_free; + } + + if ((run_as_unprivileged && !has_sys_admin_cap) || + (!run_as_unprivileged && has_sys_admin_cap)) { + /* We are all good, exit now */ + res = 0; + goto exit_free; + } + + /* if (run_as_unprivileged && has_sys_admin_cap), drop CAP_SYS_ADMIN */ + + if (cap_set_flag(caps, CAP_EFFECTIVE, ARRAY_SIZE(cap_list), cap_list, + CAP_CLEAR)) { + p_err("bug: failed to clear CAP_SYS_ADMIN from capabilities"); + goto exit_free; + } + + if (cap_set_proc(caps)) { + p_err("failed to drop CAP_SYS_ADMIN: %s", strerror(errno)); + goto exit_free; + } + + res = 0; + +exit_free: + if (cap_free(caps) && !res) { + p_err("failed to clear storage object for 
capabilities: %s", + strerror(errno)); + res = -1; + } + + return res; +} + static int do_probe(int argc, char **argv) { enum probe_component target = COMPONENT_UNSPEC; @@ -728,14 +803,6 @@ static int do_probe(int argc, char **argv) __u32 ifindex = 0; char *ifname; - /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). - * Let's approximate, and restrict usage to root user only. - */ - if (geteuid()) { - p_err("please run this command as root user"); - return -1; - } - set_max_rlimit(); while (argc) { @@ -784,6 +851,9 @@ static int do_probe(int argc, char **argv) if (!REQ_ARGS(1)) return -1; define_prefix = GET_ARG(); + } else if (is_prefix(*argv, "unprivileged")) { + run_as_unprivileged = true; + NEXT_ARG(); } else { p_err("expected no more arguments, 'kernel', 'dev', 'macros' or 'prefix', got: '%s'?", *argv); @@ -791,6 +861,12 @@ static int do_probe(int argc, char **argv) } } + /* Full feature detection requires CAP_SYS_ADMIN privilege. + * Let's approximate, and warn if user is not root. + */ + if (handle_perms()) + return -1; + if (json_output) { define_prefix = NULL; jsonw_start_object(json_wtr); @@ -821,7 +897,7 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s probe [COMPONENT] [full] [macros [prefix PREFIX]]\n" + "Usage: %s %s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" " %s %s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" -- cgit v1.2.3 From 0b3b9ca3d154486baa08a41cbc62fde67ba8c6c3 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Apr 2020 15:45:06 +0100 Subject: tools: bpftool: Make libcap dependency optional The new libcap dependency is not used for an essential feature of bpftool, and we could imagine building the tool without checks on CAP_SYS_ADMIN by disabling probing features as an unprivileged users. Make it so, in order to avoid a hard dependency on libcap, and to ease packaging/embedding of bpftool. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429144506.8999-4-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-feature.rst | 4 +++- tools/bpf/bpftool/Makefile | 13 +++++++---- tools/bpf/bpftool/feature.c | 26 ++++++++++++++++++++++ 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index ca085944e4cf..1fa755f55e0c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -55,7 +55,9 @@ DESCRIPTION that case usually represent a small subset of the parameters supported by the system. Unprivileged users MUST use the **unprivileged** keyword: This is to avoid misdetection if - bpftool is inadvertently run as non-root, for example. + bpftool is inadvertently run as non-root, for example. This + keyword is unavailable if bpftool was compiled without + libcap. 
**bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] Probe network device for supported eBPF features and dump diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 89d7962a4a44..2759f9cc3289 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -55,16 +55,15 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif -LIBS = $(LIBBPF) -lelf -lz -lcap - INSTALL ?= install RM ?= rm -f CLANG ?= clang FEATURE_USER = .bpftool -FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib \ +FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libcap \ + clang-bpf-global-var +FEATURE_DISPLAY = libbfd disassembler-four-args zlib libcap \ clang-bpf-global-var -FEATURE_DISPLAY = libbfd disassembler-four-args zlib clang-bpf-global-var check_feat := 1 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall @@ -90,6 +89,12 @@ ifeq ($(feature-reallocarray), 0) CFLAGS += -DCOMPAT_NEED_REALLOCARRAY endif +LIBS = $(LIBBPF) -lelf -lz +ifeq ($(feature-libcap), 1) +CFLAGS += -DUSE_LIBCAP +LIBS += -lcap +endif + include $(wildcard $(OUTPUT)*.d) all: $(OUTPUT)bpftool diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 952f4b1987c0..f54347f55ee0 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -6,7 +6,9 @@ #include #include #include +#ifdef USE_LIBCAP #include +#endif #include #include @@ -37,7 +39,9 @@ static const char * const helper_name[] = { #undef BPF_HELPER_MAKE_ENTRY static bool full_mode; +#ifdef USE_LIBCAP static bool run_as_unprivileged; +#endif /* Miscellaneous utility functions */ @@ -475,11 +479,13 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, } res = bpf_probe_prog_type(prog_type, ifindex); +#ifdef USE_LIBCAP /* Probe may succeed even if program load fails, for unprivileged users * check that we did not fail because of insufficient permissions */ if (run_as_unprivileged && errno 
== EPERM) res = false; +#endif supported_types[prog_type] |= res; @@ -535,12 +541,14 @@ probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, if (supported_type) { res = bpf_probe_helper(id, prog_type, ifindex); +#ifdef USE_LIBCAP /* Probe may succeed even if program load fails, for * unprivileged users check that we did not fail because of * insufficient permissions */ if (run_as_unprivileged && errno == EPERM) res = false; +#endif } if (json_output) { @@ -738,6 +746,7 @@ static void section_misc(const char *define_prefix, __u32 ifindex) static int handle_perms(void) { +#ifdef USE_LIBCAP cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; bool has_sys_admin_cap = false; cap_flag_value_t val; @@ -793,6 +802,18 @@ exit_free: } return res; +#else + /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). + * We do not use libpcap so let's approximate, and restrict usage to + * root user only. + */ + if (geteuid()) { + p_err("full feature probing requires root privileges"); + return -1; + } + + return 0; +#endif /* USE_LIBCAP */ } static int do_probe(int argc, char **argv) @@ -852,8 +873,13 @@ static int do_probe(int argc, char **argv) return -1; define_prefix = GET_ARG(); } else if (is_prefix(*argv, "unprivileged")) { +#ifdef USE_LIBCAP run_as_unprivileged = true; NEXT_ARG(); +#else + p_err("unprivileged run not supported, recompile bpftool with libcap"); + return -1; +#endif } else { p_err("expected no more arguments, 'kernel', 'dev', 'macros' or 'prefix', got: '%s'?", *argv); -- cgit v1.2.3 From 34a2cc6eee809f974111979f4c2b3c62aaaad457 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:53 +0200 Subject: selftests/bpf: Test that lookup on SOCKMAP/SOCKHASH is allowed Now that bpf_map_lookup_elem() is white-listed for SOCKMAP/SOCKHASH, replace the tests which check that verifier prevents lookup on these map types with ones that ensure that lookup operation is permitted, but only with a release of acquired socket 
reference. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-3-jakub@cloudflare.com --- .../selftests/bpf/verifier/prevent_map_lookup.c | 30 ---------- tools/testing/selftests/bpf/verifier/sock.c | 70 ++++++++++++++++++++++ 2 files changed, 70 insertions(+), 30 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c index da7a4b37cb98..fc4e301260f6 100644 --- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c +++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c @@ -1,33 +1,3 @@ -{ - "prevent map lookup in sockmap", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockmap = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 15 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, -{ - "prevent map lookup in sockhash", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockhash = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, { "prevent map lookup in stack trace", .insns = { diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index 9ed192e14f5f..f87ad69dbc62 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -516,3 +516,73 @@ .prog_type = BPF_PROG_TYPE_XDP, .result = ACCEPT, }, +{ + 
"bpf_map_lookup_elem(sockmap, &key)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockhash, &key)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockmap, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, +{ + "bpf_map_lookup_elem(sockhash, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, 
type)), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, -- cgit v1.2.3 From 0b9ad56b1ea66382a3dcc8e3e7c54967bf8c6d94 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:54 +0200 Subject: selftests/bpf: Use SOCKMAP for server sockets in bpf_sk_assign test Update bpf_sk_assign test to fetch the server socket from SOCKMAP, now that map lookup from BPF in SOCKMAP is enabled. This way the test TC BPF program doesn't need to know what address server socket is bound to. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-4-jakub@cloudflare.com --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 21 +++++- tools/testing/selftests/bpf/progs/test_sk_assign.c | 82 +++++++++------------- 3 files changed, 53 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 10f12a5aac20..3d942be23d09 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -243,7 +243,7 @@ define GCC_BPF_BUILD_RULE $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef -SKEL_BLACKLIST := btf__% test_pinning_invalid.c +SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. 
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index d572e1a2c297..47fa04adc147 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -20,6 +20,7 @@ #define CONNECT_PORT 4321 #define TEST_DADDR (0xC0A80203) #define NS_SELF "/proc/self/ns/net" +#define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" static const struct timeval timeo_sec = { .tv_sec = 3 }; static const size_t timeo_optlen = sizeof(timeo_sec); @@ -265,6 +266,7 @@ void test_sk_assign(void) TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; int server = -1; + int server_map; int self_net; self_net = open(NS_SELF, O_RDONLY); @@ -278,9 +280,17 @@ void test_sk_assign(void) goto cleanup; } + server_map = bpf_obj_get(SERVER_MAP_PATH); + if (CHECK_FAIL(server_map < 0)) { + perror("Unable to open " SERVER_MAP_PATH); + goto cleanup; + } + for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { struct test_sk_cfg *test = &tests[i]; const struct sockaddr *addr; + const int zero = 0; + int err; if (!test__start_subtest(test->name)) continue; @@ -288,7 +298,13 @@ void test_sk_assign(void) addr = (const struct sockaddr *)test->addr; server = start_server(addr, test->len, test->type); if (server == -1) - goto cleanup; + goto close; + + err = bpf_map_update_elem(server_map, &zero, &server, BPF_ANY); + if (CHECK_FAIL(err)) { + perror("Unable to update server_map"); + goto close; + } /* connect to unbound ports */ prepare_addr(test->addr, test->family, CONNECT_PORT, @@ -302,7 +318,10 @@ void test_sk_assign(void) close: close(server); + close(server_map); cleanup: + if (CHECK_FAIL(unlink(SERVER_MAP_PATH))) + perror("Unable to unlink " SERVER_MAP_PATH); if (CHECK_FAIL(setns(self_net, CLONE_NEWNET))) perror("Failed to setns("NS_SELF")"); close(self_net); diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c 
b/tools/testing/selftests/bpf/progs/test_sk_assign.c index 8f530843b4da..1ecd987005d2 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -16,6 +16,26 @@ #include #include +/* Pin map under /sys/fs/bpf/tc/globals/ */ +#define PIN_GLOBAL_NS 2 + +/* Must match struct bpf_elf_map layout from iproute2 */ +struct { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +} server_map SEC("maps") = { + .type = BPF_MAP_TYPE_SOCKMAP, + .size_key = sizeof(int), + .size_value = sizeof(__u64), + .max_elem = 1, + .pinning = PIN_GLOBAL_NS, +}; + int _version SEC("version") = 1; char _license[] SEC("license") = "GPL"; @@ -72,7 +92,9 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; + const int zero = 0; size_t tuple_len; + __be16 dport; int ret; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -83,32 +105,11 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) if (sk) goto assign; - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; - - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. */ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. 
- */ + sk = bpf_map_lookup_elem(&server_map, &zero); if (!sk) return TC_ACT_SHOT; @@ -123,7 +124,9 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; + const int zero = 0; size_t tuple_len; + __be16 dport; int ret; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -137,32 +140,11 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) bpf_sk_release(sk); } - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. */ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } - - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. - */ + sk = bpf_map_lookup_elem(&server_map, &zero); if (!sk) return TC_ACT_SHOT; -- cgit v1.2.3 From 063e688133914505ddb396cc33231f22f12e0685 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 29 Apr 2020 19:14:36 -0700 Subject: libbpf: Fix false uninitialized variable warning Some versions of GCC falsely detect that vi might not be initialized. That's not true, but let's silence it with NULL initialization. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430021436.1522502-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d86ff8214b96..977add1b73e2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5003,8 +5003,8 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data *data) { int i, j, nrels, new_sz, ptr_sz = sizeof(void *); + const struct btf_var_secinfo *vi = NULL; const struct btf_type *sec, *var, *def; - const struct btf_var_secinfo *vi; const struct btf_member *member; struct bpf_map *map, *targ_map; const char *name, *mname; -- cgit v1.2.3 From c321022244708aec4675de4f032ef1ba9ff0c640 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 30 Apr 2020 12:47:38 +0200 Subject: selftests/bpf: Test allowed maps for bpf_sk_select_reuseport Check that verifier allows passing a map of type: BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, or BPF_MAP_TYPE_SOCKMAP, or BPF_MAP_TYPE_SOCKHASH ... to bpf_sk_select_reuseport helper.
Suggested-by: John Fastabend Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430104738.494180-1-jakub@cloudflare.com --- tools/testing/selftests/bpf/test_verifier.c | 12 +++++++- tools/testing/selftests/bpf/verifier/sock.c | 45 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index ad6939c67c5e..21a1ce219c1c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 19 +#define MAX_NR_MAPS 20 #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -86,6 +86,7 @@ struct bpf_test { int fixup_map_array_small[MAX_FIXUPS]; int fixup_sk_storage_map[MAX_FIXUPS]; int fixup_map_event_output[MAX_FIXUPS]; + int fixup_map_reuseport_array[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -637,6 +638,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_map_array_small = test->fixup_map_array_small; int *fixup_sk_storage_map = test->fixup_sk_storage_map; int *fixup_map_event_output = test->fixup_map_event_output; + int *fixup_map_reuseport_array = test->fixup_map_reuseport_array; if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -806,6 +808,14 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_event_output++; } while (*fixup_map_event_output); } + if (*fixup_map_reuseport_array) { + map_fds[19] = __create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(u32), sizeof(u64), 1, 0); + do { + prog[*fixup_map_reuseport_array].imm = map_fds[19]; + fixup_map_reuseport_array++; + } while (*fixup_map_reuseport_array); + } } static 
int set_admin(bool admin) diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index f87ad69dbc62..0bc51ad9e0fb 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -586,3 +586,48 @@ .prog_type = BPF_PROG_TYPE_SK_SKB, .result = ACCEPT, }, +{ + "bpf_sk_select_reuseport(ctx, reuseport_array, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_reuseport_array = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockmap, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockhash, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, -- cgit v1.2.3 From d46edd671a147032e22cfeb271a5734703093649 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:04 -0700 Subject: bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats. Typical userspace tools use kernel.bpf_stats_enabled as follows: 1. 
Enable kernel.bpf_stats_enabled; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Disable kernel.bpf_stats_enabled. The problem with this approach is that only one userspace tool can toggle this sysctl. If multiple tools toggle the sysctl at the same time, the measurement may be inaccurate. To fix this problem while keeping backward compatibility, introduce a new bpf command BPF_ENABLE_STATS. On success, this command enables stats and returns a valid fd. BPF_ENABLE_STATS takes argument "type". Currently, only one type, BPF_STATS_RUN_TIME, is supported. We can extend the command to support other types of stats in the future. With BPF_ENABLE_STATS, a user space tool would have the following flow: 1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Close the fd. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-2-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 ++++++++ kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 36 +++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 11 ++++++++ 5 files changed, 115 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c07b1d2f3824..1262ec460ab3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -987,6 +987,7 @@ _out: \ #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd {
BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c75b2dd2459c..4f34eecec9ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3872,6 +3872,60 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) return fd; } +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return 
-EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3996,6 +4050,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &link_idr, &link_idr_lock); break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e961286d0e14..7adfe5dbce9d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} +#endif + /* * /proc/sys support */ @@ -2549,7 +2583,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/tools/include/uapi/linux/bpf.h +++ 
b/tools/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd { BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 0bee106716cfb2c6da81916b968395db22bd7755 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:05 -0700 Subject: libbpf: Add support for command BPF_ENABLE_STATS bpf_enable_stats() is added to enable given stats. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-3-songliubraving@fb.com --- tools/lib/bpf/bpf.c | 10 ++++++++++ tools/lib/bpf/bpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 8f2f0958d446..43322f0d6c7f 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -841,3 +841,13 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, return err; } + +int bpf_enable_stats(enum bpf_stats_type type) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.enable_stats.type = type; + + return sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr)); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 335b457b3a25..1901b2777854 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -231,6 +231,7 @@ LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr); +LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); #ifdef __cplusplus } /* extern "C" */ diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 7cd49aa38005..e03bd4db827e 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -257,6 +257,7 @@ LIBBPF_0.0.8 { LIBBPF_0.0.9 { global: + bpf_enable_stats; bpf_link_get_fd_by_id; bpf_link_get_next_id; } LIBBPF_0.0.8; -- cgit v1.2.3 From 31a9f7fe93378ab587d758d5b2e96a237caa7b8c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:06 -0700 Subject: bpf: Add selftest for BPF_ENABLE_STATS Add test for BPF_ENABLE_STATS, which should enable run_time_ns stats. 
~/selftests/bpf# ./test_progs -t enable_stats -v test_enable_stats:PASS:skel_open_and_load 0 nsec test_enable_stats:PASS:get_stats_fd 0 nsec test_enable_stats:PASS:attach_raw_tp 0 nsec test_enable_stats:PASS:get_prog_info 0 nsec test_enable_stats:PASS:check_stats_enabled 0 nsec test_enable_stats:PASS:check_run_cnt_valid 0 nsec Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200430071506.1408910-4-songliubraving@fb.com --- .../selftests/bpf/prog_tests/enable_stats.c | 45 ++++++++++++++++++++++ .../selftests/bpf/progs/test_enable_stats.c | 18 +++++++++ 2 files changed, 63 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/enable_stats.c create mode 100644 tools/testing/selftests/bpf/progs/test_enable_stats.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/enable_stats.c b/tools/testing/selftests/bpf/prog_tests/enable_stats.c new file mode 100644 index 000000000000..2cb2085917e7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/enable_stats.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "test_enable_stats.skel.h" + +void test_enable_stats(void) +{ + struct test_enable_stats *skel; + int stats_fd, err, prog_fd; + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + int duration = 0; + + skel = test_enable_stats__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK(stats_fd < 0, "get_stats_fd", "failed %d\n", errno)) { + test_enable_stats__destroy(skel); + return; + } + + err = test_enable_stats__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + test_enable_stats__detach(skel); + + prog_fd = bpf_program__fd(skel->progs.test_enable_stats); + memset(&info, 0, info_len); + err = bpf_obj_get_info_by_fd(prog_fd, &info, 
&info_len); + if (CHECK(err, "get_prog_info", + "failed to get bpf_prog_info for fd %d\n", prog_fd)) + goto cleanup; + if (CHECK(info.run_time_ns == 0, "check_stats_enabled", + "failed to enable run_time_ns stats\n")) + goto cleanup; + + CHECK(info.run_cnt != skel->bss->count, "check_run_cnt_valid", + "invalid run_cnt stats\n"); + +cleanup: + test_enable_stats__destroy(skel); + close(stats_fd); +} diff --git a/tools/testing/selftests/bpf/progs/test_enable_stats.c b/tools/testing/selftests/bpf/progs/test_enable_stats.c new file mode 100644 index 000000000000..01a002ade529 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_enable_stats.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u64 count = 0; + +SEC("raw_tracepoint/sys_enter") +int test_enable_stats(void *ctx) +{ + count += 1; + return 0; +} -- cgit v1.2.3 From beecf11bc2188067824591612151c4dc6ec383c7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 30 Apr 2020 16:31:52 -0700 Subject: bpf: Bpf_{g,s}etsockopt for struct bpf_sock_addr Currently, bpf_getsockopt and bpf_setsockopt helpers operate on the 'struct bpf_sock_ops' context in BPF_PROG_TYPE_SOCK_OPS program. Let's generalize them and make them available for 'struct bpf_sock_addr'. That way, in the future, we can allow those helpers in more places. As an example, let's expose those 'struct bpf_sock_addr' based helpers to BPF_CGROUP_INET{4,6}_CONNECT hooks. That way we can override CC before the connection is made. v3: * Expose custom helpers for bpf_sock_addr context instead of doing generic bpf_sock argument (as suggested by Daniel). Even with try_socket_lock that doesn't sleep we have a problem where context sk is already locked and socket lock is non-nestable. 
v2: * s/BPF_PROG_TYPE_CGROUP_SOCKOPT/BPF_PROG_TYPE_SOCK_OPS/ Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200430233152.199403-1-sdf@google.com --- include/uapi/linux/bpf.h | 14 ++- net/core/filter.c | 118 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 14 ++- tools/testing/selftests/bpf/config | 1 + tools/testing/selftests/bpf/progs/connect4_prog.c | 46 +++++++++ 5 files changed, 166 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * diff --git a/net/core/filter.c b/net/core/filter.c index 70b32723e6be..dfaf5df13722 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4194,16 +4194,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +#define SOCKOPT_CC_REINIT (1 << 0) + +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen, u32 flags) { - struct sock *sk = bpf_sock->sk; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; + sock_owned_by_me(sk); + if (level == SOL_SOCKET) { if (optlen != sizeof(int)) return -EINVAL; @@ -4298,7 +4301,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; + bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); @@ -4345,24 +4348,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, return ret; } -static const struct 
bpf_func_proto bpf_setsockopt_proto = { - .func = bpf_setsockopt, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, -}; - -BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - struct sock *sk = bpf_sock->sk; - if (!sk_fullsock(sk)) goto err_clear; + + sock_owned_by_me(sk); + #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -4428,8 +4421,71 @@ err_clear: return -EINVAL; } -static const struct bpf_func_proto bpf_getsockopt_proto = { - .func = bpf_getsockopt, +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + 
int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + flags |= SOCKOPT_CC_REINIT; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -6043,6 +6099,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -6261,9 +6333,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: - return &bpf_setsockopt_proto; + return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: - return &bpf_getsockopt_proto; + return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case 
BPF_FUNC_sock_map_update: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. 
* It supports the following *level*\ s: * diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 60e3ae5d4e48..6e5b94c036ca 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -37,3 +37,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y +CONFIG_TCP_CONG_DCTCP=y diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index ad3c498a8150..972918cd2d7f 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,10 @@ #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 +#ifndef TCP_CA_NAME_MAX +#define TCP_CA_NAME_MAX 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -33,6 +38,43 @@ int do_bind(struct bpf_sock_addr *ctx) return 1; } +static __inline int verify_cc(struct bpf_sock_addr *ctx, + char expected[TCP_CA_NAME_MAX]) +{ + char buf[TCP_CA_NAME_MAX]; + int i; + + if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 1; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (buf[i] != expected[i]) + return 1; + if (buf[i] == 0) + break; + } + + return 0; +} + +static __inline int set_cc(struct bpf_sock_addr *ctx) +{ + char dctcp[TCP_CA_NAME_MAX] = "dctcp"; + char cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &dctcp, sizeof(dctcp))) + return 1; + if (verify_cc(ctx, dctcp)) + return 1; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + return 1; + if (verify_cc(ctx, cubic)) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -66,6 +108,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) bpf_sk_release(sk); + /* Rewrite congestion control. 
*/ + if (ctx->type == SOCK_STREAM && set_cc(ctx)) + return 0; + /* Rewrite destination. */ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); ctx->user_port = bpf_htons(DST_REWRITE_PORT4); -- cgit v1.2.3 From 57dc6f3b4133f45e73d87895180ca1f3eaf01722 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 1 May 2020 15:43:20 -0700 Subject: selftests/bpf: Use reno instead of dctcp Andrey pointed out that we can use reno instead of dctcp for CC tests and drop CONFIG_TCP_CONG_DCTCP=y requirement. Fixes: beecf11bc218 ("bpf: Bpf_{g,s}etsockopt for struct bpf_sock_addr") Suggested-by: Andrey Ignatov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200501224320.28441-1-sdf@google.com --- tools/testing/selftests/bpf/config | 1 - tools/testing/selftests/bpf/progs/connect4_prog.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 6e5b94c036ca..60e3ae5d4e48 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -37,4 +37,3 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y -CONFIG_TCP_CONG_DCTCP=y diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 972918cd2d7f..c2c85c31cffd 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -59,12 +59,12 @@ static __inline int verify_cc(struct bpf_sock_addr *ctx, static __inline int set_cc(struct bpf_sock_addr *ctx) { - char dctcp[TCP_CA_NAME_MAX] = "dctcp"; + char reno[TCP_CA_NAME_MAX] = "reno"; char cubic[TCP_CA_NAME_MAX] = "cubic"; - if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &dctcp, sizeof(dctcp))) + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno))) return 1; - if (verify_cc(ctx, dctcp)) + if (verify_cc(ctx, reno)) 
return 1; if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) -- cgit v1.2.3 From d3f1cbd29fa63f1bb608603a6cd54ca7af56a68b Mon Sep 17 00:00:00 2001 From: Vincent Cheng Date: Fri, 1 May 2020 23:35:37 -0400 Subject: ptp: Add adjust_phase to ptp_clock_caps capability. Add adjust_phase to ptp_clock_caps capability to allow user to query if a PHC driver supports adjust phase with ioctl PTP_CLOCK_GETCAPS command. Signed-off-by: Vincent Cheng Reviewed-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 1 + include/uapi/linux/ptp_clock.h | 4 +++- tools/testing/selftests/ptp/testptp.c | 6 ++++-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 93d574faf1fe..375cd6e4aade 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -136,6 +136,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) caps.pps = ptp->info->pps; caps.n_pins = ptp->info->n_pins; caps.cross_timestamping = ptp->info->getcrosststamp != NULL; + caps.adjust_phase = ptp->info->adjphase != NULL; if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) err = -EFAULT; break; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 9dc9d0079e98..ff070aa64278 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -89,7 +89,9 @@ struct ptp_clock_caps { int n_pins; /* Number of input/output pins. */ /* Whether the clock supports precise system-device cross timestamps */ int cross_timestamping; - int rsv[13]; /* Reserved for future use. */ + /* Whether the clock supports adjust phase */ + int adjust_phase; + int rsv[12]; /* Reserved for future use. 
*/ }; struct ptp_extts_request { diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c index c0dd10257df5..da7a9dda9490 100644 --- a/tools/testing/selftests/ptp/testptp.c +++ b/tools/testing/selftests/ptp/testptp.c @@ -269,14 +269,16 @@ int main(int argc, char *argv[]) " %d programmable periodic signals\n" " %d pulse per second\n" " %d programmable pins\n" - " %d cross timestamping\n", + " %d cross timestamping\n" + " %d adjust_phase\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, caps.n_per_out, caps.pps, caps.n_pins, - caps.cross_timestamping); + caps.cross_timestamping, + caps.adjust_phase); } } -- cgit v1.2.3 From 043b3e22768d5d909cb1474fc21ae2fbaf026c0c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 May 2020 09:40:41 -0700 Subject: devlink: let kernel allocate region snapshot id Currently users have to choose a free snapshot id before calling DEVLINK_CMD_REGION_NEW. This is potentially racy and inconvenient. Make the DEVLINK_ATTR_REGION_SNAPSHOT_ID optional and try to allocate id automatically. Send a message back to the caller with the snapshot info. Example use: $ devlink region new netdevsim/netdevsim1/dummy netdevsim/netdevsim1/dummy: snapshot 1 $ id=$(devlink -j region new netdevsim/netdevsim1/dummy | \ jq '.[][][][]') $ devlink region dump netdevsim/netdevsim1/dummy snapshot $id [...] $ devlink region del netdevsim/netdevsim1/dummy snapshot $id v4: - inline the notification code v3: - send the notification only once snapshot creation completed. v2: - don't wrap the line containing extack; - add a few sentences to the docs. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. 
Miller --- .../networking/devlink/devlink-region.rst | 7 ++- net/core/devlink.c | 57 +++++++++++++++++----- .../selftests/drivers/net/netdevsim/devlink.sh | 13 +++++ 3 files changed, 62 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index 04e04d1ff627..daf35427fce1 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -23,7 +23,9 @@ states, but see also :doc:`devlink-health` Regions may optionally support capturing a snapshot on demand via the ``DEVLINK_CMD_REGION_NEW`` netlink message. A driver wishing to allow requested snapshots must implement the ``.snapshot`` callback for the region -in its ``devlink_region_ops`` structure. +in its ``devlink_region_ops`` structure. If snapshot id is not set in +the ``DEVLINK_CMD_REGION_NEW`` request kernel will allocate one and send +the snapshot information to user space. 
example usage ------------- @@ -45,7 +47,8 @@ example usage $ devlink region del pci/0000:00:05.0/cr-space snapshot 1 # Request an immediate snapshot, if supported by the region - $ devlink region new pci/0000:00:05.0/cr-space snapshot 5 + $ devlink region new pci/0000:00:05.0/cr-space + pci/0000:00:05.0/cr-space: snapshot 5 # Dump a snapshot: $ devlink region dump pci/0000:00:05.0/fw-health snapshot 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 2b7c60c18b99..43a9d5be73ca 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4086,6 +4086,8 @@ static int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; u32 snapshot_id; @@ -4097,11 +4099,6 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); - return -EINVAL; - } - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); region = devlink_region_get_by_name(devlink, region_name); if (!region) { @@ -4119,16 +4116,25 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -ENOSPC; } - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (snapshot_id_attr) { + snapshot_id = nla_get_u32(snapshot_id_attr); - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); - return -EEXIST; - } + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } - err = __devlink_snapshot_id_insert(devlink, snapshot_id); - if (err) - return err; + err = 
__devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + } else { + err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); + return err; + } + } err = region->ops->snapshot(devlink, info->extack, &data); if (err) @@ -4138,6 +4144,27 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_snapshot_create; + if (!snapshot_id_attr) { + struct sk_buff *msg; + + snapshot = devlink_region_snapshot_get_by_id(region, + snapshot_id); + if (WARN_ON(!snapshot)) + return -EINVAL; + + msg = devlink_nl_region_notify_build(region, snapshot, + DEVLINK_CMD_REGION_NEW, + info->snd_portid, + info->snd_seq); + err = PTR_ERR_OR_ZERO(msg); + if (err) + goto err_notify; + + err = genlmsg_reply(msg, info); + if (err) + goto err_notify; + } + return 0; err_snapshot_create: @@ -4145,6 +4172,10 @@ err_snapshot_create: err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); return err; + +err_notify: + devlink_region_snapshot_del(region, snapshot); + return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 9f9741444549..ad539eccddcb 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -151,6 +151,19 @@ regions_test() check_region_snapshot_count dummy post-second-delete 2 + sid=$(devlink -j region new $DL_HANDLE/dummy | jq '.[][][][]') + check_err $? "Failed to create a new snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region dump $DL_HANDLE/dummy snapshot $sid >> /dev/null + check_err $? "Failed to dump a snapshot with id allocated by the kernel" + + devlink region del $DL_HANDLE/dummy snapshot $sid + check_err $? 
"Failed to delete snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 2 + log_test "regions test" } -- cgit v1.2.3 From 33181bb8e8fe947e1f8020a4b103601a4cac94d9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:08 -0700 Subject: selftests/bpf: Generalize helpers to control background listener Move the following routines that let us start a background listener thread and connect to a server by fd to the test_prog: * start_server - socket+bind+listen * connect_to_fd - connect to the server identified by fd These will be used in the next commit. Also, extend these helpers to support AF_INET6 and accept the family as an argument. v5: * drop pthread.h (Martin KaFai Lau) * add SO_SNDTIMEO (Martin KaFai Lau) v4: * export extra helper to start server without a thread (Martin KaFai Lau) * tcp_rtt is no longer starting background thread (Martin KaFai Lau) v2: * put helpers into network_helpers.c (Andrii Nakryiko) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-2-sdf@google.com --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/network_helpers.c | 93 ++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 10 ++ tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 116 +---------------------- 4 files changed, 108 insertions(+), 113 deletions(-) create mode 100644 tools/testing/selftests/bpf/network_helpers.c create mode 100644 tools/testing/selftests/bpf/network_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3d942be23d09..8f25966b500b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -354,7 +354,7 @@ endef TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c 
trace_helpers.c \ - flow_dissector_load.h + network_helpers.c flow_dissector_load.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c new file mode 100644 index 000000000000..0073dddb72fd --- /dev/null +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#include "network_helpers.h" + +#define clean_errno() (errno == 0 ? "None" : strerror(errno)) +#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ + __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) + +int start_server(int family, int type) +{ + struct sockaddr_storage addr = {}; + socklen_t len; + int fd; + + if (family == AF_INET) { + struct sockaddr_in *sin = (void *)&addr; + + sin->sin_family = AF_INET; + len = sizeof(*sin); + } else { + struct sockaddr_in6 *sin6 = (void *)&addr; + + sin6->sin6_family = AF_INET6; + len = sizeof(*sin6); + } + + fd = socket(family, type | SOCK_NONBLOCK, 0); + if (fd < 0) { + log_err("Failed to create server socket"); + return -1; + } + + if (bind(fd, (const struct sockaddr *)&addr, len) < 0) { + log_err("Failed to bind socket"); + close(fd); + return -1; + } + + if (type == SOCK_STREAM) { + if (listen(fd, 1) < 0) { + log_err("Failed to listed on socket"); + close(fd); + return -1; + } + } + + return fd; +} + +static const struct timeval timeo_sec = { .tv_sec = 3 }; +static const size_t timeo_optlen = sizeof(timeo_sec); + +int connect_to_fd(int family, int type, int server_fd) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int fd; + + fd = socket(family, type, 0); + if (fd < 0) { + log_err("Failed to create client socket"); + return -1; + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, 
timeo_optlen)) { + log_err("Failed to set SO_RCVTIMEO"); + goto out; + } + + if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + goto out; + } + + if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { + log_err("Fail to connect to server with family %d", family); + goto out; + } + + return fd; + +out: + close(fd); + return -1; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h new file mode 100644 index 000000000000..30068eacc1a2 --- /dev/null +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETWORK_HELPERS_H +#define __NETWORK_HELPERS_H +#include +#include + +int start_server(int family, int type); +int connect_to_fd(int family, int type, int server_fd); + +#endif diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index e56b52ab41da..9013a0c01eed 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include "cgroup_helpers.h" +#include "network_helpers.h" struct tcp_rtt_storage { __u32 invoked; @@ -87,34 +88,6 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, return err; } -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create client socket"); - return -1; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server"); - goto out; - } - - return fd; - -out: - close(fd); - return -1; -} static int run_test(int cgroup_fd, int server_fd) { @@ -145,7 
+118,7 @@ static int run_test(int cgroup_fd, int server_fd) goto close_bpf_object; } - client_fd = connect_to_server(server_fd); + client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd); if (client_fd < 0) { err = -1; goto close_bpf_object; @@ -180,103 +153,22 @@ close_bpf_object: return err; } -static int start_server(void) -{ - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; - int fd; - - fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); - if (fd < 0) { - log_err("Failed to create server socket"); - return -1; - } - - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { - log_err("Failed to bind socket"); - close(fd); - return -1; - } - - return fd; -} - -static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; -static volatile bool server_done = false; - -static void *server_thread(void *arg) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd = *(int *)arg; - int client_fd; - int err; - - err = listen(fd, 1); - - pthread_mutex_lock(&server_started_mtx); - pthread_cond_signal(&server_started); - pthread_mutex_unlock(&server_started_mtx); - - if (CHECK_FAIL(err < 0)) { - perror("Failed to listed on socket"); - return ERR_PTR(err); - } - - while (true) { - client_fd = accept(fd, (struct sockaddr *)&addr, &len); - if (client_fd == -1 && errno == EAGAIN) { - usleep(50); - continue; - } - break; - } - if (CHECK_FAIL(client_fd < 0)) { - perror("Failed to accept client"); - return ERR_PTR(err); - } - - while (!server_done) - usleep(50); - - close(client_fd); - - return NULL; -} - void test_tcp_rtt(void) { int server_fd, cgroup_fd; - pthread_t tid; - void *server_res; cgroup_fd = test__join_cgroup("/tcp_rtt"); if (CHECK_FAIL(cgroup_fd < 0)) return; - server_fd = start_server(); + server_fd = start_server(AF_INET, SOCK_STREAM); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; - if 
(CHECK_FAIL(pthread_create(&tid, NULL, server_thread, - (void *)&server_fd))) - goto close_server_fd; - - pthread_mutex_lock(&server_started_mtx); - pthread_cond_wait(&server_started, &server_started_mtx); - pthread_mutex_unlock(&server_started_mtx); - CHECK_FAIL(run_test(cgroup_fd, server_fd)); - server_done = true; - CHECK_FAIL(pthread_join(tid, &server_res)); - CHECK_FAIL(IS_ERR(server_res)); - -close_server_fd: close(server_fd); + close_cgroup_fd: close(cgroup_fd); } -- cgit v1.2.3 From 488a23b89d175cc78f352417114f4f5a10470722 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:09 -0700 Subject: selftests/bpf: Move existing common networking parts into network_helpers 1. Move pkt_v4 and pkt_v6 into network_helpers and adjust the users. 2. Copy-paste spin_lock_thread into two tests that use it. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20200508174611.228805-3-sdf@google.com --- tools/testing/selftests/bpf/network_helpers.c | 17 ++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 29 +++++++++++++++++++++ .../selftests/bpf/prog_tests/fexit_bpf2bpf.c | 1 + .../selftests/bpf/prog_tests/flow_dissector.c | 1 + .../bpf/prog_tests/flow_dissector_load_bytes.c | 1 + .../testing/selftests/bpf/prog_tests/global_data.c | 1 + tools/testing/selftests/bpf/prog_tests/kfree_skb.c | 1 + tools/testing/selftests/bpf/prog_tests/l4lb_all.c | 1 + tools/testing/selftests/bpf/prog_tests/map_lock.c | 14 ++++++++++ .../testing/selftests/bpf/prog_tests/pkt_access.c | 1 + .../selftests/bpf/prog_tests/pkt_md_access.c | 1 + .../selftests/bpf/prog_tests/prog_run_xattr.c | 1 + .../selftests/bpf/prog_tests/queue_stack_map.c | 1 + .../selftests/bpf/prog_tests/signal_pending.c | 1 + tools/testing/selftests/bpf/prog_tests/skb_ctx.c | 1 + tools/testing/selftests/bpf/prog_tests/spinlock.c | 14 ++++++++++ tools/testing/selftests/bpf/prog_tests/xdp.c | 1 + 
.../selftests/bpf/prog_tests/xdp_adjust_tail.c | 1 + .../testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c | 1 + .../selftests/bpf/prog_tests/xdp_noinline.c | 1 + tools/testing/selftests/bpf/test_progs.c | 30 ---------------------- tools/testing/selftests/bpf/test_progs.h | 23 ----------------- 22 files changed, 90 insertions(+), 53 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 0073dddb72fd..0ff64b70b746 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -14,6 +14,23 @@ #define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) +struct ipv4_packet pkt_v4 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_TCP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + +struct ipv6_packet pkt_v6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + int start_server(int family, int type) { struct sockaddr_storage addr = {}; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 30068eacc1a2..a0be7db4f67d 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -3,6 +3,35 @@ #define __NETWORK_HELPERS_H #include #include +#include +typedef __u16 __sum16; +#include +#include +#include +#include +#include +#include + +#define MAGIC_VAL 0x1234 +#define NUM_ITER 100000 +#define VIP_NUM 5 +#define MAGIC_BYTES 123 + +/* ipv4 test vector */ +struct ipv4_packet { + struct ethhdr eth; + struct iphdr iph; + struct tcphdr tcp; +} __packed; +extern struct ipv4_packet pkt_v4; + +/* ipv6 test vector */ +struct ipv6_packet { + struct 
ethhdr eth; + struct ipv6hdr iph; + struct tcphdr tcp; +} __packed; +extern struct ipv6_packet pkt_v6; int start_server(int family, int type); int connect_to_fd(int family, int type, int server_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index c2642517e1d8..a895bfed55db 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include static void test_fexit_bpf2bpf_common(const char *obj_file, const char *target_obj_file, diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 92563898867c..2301c4d3ecec 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c index dc5ef155ec28..0e8a4d2f023d 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_flow_dissector_load_bytes(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c index c680926fce73..e3cb62b0a110 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include static void test_global_data_number(struct bpf_object *obj, __u32 duration) { diff --git 
a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index 7507c8f689bc..42c3a3103c26 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include struct meta { int ifindex; diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c index eaf64595be88..c2d373e294bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c +++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include static void test_l4lb(const char *file) { diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c index 8f91f1881d11..ce17b1ed8709 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c @@ -1,5 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include + +static void *spin_lock_thread(void *arg) +{ + __u32 duration, retval; + int err, prog_fd = *(u32 *) arg; + + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(err || retval, "", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration); + pthread_exit(arg); +} static void *parallel_map_access(void *arg) { diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c index a2537dfa899c..44b514fabccd 100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_pkt_access(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c index 5f7aea605019..939015cd6dba 
100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_pkt_md_access(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 5dd89b941f53..dde2b7ae7bc9 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_prog_run_xattr(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c index faccc66f4e39..f47e7b1cb32c 100644 --- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c +++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include enum { QUEUE, diff --git a/tools/testing/selftests/bpf/prog_tests/signal_pending.c b/tools/testing/selftests/bpf/prog_tests/signal_pending.c index 996e808f43a2..dfcbddcbe4d3 100644 --- a/tools/testing/selftests/bpf/prog_tests/signal_pending.c +++ b/tools/testing/selftests/bpf/prog_tests/signal_pending.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include static void sigalrm_handler(int s) {} static struct sigaction sigalrm_action = { diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index 4538bd08203f..7021b92af313 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_skb_ctx(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c index 1ae00cd3174e..7577a77a4c4c 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/spinlock.c +++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c @@ -1,5 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include + +static void *spin_lock_thread(void *arg) +{ + __u32 duration, retval; + int err, prog_fd = *(u32 *) arg; + + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(err || retval, "", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration); + pthread_exit(arg); +} void test_spinlock(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c index dcb5ecac778e..48921ff74850 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_xdp(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 3744196d7cba..6c8ca1c93f9b 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_xdp_adjust_tail(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index a0f688c37023..2c6c570b21f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include "test_xdp.skel.h" #include "test_xdp_bpf2bpf.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c index c9404e6b226e..f284f72158ef 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c @@ -1,5 
+1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void test_xdp_noinline(void) { diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 93970ec1c9e9..0f411fdc4f6d 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -222,23 +222,6 @@ int test__join_cgroup(const char *path) return fd; } -struct ipv4_packet pkt_v4 = { - .eth.h_proto = __bpf_constant_htons(ETH_P_IP), - .iph.ihl = 5, - .iph.protocol = IPPROTO_TCP, - .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), - .tcp.urg_ptr = 123, - .tcp.doff = 5, -}; - -struct ipv6_packet pkt_v6 = { - .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), - .iph.nexthdr = IPPROTO_TCP, - .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), - .tcp.urg_ptr = 123, - .tcp.doff = 5, -}; - int bpf_find_map(const char *test, struct bpf_object *obj, const char *name) { struct bpf_map *map; @@ -358,19 +341,6 @@ err: return -1; } -void *spin_lock_thread(void *arg) -{ - __u32 duration, retval; - int err, prog_fd = *(u32 *) arg; - - err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), - NULL, NULL, &retval, &duration); - CHECK(err || retval, "", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); - pthread_exit(arg); -} - /* extern declarations for test funcs */ #define DEFINE_TEST(name) extern void test_##name(void); #include diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 10188cc8e9e0..83287c76332b 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -87,24 +87,6 @@ extern void test__skip(void); extern void test__fail(void); extern int test__join_cgroup(const char *path); -#define MAGIC_BYTES 123 - -/* ipv4 test vector */ -struct ipv4_packet { - struct ethhdr eth; - struct iphdr iph; - struct tcphdr tcp; -} __packed; -extern struct ipv4_packet pkt_v4; - -/* ipv6 test vector */ -struct 
ipv6_packet { - struct ethhdr eth; - struct ipv6hdr iph; - struct tcphdr tcp; -} __packed; -extern struct ipv6_packet pkt_v6; - #define PRINT_FAIL(format...) \ ({ \ test__fail(); \ @@ -143,10 +125,6 @@ extern struct ipv6_packet pkt_v6; #define CHECK_ATTR(condition, tag, format...) \ _CHECK(condition, tag, tattr.duration, format) -#define MAGIC_VAL 0x1234 -#define NUM_ITER 100000 -#define VIP_NUM 5 - static inline __u64 ptr_to_u64(const void *ptr) { return (__u64) (unsigned long) ptr; @@ -156,7 +134,6 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name); int compare_map_keys(int map1_fd, int map2_fd); int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); int extract_build_id(char *build_id, size_t size); -void *spin_lock_thread(void *arg); #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" -- cgit v1.2.3 From 8086fbaf49345f988deec539ec8e182b02914401 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:11 -0700 Subject: bpf: Allow any port in bpf_bind helper We want to have a tighter control on what ports we bind to in the BPF_CGROUP_INET{4,6}_CONNECT hooks even if it means connect() becomes slightly more expensive. The expensive part comes from the fact that we now need to call inet_csk_get_port() that verifies that the port is not used and allocates an entry in the hash table for it. Since we can't rely on "snum || !bind_address_no_port" to prevent us from calling POST_BIND hook anymore, let's add another bind flag to indicate that the call site is BPF program. 
v5: * fix wrong AF_INET (should be AF_INET6) in the bpf program for v6 v3: * More bpf_bind documentation refinements (Martin KaFai Lau) * Add UDP tests as well (Martin KaFai Lau) * Don't start the thread, just do socket+bind+listen (Martin KaFai Lau) v2: * Update documentation (Andrey Ignatov) * Pass BIND_FORCE_ADDRESS_NO_PORT conditionally (Andrey Ignatov) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-5-sdf@google.com --- include/net/inet_common.h | 2 + include/uapi/linux/bpf.h | 9 +- net/core/filter.c | 18 ++-- net/ipv4/af_inet.c | 10 +- net/ipv6/af_inet6.c | 12 ++- tools/include/uapi/linux/bpf.h | 9 +- .../selftests/bpf/prog_tests/connect_force_port.c | 115 +++++++++++++++++++++ .../selftests/bpf/progs/connect_force_port4.c | 28 +++++ .../selftests/bpf/progs/connect_force_port6.c | 28 +++++ 9 files changed, 203 insertions(+), 28 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_force_port.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port4.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port6.c (limited to 'tools') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c38f4f7d660a..cb2818862919 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -39,6 +39,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) +/* Called from BPF program. 
*/ +#define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/net/core/filter.c b/net/core/filter.c index fa9ddab5dd1f..da0634979f53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4525,32 +4525,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. 
- */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68e74b1b0f26..fcf0d12a407a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -526,10 +526,12 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out_release_sock; } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 552c2592b81c..771a462a8322 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -407,11 +407,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out; } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if 
(err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. 
* diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..47fbb20cb6a6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_port(int family, int fd, int expected) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected) { + log_err("Unexpected port %d, expected %d", ntohs(port), + expected); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + struct bpf_prog_load_attr attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + }; + struct bpf_object *obj; + int expected_port; + int prog_fd; + int err; + int fd; + + if (family == AF_INET) { + attr.file = "./connect_force_port4.o"; + attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + expected_port = 22222; + } else { + attr.file = "./connect_force_port6.o"; + attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; + expected_port = 22223; + } + + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, + 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_port(family, fd, expected_port); + + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void 
test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..1b8eb34b2db0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect4") +int _connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; + + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..ae6f7d750b4c --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect6") +int _connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} -- cgit v1.2.3 From b886dea37b78debeea7019c649c05c7e2ba027fc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 9 May 2020 23:06:08 +0300 Subject: selftests: mlxsw: rename tc_flower_restrictions.sh to tc_restrictions.sh The file is about to contain matchall restrictions too, so change the name to make it more generic. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- .../drivers/net/mlxsw/tc_flower_restrictions.sh | 186 --------------------- .../selftests/drivers/net/mlxsw/tc_restrictions.sh | 186 +++++++++++++++++++++ 2 files changed, 186 insertions(+), 186 deletions(-) delete mode 100755 tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh create mode 100755 tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh deleted file mode 100755 index 68c80d0ec1ec..000000000000 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -lib_dir=$(dirname $0)/../../../net/forwarding - -ALL_TESTS=" - shared_block_drop_test - egress_redirect_test - multi_mirror_test -" -NUM_NETIFS=2 - -source $lib_dir/tc_common.sh -source $lib_dir/lib.sh - -switch_create() -{ - simple_if_init $swp1 
192.0.2.1/24 - simple_if_init $swp2 192.0.2.2/24 -} - -switch_destroy() -{ - simple_if_fini $swp2 192.0.2.2/24 - simple_if_fini $swp1 192.0.2.1/24 -} - -shared_block_drop_test() -{ - RET=0 - - # It is forbidden in mlxsw driver to have mixed-bound - # shared block with a drop rule. - - tc qdisc add dev $swp1 ingress_block 22 clsact - check_err $? "Failed to create clsact with ingress block" - - tc filter add block 22 protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 action drop - check_err $? "Failed to add drop rule to ingress bound block" - - tc qdisc add dev $swp2 ingress_block 22 clsact - check_err $? "Failed to create another clsact with ingress shared block" - - tc qdisc del dev $swp2 clsact - - tc qdisc add dev $swp2 egress_block 22 clsact - check_fail $? "Incorrect success to create another clsact with egress shared block" - - tc filter del block 22 protocol ip pref 1 handle 101 flower - - tc qdisc add dev $swp2 egress_block 22 clsact - check_err $? "Failed to create another clsact with egress shared block after blocker drop rule removed" - - tc filter add block 22 protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 action drop - check_fail $? "Incorrect success to add drop rule to mixed bound block" - - tc qdisc del dev $swp1 clsact - - tc qdisc add dev $swp1 egress_block 22 clsact - check_err $? "Failed to create another clsact with egress shared block" - - tc filter add block 22 protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 action drop - check_err $? "Failed to add drop rule to egress bound shared block" - - tc filter del block 22 protocol ip pref 1 handle 101 flower - - tc qdisc del dev $swp2 clsact - tc qdisc del dev $swp1 clsact - - log_test "shared block drop" -} - -egress_redirect_test() -{ - RET=0 - - # It is forbidden in mlxsw driver to have mirred redirect on - # egress-bound block. - - tc qdisc add dev $swp1 ingress_block 22 clsact - check_err $? 
"Failed to create clsact with ingress block" - - tc filter add block 22 protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 \ - action mirred egress redirect dev $swp2 - check_err $? "Failed to add redirect rule to ingress bound block" - - tc qdisc add dev $swp2 ingress_block 22 clsact - check_err $? "Failed to create another clsact with ingress shared block" - - tc qdisc del dev $swp2 clsact - - tc qdisc add dev $swp2 egress_block 22 clsact - check_fail $? "Incorrect success to create another clsact with egress shared block" - - tc filter del block 22 protocol ip pref 1 handle 101 flower - - tc qdisc add dev $swp2 egress_block 22 clsact - check_err $? "Failed to create another clsact with egress shared block after blocker redirect rule removed" - - tc filter add block 22 protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 \ - action mirred egress redirect dev $swp2 - check_fail $? "Incorrect success to add redirect rule to mixed bound block" - - tc qdisc del dev $swp1 clsact - - tc qdisc add dev $swp1 egress_block 22 clsact - check_err $? "Failed to create another clsact with egress shared block" - - tc filter add block 22 protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 \ - action mirred egress redirect dev $swp2 - check_fail $? "Incorrect success to add redirect rule to egress bound shared block" - - tc qdisc del dev $swp2 clsact - - tc filter add block 22 protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 \ - action mirred egress redirect dev $swp2 - check_fail $? "Incorrect success to add redirect rule to egress bound block" - - tc qdisc del dev $swp1 clsact - - log_test "shared block drop" -} - -multi_mirror_test() -{ - RET=0 - - # It is forbidden in mlxsw driver to have multiple mirror - # actions in a single rule. 
- - tc qdisc add dev $swp1 clsact - - tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 \ - action mirred egress mirror dev $swp2 - check_err $? "Failed to add rule with single mirror action" - - tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower - - tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ - skip_sw dst_ip 192.0.2.2 \ - action mirred egress mirror dev $swp2 \ - action mirred egress mirror dev $swp1 - check_fail $? "Incorrect success to add rule with two mirror actions" - - tc qdisc del dev $swp1 clsact - - log_test "multi mirror" -} - -setup_prepare() -{ - swp1=${NETIFS[p1]} - swp2=${NETIFS[p2]} - - vrf_prepare - - switch_create -} - -cleanup() -{ - pre_cleanup - - switch_destroy - - vrf_cleanup -} - -check_tc_shblock_support - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh new file mode 100755 index 000000000000..68c80d0ec1ec --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + shared_block_drop_test + egress_redirect_test + multi_mirror_test +" +NUM_NETIFS=2 + +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh + +switch_create() +{ + simple_if_init $swp1 192.0.2.1/24 + simple_if_init $swp2 192.0.2.2/24 +} + +switch_destroy() +{ + simple_if_fini $swp2 192.0.2.2/24 + simple_if_fini $swp1 192.0.2.1/24 +} + +shared_block_drop_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have mixed-bound + # shared block with a drop rule. + + tc qdisc add dev $swp1 ingress_block 22 clsact + check_err $? 
"Failed to create clsact with ingress block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add drop rule to ingress bound block" + + tc qdisc add dev $swp2 ingress_block 22 clsact + check_err $? "Failed to create another clsact with ingress shared block" + + tc qdisc del dev $swp2 clsact + + tc qdisc add dev $swp2 egress_block 22 clsact + check_fail $? "Incorrect success to create another clsact with egress shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc add dev $swp2 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block after blocker drop rule removed" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? "Incorrect success to add drop rule to mixed bound block" + + tc qdisc del dev $swp1 clsact + + tc qdisc add dev $swp1 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add drop rule to egress bound shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + log_test "shared block drop" +} + +egress_redirect_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have mirred redirect on + # egress-bound block. + + tc qdisc add dev $swp1 ingress_block 22 clsact + check_err $? "Failed to create clsact with ingress block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_err $? "Failed to add redirect rule to ingress bound block" + + tc qdisc add dev $swp2 ingress_block 22 clsact + check_err $? 
"Failed to create another clsact with ingress shared block" + + tc qdisc del dev $swp2 clsact + + tc qdisc add dev $swp2 egress_block 22 clsact + check_fail $? "Incorrect success to create another clsact with egress shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc add dev $swp2 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block after blocker redirect rule removed" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to mixed bound block" + + tc qdisc del dev $swp1 clsact + + tc qdisc add dev $swp1 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to egress bound shared block" + + tc qdisc del dev $swp2 clsact + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to egress bound block" + + tc qdisc del dev $swp1 clsact + + log_test "shared block drop" +} + +multi_mirror_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have multiple mirror + # actions in a single rule. + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress mirror dev $swp2 + check_err $? 
"Failed to add rule with single mirror action" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress mirror dev $swp2 \ + action mirred egress mirror dev $swp1 + check_fail $? "Incorrect success to add rule with two mirror actions" + + tc qdisc del dev $swp1 clsact + + log_test "multi mirror" +} + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + + vrf_prepare + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + vrf_cleanup +} + +check_tc_shblock_support + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 240fe73457fbfc13cb30d1d16064f19590ff10f6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 9 May 2020 23:06:09 +0300 Subject: selftests: mlxsw: tc_restrictions: add test to check sample action restrictions Check that matchall rules with sample actions are not possible to be inserted to egress. 
Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/mlxsw/tc_restrictions.sh | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index 68c80d0ec1ec..a67e80315e47 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -7,6 +7,7 @@ ALL_TESTS=" shared_block_drop_test egress_redirect_test multi_mirror_test + matchall_sample_egress_test " NUM_NETIFS=2 @@ -155,6 +156,30 @@ multi_mirror_test() log_test "multi mirror" } +matchall_sample_egress_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have matchall with sample action + # bound on egress + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol all pref 1 handle 101 \ + matchall skip_sw action sample rate 100 group 1 + check_err $? "Failed to add rule with sample action on ingress" + + tc filter del dev $swp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $swp1 egress protocol all pref 1 handle 101 \ + matchall skip_sw action sample rate 100 group 1 + check_fail $? "Incorrect success to add rule with sample action on egress" + + tc qdisc del dev $swp1 clsact + + log_test "matchall sample egress" +} + setup_prepare() { swp1=${NETIFS[p1]} -- cgit v1.2.3 From aa7431123fc6f36574d9cc23be24dc802bb4cfa5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 9 May 2020 23:06:10 +0300 Subject: selftests: mlxsw: tc_restrictions: add couple of test for the correct matchall-flower ordering Make sure that the driver restricts incorrect order of inserted matchall vs. flower rules. 
Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/mlxsw/tc_restrictions.sh | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index a67e80315e47..9241250c5921 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -8,6 +8,9 @@ ALL_TESTS=" egress_redirect_test multi_mirror_test matchall_sample_egress_test + matchall_mirror_behind_flower_ingress_test + matchall_sample_behind_flower_ingress_test + matchall_mirror_behind_flower_egress_test " NUM_NETIFS=2 @@ -180,6 +183,110 @@ matchall_sample_egress_test() log_test "matchall sample egress" } +matchall_behind_flower_ingress_test() +{ + local action=$1 + local action_args=$2 + + RET=0 + + # On ingress, all matchall-mirror and matchall-sample + # rules have to be in front of the flower rules + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + + tc filter add dev $swp1 ingress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + check_err $? "Failed to add matchall rule in front of a flower rule" + + tc filter del dev $swp1 ingress protocol all pref 9 handle 102 matchall + + tc filter add dev $swp1 ingress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + check_fail $? "Incorrect success to add matchall rule behind a flower rule" + + tc filter del dev $swp1 ingress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 ingress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + + tc filter add dev $swp1 ingress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? 
"Failed to add flower rule behind a matchall rule" + + tc filter del dev $swp1 ingress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 ingress protocol ip pref 8 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? "Incorrect success to add flower rule in front of a matchall rule" + + tc qdisc del dev $swp1 clsact + + log_test "matchall $action flower ingress" +} + +matchall_mirror_behind_flower_ingress_test() +{ + matchall_behind_flower_ingress_test "mirror" "mirred egress mirror dev $swp2" +} + +matchall_sample_behind_flower_ingress_test() +{ + matchall_behind_flower_ingress_test "sample" "sample rate 100 group 1" +} + +matchall_behind_flower_egress_test() +{ + local action=$1 + local action_args=$2 + + RET=0 + + # On egress, all matchall-mirror rules have to be behind the flower rules + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + + tc filter add dev $swp1 egress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + check_err $? "Failed to add matchall rule in front of a flower rule" + + tc filter del dev $swp1 egress protocol all pref 11 handle 102 matchall + + tc filter add dev $swp1 egress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + check_fail $? "Incorrect success to add matchall rule behind a flower rule" + + tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 egress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + + tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add flower rule behind a matchall rule" + + tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 egress protocol ip pref 12 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? 
"Incorrect success to add flower rule in front of a matchall rule" + + tc qdisc del dev $swp1 clsact + + log_test "matchall $action flower egress" +} + +matchall_mirror_behind_flower_egress_test() +{ + matchall_behind_flower_egress_test "mirror" "mirred egress mirror dev $swp2" +} + setup_prepare() { swp1=${NETIFS[p1]} -- cgit v1.2.3 From 15d83c4d7cef5c067a8b075ce59e97df4f60706e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:00 -0700 Subject: bpf: Allow loading of a bpf_iter program A bpf_iter program is a tracing program with attach type BPF_TRACE_ITER. The load attribute attach_btf_id is used by the verifier against a particular kernel function, which represents a target, e.g., __bpf_iter__bpf_map for target bpf_map which is implemented later. The program return value must be 0 or 1 for now. 0 : successful, except potential seq_file buffer overflow which is handled by seq_file reader. 1 : request to restart the same object In the future, other return values may be used for filtering or teminating the iterator. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175900.2474947-1-yhs@fb.com --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_iter.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 21 +++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 62 insertions(+) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c78b86fe38..f28bdd714754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1127,6 +1127,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +#define BPF_ITER_FUNC_PREFIX "__bpf_iter__" + typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); @@ -1140,6 +1142,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); +bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6e5e7caa3739..c8a5325cc8d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -218,6 +218,7 @@ enum bpf_attach_type { BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, BPF_LSM_MAC, + BPF_TRACE_ITER, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5a8119d17d14..dec182d8395a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -12,6 +12,7 @@ struct bpf_iter_target_info { bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 btf_id; /* cached value */ }; static struct list_head targets = LIST_HEAD_INIT(targets); @@ -57,3 
+58,38 @@ void bpf_iter_unreg_target(const char *target) WARN_ON(found == false); } + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + return supported; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 70ad009577f8..d725ff7d11db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type != BPF_TRACE_ITER) + return 0; + break; default: return 0; } @@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = 
btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6e5e7caa3739..c8a5325cc8d0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -218,6 +218,7 @@ enum bpf_attach_type { BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, BPF_LSM_MAC, + BPF_TRACE_ITER, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From de4e05cac46d206f9090051ef09930514bff73e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:01 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_CREATE Given a bpf program, the step to create an anonymous bpf iterator is: - create a bpf_iter_link, which combines bpf program and the target. In the future, there could be more information recorded in the link. A link_fd will be returned to the user space. - create an anonymous bpf iterator with the given link_fd. The bpf_iter_link can be pinned to bpffs mount file system to create a file based bpf iterator as well. The benefits of using bpf_iter_link: - using bpf link simplifies design and implementation as bpf link is used for other tracing bpf programs. - for file based bpf iterator, bpf_iter_link provides a standard way to replace underlying bpf programs. - for both anonymous and file based iterators, bpf link query capability can be leveraged. The patch added support of tracing/iter programs for BPF_LINK_CREATE. A new link type BPF_LINK_TYPE_ITER is added to facilitate link querying. Currently, only prog_id is needed, so no additional in-kernel show_fdinfo() and fill_link_info() hook is needed for BPF_LINK_TYPE_ITER link. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175901.2475084-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_iter.c | 62 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 14 ++++++++++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 80 insertions(+) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f28bdd714754..e93d2d33c82c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1143,6 +1143,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 8345cdf553b8..29d22752fc87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) #ifdef CONFIG_CGROUP_BPF BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif +BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8a5325cc8d0..1e8dfff5d5d4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -229,6 +229,7 @@ enum bpf_link_type { BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dec182d8395a..03f5832909db 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -15,6 +15,11 @@ struct bpf_iter_target_info { u32 btf_id; /* cached value */ }; +struct bpf_iter_link { + struct bpf_link 
link; + struct bpf_iter_target_info *tinfo; +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); @@ -93,3 +98,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } + +static void bpf_iter_link_release(struct bpf_link *link) +{ +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, +}; + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_link *link; + bool existed = false; + u32 prog_btf_id; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + prog_btf_id = prog->aux->attach_btf_id; + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + return bpf_link_settle(&link_primer); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bb1ab7da6103..6ffe2d8fb6c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3729,6 +3731,15 @@ err_put: return err; } 
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER && + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c8a5325cc8d0..1e8dfff5d5d4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -229,6 +229,7 @@ enum bpf_link_type { BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From ac51d99bf81caac8d8881fe52098948110d0de68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:05 -0700 Subject: bpf: Create anonymous bpf iterator A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data are referenced by targets. The bpf_iter infrastructure allocated additional space at seq_file->private before the space used by targets to store some meta data, e.g., prog: prog to run session_id: an unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session done_stop: an internal state to decide whether bpf program should be called in seq_ops->stop() or not The seq_num will start from 0 for valid objects. 
The bpf program may see the same seq_num more than once if - seq_file buffer overflow happens and the same object is retried by bpf_seq_read(), or - the bpf program explicitly requests a retry of the same object Since module is not supported for bpf_iter, all target registration happens at __init time, so there is no need to change bpf_iter_unreg_target() as it is used mostly in error path of the init function at which time no bpf iterators have been created yet. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175905.2475770-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++ kernel/bpf/bpf_iter.c | 129 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 26 +++++++++ tools/include/uapi/linux/bpf.h | 6 ++ 5 files changed, 168 insertions(+) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e93d2d33c82c..80b1b9d8a638 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1144,6 +1144,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_iter_new_fd(struct bpf_link *link); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e8dfff5d5d4..708763f702e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_cmd { BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -614,6 +615,11 @@ union bpf_attr { __u32 type; } enable_stats; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The 
description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 832973ee80fa..e7129b57865f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include +#include #include #include @@ -20,12 +21,24 @@ struct bpf_iter_link { struct bpf_iter_target_info *tinfo; }; +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* incremented on every opened seq_file */ +static atomic64_t session_id; + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -149,6 +162,33 @@ done: return copied; } +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->tinfo->fini_seq_private) + iter_priv->tinfo->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +static const struct file_operations bpf_iter_fops = { + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -309,3 +349,92 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return bpf_link_settle(&link_primer); } + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + 
priv_data->tinfo = tinfo; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + tinfo->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (tinfo->init_seq_private) { + err = tinfo->init_seq_private(priv_data->target_private); + if (err) + goto release_seq_file; + } + + init_seq_meta(priv_data, tinfo, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ffe2d8fb6c7..a293e88ee01a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3941,6 +3941,29 @@ 
static int bpf_enable_stats(union bpf_attr *attr) return -EINVAL; } +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1e8dfff5d5d4..708763f702e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_cmd { BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -614,6 +615,11 @@ union bpf_attr { __u32 type; } enable_stats; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 492e639f0c222784e2e0f121966375f641c61b15 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:14 -0700 Subject: bpf: Add bpf_seq_printf and bpf_seq_write helpers Two helpers bpf_seq_printf and bpf_seq_write, are added for writing data to the seq_file buffer. bpf_seq_printf supports common format string flag/width/type fields so at least I can get identical results for netlink and ipv6_route targets. 
For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW specifically indicates a write failure due to overflow, which means the object will be repeated in the next bpf invocation if object collection stays the same. Note that if the object collection is changed, depending on how collection traversal is done, even if the object is still in the collection, it may not be visited. For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to read kernel memory. Reading kernel memory may fail in the following two cases: - invalid kernel address, or - valid kernel address but requiring a major fault If reading kernel memory failed, the %s string will be an empty string and %p{i,I}{4,6} will be all 0. Not returning error to bpf program is consistent with what bpf_trace_printk() does for now. bpf_seq_printf may return -EBUSY meaning that internal percpu buffer for memory copy of strings or other pointees is not available. Bpf program can return 1 to indicate it wants the same object to be repeated. Right now, this should not happen on non-RT kernels since migrate_disable(), which guards bpf prog call, calls preempt_disable(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com --- include/uapi/linux/bpf.h | 39 +++++++- kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 + tools/include/uapi/linux/bpf.h | 39 +++++++- 4 files changed, 292 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 708763f702e1..9d1932e23cec 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3077,6 +3077,41 @@ union bpf_attr { * See: clock_gettime(CLOCK_BOOTTIME) * Return * Current *ktime*. 
+ * + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * seq_printf uses seq_file seq_printf() to print out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a u64 array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the *data* size in term of bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what bpf_trace_printk() does for now. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EBUSY** Percpu memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. + * * **-E2BIG** Too many format specifiers. + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * seq_write uses seq_file seq_write() to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3204,7 +3239,9 @@ union bpf_attr { FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ FN(sk_assign), \ - FN(ktime_get_boot_ns), + FN(ktime_get_boot_ns), \ + FN(seq_printf), \ + FN(seq_write), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e875c95d3ced..d961428fb5b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only 
printable ascii for now. */ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 
4 : 16; + + err = probe_kernel_read(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +static int bpf_seq_printf_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? 
-EOVERFLOW : 0; +} + +static int bpf_seq_write_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index f43d193aff3a..ded304c96a05 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -414,6 +414,7 @@ class PrinterHelpers(Printer): 'struct sk_reuseport_md', 'struct sockaddr', 'struct tcphdr', + 'struct seq_file', 'struct __sk_buff', 'struct sk_msg_md', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct sk_reuseport_md', 'struct sockaddr', 'struct tcphdr', + 'struct seq_file', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 708763f702e1..9d1932e23cec 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3077,6 +3077,41 @@ union bpf_attr { * See: clock_gettime(CLOCK_BOOTTIME) * Return * Current *ktime*. + * + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * seq_printf uses seq_file seq_printf() to print out the format string. + * The *m* represents the seq_file. 
The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a u64 array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the *data* size in term of bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what bpf_trace_printk() does for now. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EBUSY** Percpu memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. + * * **-E2BIG** Too many format specifiers. + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * seq_write uses seq_file seq_write() to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3204,7 +3239,9 @@ union bpf_attr { FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ FN(sk_assign), \ - FN(ktime_get_boot_ns), + FN(ktime_get_boot_ns), \ + FN(seq_printf), \ + FN(seq_write), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From c09add2fbc5aece00a5b54a48ce39fd4e3284d87 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:17 -0700 Subject: tools/libbpf: Add bpf_iter support Two new libbpf APIs are added to support bpf_iter: - bpf_program__attach_iter Given a bpf program and additional parameters, which is none now, returns a bpf_link. - bpf_iter_create syscall level API to create a bpf iterator. The macro BPF_SEQ_PRINTF are also introduced. The format looks like: BPF_SEQ_PRINTF(seq, "task id %d\n", pid); This macro can help bpf program writers with nicer bpf_seq_printf syntax similar to the kernel one. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175917.2476936-1-yhs@fb.com --- tools/lib/bpf/bpf.c | 10 +++++++++ tools/lib/bpf/bpf.h | 2 ++ tools/lib/bpf/bpf_tracing.h | 16 ++++++++++++++ tools/lib/bpf/libbpf.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 9 ++++++++ tools/lib/bpf/libbpf.map | 2 ++ 6 files changed, 91 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 43322f0d6c7f..a7329b671c41 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -619,6 +619,16 @@ int bpf_link_update(int link_fd, int new_prog_fd, return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr)); } +int bpf_iter_create(int link_fd) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.iter_create.link_fd = link_fd; + + return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr)); +} + int bpf_prog_query(int target_fd, enum bpf_attach_type type, 
__u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) { diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 1901b2777854..1b6015b21ba8 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -187,6 +187,8 @@ struct bpf_link_update_opts { LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, const struct bpf_link_update_opts *opts); +LIBBPF_API int bpf_iter_create(int link_fd); + struct bpf_prog_test_run_attr { int prog_fd; int repeat; diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index f3f3c3fb98cb..cf97d07692b4 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -413,4 +413,20 @@ typeof(name(0)) name(struct pt_regs *ctx) \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) +/* + * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values + * in a structure. + */ +#define BPF_SEQ_PRINTF(seq, fmt, args...) \ + ({ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[] = { args }; \ + _Pragma("GCC diagnostic pop") \ + int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ + ___ret; \ + }) + #endif diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 977add1b73e2..6c2f46908f4d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6586,6 +6586,8 @@ static struct bpf_link *attach_trace(const struct bpf_sec_def *sec, struct bpf_program *prog); static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, struct bpf_program *prog); +static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, + struct bpf_program *prog); static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), @@ -6629,6 +6631,10 @@ static const struct bpf_sec_def section_defs[] = { .is_attach_btf = true, .expected_attach_type = BPF_LSM_MAC, .attach_fn 
= attach_lsm), + SEC_DEF("iter/", TRACING, + .expected_attach_type = BPF_TRACE_ITER, + .is_attach_btf = true, + .attach_fn = attach_iter), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), @@ -6891,6 +6897,7 @@ invalid_prog: #define BTF_TRACE_PREFIX "btf_trace_" #define BTF_LSM_PREFIX "bpf_lsm_" +#define BTF_ITER_PREFIX "__bpf_iter__" #define BTF_MAX_NAME_SIZE 128 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, @@ -6921,6 +6928,9 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name, else if (attach_type == BPF_LSM_MAC) err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name, BTF_KIND_FUNC); + else if (attach_type == BPF_TRACE_ITER) + err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name, + BTF_KIND_FUNC); else err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); @@ -7848,6 +7858,12 @@ static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, return bpf_program__attach_lsm(prog); } +static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, + struct bpf_program *prog) +{ + return bpf_program__attach_iter(prog, NULL); +} + struct bpf_link * bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) { @@ -7882,6 +7898,42 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) return link; } +struct bpf_link * +bpf_program__attach_iter(struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts) +{ + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + + if (!OPTS_VALID(opts, bpf_iter_attach_opts)) + return ERR_PTR(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("program '%s': can't attach before loaded\n", + bpf_program__title(prog, false)); + return ERR_PTR(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return ERR_PTR(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + link_fd = 
bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("program '%s': failed to attach to iterator: %s\n", + bpf_program__title(prog, false), + libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return ERR_PTR(link_fd); + } + link->fd = link_fd; + return link; +} + struct bpf_link *bpf_program__attach(struct bpf_program *prog) { const struct bpf_sec_def *sec_def; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f1dacecb1619..8ea69558f0a8 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -258,6 +258,15 @@ struct bpf_map; LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); +struct bpf_iter_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ +}; +#define bpf_iter_attach_opts__last_field sz + +LIBBPF_API struct bpf_link * +bpf_program__attach_iter(struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts); + struct bpf_insn; /* diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index e03bd4db827e..0133d469d30b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -258,6 +258,8 @@ LIBBPF_0.0.8 { LIBBPF_0.0.9 { global: bpf_enable_stats; + bpf_iter_create; bpf_link_get_fd_by_id; bpf_link_get_next_id; + bpf_program__attach_iter; } LIBBPF_0.0.8; -- cgit v1.2.3 From 5fbc220862fc7a53a0455ccd2d96c82141e222d4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:19 -0700 Subject: tools/libpf: Add offsetof/container_of macro in bpf_helpers.h These two helpers will be used later in bpf_iter bpf program bpf_iter_netlink.c. Put them in bpf_helpers.h since they could be useful in other cases. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175919.2477104-1-yhs@fb.com --- tools/lib/bpf/bpf_helpers.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index da00b87aa199..f67dce2af802 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -36,6 +36,20 @@ #define __weak __attribute__((weak)) #endif +/* + * Helper macro to manipulate data structures + */ +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#endif +#ifndef container_of +#define container_of(ptr, type, member) \ + ({ \ + void *__mptr = (void *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + /* * Helper structure used by eBPF C program * to describe BPF map attributes to libbpf loader -- cgit v1.2.3 From 9406b485dea5e25bed7c81cd822747d494cc8bde Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:20 -0700 Subject: tools/bpftool: Add bpf_iter support for bptool Currently, only one command is supported: bpftool iter pin OBJ PATH. It will pin the trace/iter bpf program in the object file OBJ to PATH, where PATH should be on a bpffs mount. For example, $ bpftool iter pin ./bpf_iter_ipv6_route.o \ /sys/fs/bpf/my_route User can then do a `cat` to print out the results: $ cat /sys/fs/bpf/my_route fe800000000000000000000000000000 40 00000000000000000000000000000000 ... 00000000000000000000000000000000 00 00000000000000000000000000000000 ... 00000000000000000000000000000001 80 00000000000000000000000000000000 ... fe800000000000008c0162fffebdfd57 80 00000000000000000000000000000000 ... ff000000000000000000000000000000 08 00000000000000000000000000000000 ... 00000000000000000000000000000000 00 00000000000000000000000000000000 ... The implementation for ipv6_route iterator is in one of the subsequent patches. This patch also added BPF_LINK_TYPE_ITER to link query.
In the future, we may add additional parameters to pin command by parameterizing the bpf iterator. For example, a map_id or pid may be added to let bpf program only traverses a single map or task, similar to kernel seq_file single_open(). We may also add introspection command for targets/iterators by leveraging the bpf_iter itself. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200509175920.2477247-1-yhs@fb.com --- tools/bpf/bpftool/Documentation/bpftool-iter.rst | 83 ++++++++++++++++++++++ tools/bpf/bpftool/bash-completion/bpftool | 13 ++++ tools/bpf/bpftool/iter.c | 88 ++++++++++++++++++++++++ tools/bpf/bpftool/link.c | 1 + tools/bpf/bpftool/main.c | 3 +- tools/bpf/bpftool/main.h | 1 + 6 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 tools/bpf/bpftool/Documentation/bpftool-iter.rst create mode 100644 tools/bpf/bpftool/iter.c (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst new file mode 100644 index 000000000000..13b173d93890 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -0,0 +1,83 @@ +============ +bpftool-iter +============ +------------------------------------------------------------------------------- +tool to create BPF iterators +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **iter** *COMMAND* + + *COMMANDS* := { **pin** | **help** } + +ITER COMMANDS +=================== + +| **bpftool** **iter pin** *OBJ* *PATH* +| **bpftool** **iter help** +| +| *OBJ* := /a/file/of/bpf_iter_target.o + + +DESCRIPTION +=========== + **bpftool iter pin** *OBJ* *PATH* + A bpf iterator combines a kernel iterating of + particular kernel data (e.g., tasks, bpf_maps, etc.) + and a bpf program called for each kernel data object + (e.g., one task, one bpf_map, etc.). 
User space can + *read* kernel iterator output through *read()* syscall. + + The *pin* command creates a bpf iterator from *OBJ*, + and pin it to *PATH*. The *PATH* should be located + in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions + of *bpffs*. + + User can then *cat PATH* to see the bpf iterator output. + + **bpftool iter help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -V, --version + Print version number (similar to **bpftool version**). + + -d, --debug + Print all logs available, even debug-level information. This + includes logs from libbpf as well as from the verifier, when + attempting to load programs. + +EXAMPLES +======== +**# bpftool iter pin bpf_iter_netlink.o /sys/fs/bpf/my_netlink** + +:: + + Create a file-based bpf iterator from bpf_iter_netlink.o and pin it + to /sys/fs/bpf/my_netlink + + +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-link**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8), + **bpftool-btf**\ (8) + **bpftool-gen**\ (8) + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index fc989ead7313..9f0f20e73b87 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -610,6 +610,19 @@ _bpftool() ;; esac ;; + iter) + case $command in + pin) + _filedir + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'pin help' \ + -- "$cur" ) ) + ;; + esac + ;; map) local MAP_TYPE='id pinned name' case $command in diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c new file mode 100644 index 000000000000..eb5987a0c3b6 --- /dev/null +++ b/tools/bpf/bpftool/iter.c @@ -0,0 +1,88 @@ +// 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (C) 2020 Facebook + +#define _GNU_SOURCE +#include +#include + +#include "main.h" + +static int do_pin(int argc, char **argv) +{ + const char *objfile, *path; + struct bpf_program *prog; + struct bpf_object *obj; + struct bpf_link *link; + int err; + + if (!REQ_ARGS(2)) + usage(); + + objfile = GET_ARG(); + path = GET_ARG(); + + obj = bpf_object__open(objfile); + if (IS_ERR(obj)) { + p_err("can't open objfile %s", objfile); + return -1; + } + + err = bpf_object__load(obj); + if (err) { + p_err("can't load objfile %s", objfile); + goto close_obj; + } + + prog = bpf_program__next(NULL, obj); + if (!prog) { + p_err("can't find bpf program in objfile %s", objfile); + goto close_obj; + } + + link = bpf_program__attach_iter(prog, NULL); + if (IS_ERR(link)) { + err = PTR_ERR(link); + p_err("attach_iter failed for program %s", + bpf_program__name(prog)); + goto close_obj; + } + + err = mount_bpffs_for_pin(path); + if (err) + goto close_link; + + err = bpf_link__pin(link, path); + if (err) { + p_err("pin_iter failed for program %s to path %s", + bpf_program__name(prog), path); + goto close_link; + } + +close_link: + bpf_link__destroy(link); +close_obj: + bpf_object__close(obj); + return err; +} + +static int do_help(int argc, char **argv) +{ + fprintf(stderr, + "Usage: %s %s pin OBJ PATH\n" + " %s %s help\n" + "\n", + bin_name, argv[-2], bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "help", do_help }, + { "pin", do_pin }, + { 0 } +}; + +int do_iter(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index adc7dc431ed8..b6a0b35c78ae 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -16,6 +16,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", [BPF_LINK_TYPE_TRACING] = "tracing", [BPF_LINK_TYPE_CGROUP] = "cgroup", 
+ [BPF_LINK_TYPE_ITER] = "iter", }; static int link_parse_fd(int *argc, char ***argv) diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 1413a154806e..46bd716a9d86 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -59,7 +59,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n" + " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -224,6 +224,7 @@ static const struct cmd cmds[] = { { "btf", do_btf }, { "gen", do_gen }, { "struct_ops", do_struct_ops }, + { "iter", do_iter }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 9b1fb81a8331..a41cefabccaf 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -199,6 +199,7 @@ int do_feature(int argc, char **argv); int do_btf(int argc, char **argv); int do_gen(int argc, char **argv); int do_struct_ops(int argc, char **argv); +int do_iter(int argc, char **argv); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); -- cgit v1.2.3 From 7c128a6bbd4f5b6780a90f3ce9aff192b7dd9d6a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:21 -0700 Subject: tools/bpf: selftests: Add iterator programs for ipv6_route and netlink Two bpf programs are added in this patch for netlink and ipv6_route target. On my VM, I am able to achieve identical results compared to /proc/net/netlink and /proc/net/ipv6_route. $ cat /proc/net/netlink sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode 000000002c42d58b 0 0 00000000 0 0 0 2 0 7 00000000a4e8b5e1 0 1 00000551 0 0 0 2 0 18719 00000000e1b1c195 4 0 00000000 0 0 0 2 0 16422 000000007e6b29f9 6 0 00000000 0 0 0 2 0 16424 .... 
00000000159a170d 15 1862 00000002 0 0 0 2 0 1886 000000009aca4bc9 15 3918224839 00000002 0 0 0 2 0 19076 00000000d0ab31d2 15 1 00000002 0 0 0 2 0 18683 000000008398fb08 16 0 00000000 0 0 0 2 0 27 $ cat /sys/fs/bpf/my_netlink sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode 000000002c42d58b 0 0 00000000 0 0 0 2 0 7 00000000a4e8b5e1 0 1 00000551 0 0 0 2 0 18719 00000000e1b1c195 4 0 00000000 0 0 0 2 0 16422 000000007e6b29f9 6 0 00000000 0 0 0 2 0 16424 .... 00000000159a170d 15 1862 00000002 0 0 0 2 0 1886 000000009aca4bc9 15 3918224839 00000002 0 0 0 2 0 19076 00000000d0ab31d2 15 1 00000002 0 0 0 2 0 18683 000000008398fb08 16 0 00000000 0 0 0 2 0 27 $ cat /proc/net/ipv6_route fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo 00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001 lo fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001 eth0 ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo $ cat /sys/fs/bpf/my_ipv6_route fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo 00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 
00000000 80200001 lo fe80000000000000c04b03fffe7827ce 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001 eth0 ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000003 00000000 00000001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175921.2477493-1-yhs@fb.com --- .../selftests/bpf/progs/bpf_iter_ipv6_route.c | 62 ++++++++++++++++++++ .../testing/selftests/bpf/progs/bpf_iter_netlink.c | 66 ++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_netlink.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c new file mode 100644 index 000000000000..ab9e2650e021 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +extern bool CONFIG_IPV6_SUBTREES __kconfig __weak; + +#define RTF_GATEWAY 0x0002 +#define IFNAMSIZ 16 +#define fib_nh_gw_family nh_common.nhc_gw_family +#define fib_nh_gw6 nh_common.nhc_gw.ipv6 +#define fib_nh_dev nh_common.nhc_dev + +SEC("iter/ipv6_route") +int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct fib6_info *rt = ctx->rt; + const struct net_device *dev; + struct fib6_nh *fib6_nh; + unsigned int flags; + struct nexthop *nh; + + if (rt == (void *)0) + return 0; + + fib6_nh = &rt->fib6_nh[0]; + flags = rt->fib6_flags; + + 
/* FIXME: nexthop_is_multipath is not handled here. */ + nh = rt->nh; + if (rt->nh) + fib6_nh = &nh->nh_info->fib6_nh; + + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); + + if (CONFIG_IPV6_SUBTREES) + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_src.addr, + rt->fib6_src.plen); + else + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 00 "); + + if (fib6_nh->fib_nh_gw_family) { + flags |= RTF_GATEWAY; + BPF_SEQ_PRINTF(seq, "%pi6 ", &fib6_nh->fib_nh_gw6); + } else { + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 "); + } + + dev = fib6_nh->fib_nh_dev; + if (dev) + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric, + rt->fib6_ref.refs.counter, 0, flags, dev->name); + else + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x\n", rt->fib6_metric, + rt->fib6_ref.refs.counter, 0, flags); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c new file mode 100644 index 000000000000..6b40a233d4e0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#define sk_rmem_alloc sk_backlog.rmem_alloc +#define sk_refcnt __sk_common.skc_refcnt + +static inline struct inode *SOCK_INODE(struct socket *socket) +{ + return &container_of(socket, struct socket_alloc, socket)->vfs_inode; +} + +SEC("iter/netlink") +int dump_netlink(struct bpf_iter__netlink *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct netlink_sock *nlk = ctx->sk; + unsigned long group, ino; + struct inode *inode; + struct socket *sk; + struct sock *s; + + if (nlk == (void *)0) + return 0; + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, "sk Eth Pid Groups " + "Rmem Wmem Dump Locks Drops " + "Inode\n"); + + s = &nlk->sk; + BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); + + if 
(!nlk->groups) { + group = 0; + } else { + /* FIXME: temporary use bpf_probe_read here, needs + * verifier support to do direct access. + */ + bpf_probe_read(&group, sizeof(group), &nlk->groups[0]); + } + BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ", + nlk->portid, (u32)group, + s->sk_rmem_alloc.counter, + s->sk_wmem_alloc.refs.counter - 1, + nlk->cb_running, s->sk_refcnt.refs.counter); + + sk = s->sk_socket; + if (!sk) { + ino = 0; + } else { + /* FIXME: container_of inside SOCK_INODE has a forced + * type conversion, and direct access cannot be used + * with current verifier. + */ + inode = SOCK_INODE(sk); + bpf_probe_read(&ino, sizeof(ino), &inode->i_ino); + } + BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino); + + return 0; +} -- cgit v1.2.3 From acf61631746c01850a9df0cd5617c5c29214776c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:22 -0700 Subject: tools/bpf: selftests: Add iter progs for bpf_map/task/task_file The implementation is arbitrary, just to show how the bpf programs can be written for bpf_map/task/task_file. They can be customized for specific needs. For example, for bpf_map, the iterator prints out: $ cat /sys/fs/bpf/my_bpf_map id refcnt usercnt locked_vm 3 2 0 20 6 2 0 20 9 2 0 20 12 2 0 20 13 2 0 20 16 2 0 20 19 2 0 20 %%% END %%% For task, the iterator prints out: $ cat /sys/fs/bpf/my_task tgid gid 1 1 2 2 .... 1944 1944 1948 1948 1949 1949 1953 1953 === END === For task/file, the iterator prints out: $ cat /sys/fs/bpf/my_task_file tgid gid fd file 1 1 0 ffffffff95c97600 1 1 1 ffffffff95c97600 1 1 2 ffffffff95c97600 .... 1895 1895 255 ffffffff95c8fe00 1932 1932 0 ffffffff95c8fe00 1932 1932 1 ffffffff95c8fe00 1932 1932 2 ffffffff95c8fe00 1932 1932 3 ffffffff95c185c0 This is able to print out all open files (fd and file->f_op), so user can compare f_op against a particular kernel file operations to find what it is.
For example, from /proc/kallsyms, we can find ffffffff95c185c0 r eventfd_fops so we will know tgid 1932 fd 3 is an eventfd file descriptor. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175922.2477576-1-yhs@fb.com --- .../testing/selftests/bpf/progs/bpf_iter_bpf_map.c | 28 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_iter_task.c | 25 +++++++++++++++++++ .../selftests/bpf/progs/bpf_iter_task_file.c | 26 ++++++++++++++++++++ 3 files changed, 79 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_file.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c new file mode 100644 index 000000000000..4867cd3445c8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_map *map = ctx->map; + + if (map == (void *)0) { + BPF_SEQ_PRINTF(seq, " %%%%%% END %%%%%%\n"); + return 0; + } + + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id refcnt usercnt locked_vm\n"); + + BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, + map->usercnt.counter, + map->memory.user->locked_vm.counter); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c new file mode 100644 index 000000000000..90f9011c57ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -0,0 +1,25 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + + if (task == (void *)0) { + BPF_SEQ_PRINTF(seq, " === END ===\n"); + return 0; + } + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, " tgid gid\n"); + + BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c new file mode 100644 index 000000000000..c6ced38f0880 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("iter/task_file") +int dump_task_file(struct bpf_iter__task_file *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + __u32 fd = ctx->fd; + struct file *file = ctx->file; + + if (task == (void *)0 || file == (void *)0) + return 0; + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, " tgid gid fd file\n"); + + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, + (long)file->f_op); + return 0; +} -- cgit v1.2.3 From 6879c042e10584ea9d5e2204939cafadcd500465 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:23 -0700 Subject: tools/bpf: selftests: Add bpf_iter selftests The added test includes the following subtests: - test verifier change for btf_id_or_null - test load/create_iter/read for ipv6_route/netlink/bpf_map/task/task_file - test anon bpf iterator - test anon bpf iterator reading one char at a time - test file bpf iterator - test overflow (single bpf program output not overflow) - test overflow (single bpf program output overflows) 
- test bpf prog returning 1 The ipv6_route tests the following verifier change - access fields in the variable length array of the structure. The netlink load tests the following verifier change - put a btf_id ptr value in a stack and accessible to tracing/iter programs. The anon bpf iterator also tests link auto attach through skeleton. $ test_progs -n 2 #2/1 btf_id_or_null:OK #2/2 ipv6_route:OK #2/3 netlink:OK #2/4 bpf_map:OK #2/5 task:OK #2/6 task_file:OK #2/7 anon:OK #2/8 anon-read-one-char:OK #2/9 file:OK #2/10 overflow:OK #2/11 overflow-e2big:OK #2/12 prog-ret-1:OK #2 bpf_iter:OK Summary: 1/12 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175923.2477637-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 409 +++++++++++++++++++++ .../selftests/bpf/progs/bpf_iter_test_kern1.c | 4 + .../selftests/bpf/progs/bpf_iter_test_kern2.c | 4 + .../selftests/bpf/progs/bpf_iter_test_kern3.c | 18 + .../selftests/bpf/progs/bpf_iter_test_kern4.c | 52 +++ .../bpf/progs/bpf_iter_test_kern_common.h | 22 ++ 6 files changed, 509 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c new file mode 100644 index 000000000000..87c29dde1cf9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include 
"bpf_iter_ipv6_route.skel.h" +#include "bpf_iter_netlink.skel.h" +#include "bpf_iter_bpf_map.skel.h" +#include "bpf_iter_task.skel.h" +#include "bpf_iter_task_file.skel.h" +#include "bpf_iter_test_kern1.skel.h" +#include "bpf_iter_test_kern2.skel.h" +#include "bpf_iter_test_kern3.skel.h" +#include "bpf_iter_test_kern4.skel.h" + +static int duration; + +static void test_btf_id_or_null(void) +{ + struct bpf_iter_test_kern3 *skel; + + skel = bpf_iter_test_kern3__open_and_load(); + if (CHECK(skel, "bpf_iter_test_kern3__open_and_load", + "skeleton open_and_load unexpectedly succeeded\n")) { + bpf_iter_test_kern3__destroy(skel); + return; + } +} + +static void do_dummy_read(struct bpf_program *prog) +{ + struct bpf_link *link; + char buf[16] = {}; + int iter_fd, len; + + link = bpf_program__attach_iter(prog, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + return; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* not check contents, but ensure read() ends without error */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)); + + close(iter_fd); + +free_link: + bpf_link__destroy(link); +} + +static void test_ipv6_route(void) +{ + struct bpf_iter_ipv6_route *skel; + + skel = bpf_iter_ipv6_route__open_and_load(); + if (CHECK(!skel, "bpf_iter_ipv6_route__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_ipv6_route); + + bpf_iter_ipv6_route__destroy(skel); +} + +static void test_netlink(void) +{ + struct bpf_iter_netlink *skel; + + skel = bpf_iter_netlink__open_and_load(); + if (CHECK(!skel, "bpf_iter_netlink__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_netlink); + + bpf_iter_netlink__destroy(skel); +} + +static void test_bpf_map(void) +{ + struct bpf_iter_bpf_map *skel; + + skel = 
bpf_iter_bpf_map__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_map__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_bpf_map); + + bpf_iter_bpf_map__destroy(skel); +} + +static void test_task(void) +{ + struct bpf_iter_task *skel; + + skel = bpf_iter_task__open_and_load(); + if (CHECK(!skel, "bpf_iter_task__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_task); + + bpf_iter_task__destroy(skel); +} + +static void test_task_file(void) +{ + struct bpf_iter_task_file *skel; + + skel = bpf_iter_task_file__open_and_load(); + if (CHECK(!skel, "bpf_iter_task_file__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_task_file); + + bpf_iter_task_file__destroy(skel); +} + +/* The expected string is less than 16 bytes */ +static int do_read_with_fd(int iter_fd, const char *expected, + bool read_one_char) +{ + int err = -1, len, read_buf_len, start; + char buf[16] = {}; + + read_buf_len = read_one_char ? 1 : 16; + start = 0; + while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) { + start += len; + if (CHECK(start >= 16, "read", "read len %d\n", len)) + return -1; + read_buf_len = read_one_char ? 
1 : 16 - start; + } + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + return -1; + + err = strcmp(buf, expected); + if (CHECK(err, "read", "incorrect read result: buf %s, expected %s\n", + buf, expected)) + return -1; + + return 0; +} + +static void test_anon_iter(bool read_one_char) +{ + struct bpf_iter_test_kern1 *skel; + struct bpf_link *link; + int iter_fd, err; + + skel = bpf_iter_test_kern1__open_and_load(); + if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load", + "skeleton open_and_load failed\n")) + return; + + err = bpf_iter_test_kern1__attach(skel); + if (CHECK(err, "bpf_iter_test_kern1__attach", + "skeleton attach failed\n")) { + goto out; + } + + link = skel->links.dump_task; + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto out; + + do_read_with_fd(iter_fd, "abcd", read_one_char); + close(iter_fd); + +out: + bpf_iter_test_kern1__destroy(skel); +} + +static int do_read(const char *path, const char *expected) +{ + int err, iter_fd; + + iter_fd = open(path, O_RDONLY); + if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n", + path, strerror(errno))) + return -1; + + err = do_read_with_fd(iter_fd, expected, false); + close(iter_fd); + return err; +} + +static void test_file_iter(void) +{ + const char *path = "/sys/fs/bpf/bpf_iter_test1"; + struct bpf_iter_test_kern1 *skel1; + struct bpf_iter_test_kern2 *skel2; + struct bpf_link *link; + int err; + + skel1 = bpf_iter_test_kern1__open_and_load(); + if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load", + "skeleton open_and_load failed\n")) + return; + + link = bpf_program__attach_iter(skel1->progs.dump_task, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + /* unlink this path if it exists. 
*/ + unlink(path); + + err = bpf_link__pin(link, path); + if (CHECK(err, "pin_iter", "pin_iter to %s failed: %d\n", path, err)) + goto free_link; + + err = do_read(path, "abcd"); + if (err) + goto unlink_path; + + /* file based iterator seems working fine. Let us a link update + * of the underlying link and `cat` the iterator again, its content + * should change. + */ + skel2 = bpf_iter_test_kern2__open_and_load(); + if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load", + "skeleton open_and_load failed\n")) + goto unlink_path; + + err = bpf_link__update_program(link, skel2->progs.dump_task); + if (CHECK(err, "update_prog", "update_prog failed\n")) + goto destroy_skel2; + + do_read(path, "ABCD"); + +destroy_skel2: + bpf_iter_test_kern2__destroy(skel2); +unlink_path: + unlink(path); +free_link: + bpf_link__destroy(link); +out: + bpf_iter_test_kern1__destroy(skel1); +} + +static void test_overflow(bool test_e2big_overflow, bool ret1) +{ + __u32 map_info_len, total_read_len, expected_read_len; + int err, iter_fd, map1_fd, map2_fd, len; + struct bpf_map_info map_info = {}; + struct bpf_iter_test_kern4 *skel; + struct bpf_link *link; + __u32 page_size; + char *buf; + + skel = bpf_iter_test_kern4__open(); + if (CHECK(!skel, "bpf_iter_test_kern4__open", + "skeleton open failed\n")) + return; + + /* create two maps: bpf program will only do bpf_seq_write + * for these two maps. The goal is one map output almost + * fills seq_file buffer and then the other will trigger + * overflow and needs restart. 
+ */ + map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (CHECK(map1_fd < 0, "bpf_create_map", + "map_creation failed: %s\n", strerror(errno))) + goto out; + map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (CHECK(map2_fd < 0, "bpf_create_map", + "map_creation failed: %s\n", strerror(errno))) + goto free_map1; + + /* bpf_seq_printf kernel buffer is one page, so one map + * bpf_seq_write will mostly fill it, and the other map + * will partially fill and then trigger overflow and need + * bpf_seq_read restart. + */ + page_size = sysconf(_SC_PAGE_SIZE); + + if (test_e2big_overflow) { + skel->rodata->print_len = (page_size + 8) / 8; + expected_read_len = 2 * (page_size + 8); + } else if (!ret1) { + skel->rodata->print_len = (page_size - 8) / 8; + expected_read_len = 2 * (page_size - 8); + } else { + skel->rodata->print_len = 1; + expected_read_len = 2 * 8; + } + skel->rodata->ret1 = ret1; + + if (CHECK(bpf_iter_test_kern4__load(skel), + "bpf_iter_test_kern4__load", "skeleton load failed\n")) + goto free_map2; + + /* setup filtering map_id in bpf program */ + map_info_len = sizeof(map_info); + err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len); + if (CHECK(err, "get_map_info", "get map info failed: %s\n", + strerror(errno))) + goto free_map2; + skel->bss->map1_id = map_info.id; + + err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len); + if (CHECK(err, "get_map_info", "get map info failed: %s\n", + strerror(errno))) + goto free_map2; + skel->bss->map2_id = map_info.id; + + link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto free_map2; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + buf = malloc(expected_read_len); + if (!buf) + goto close_iter; + + /* do read */ + total_read_len = 0; + if (test_e2big_overflow) { + while ((len = read(iter_fd, buf, 
expected_read_len)) > 0) + total_read_len += len; + + CHECK(len != -1 || errno != E2BIG, "read", + "expected ret -1, errno E2BIG, but get ret %d, error %s\n", + len, strerror(errno)); + goto free_buf; + } else if (!ret1) { + while ((len = read(iter_fd, buf, expected_read_len)) > 0) + total_read_len += len; + + if (CHECK(len < 0, "read", "read failed: %s\n", + strerror(errno))) + goto free_buf; + } else { + do { + len = read(iter_fd, buf, expected_read_len); + if (len > 0) + total_read_len += len; + } while (len > 0 || len == -EAGAIN); + + if (CHECK(len < 0, "read", "read failed: %s\n", + strerror(errno))) + goto free_buf; + } + + if (CHECK(total_read_len != expected_read_len, "read", + "total len %u, expected len %u\n", total_read_len, + expected_read_len)) + goto free_buf; + + if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed", + "expected 1 actual %d\n", skel->bss->map1_accessed)) + goto free_buf; + + if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed", + "expected 2 actual %d\n", skel->bss->map2_accessed)) + goto free_buf; + + CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2, + "map2_seqnum", "two different seqnum %lld %lld\n", + skel->bss->map2_seqnum1, skel->bss->map2_seqnum2); + +free_buf: + free(buf); +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +free_map2: + close(map2_fd); +free_map1: + close(map1_fd); +out: + bpf_iter_test_kern4__destroy(skel); +} + +void test_bpf_iter(void) +{ + if (test__start_subtest("btf_id_or_null")) + test_btf_id_or_null(); + if (test__start_subtest("ipv6_route")) + test_ipv6_route(); + if (test__start_subtest("netlink")) + test_netlink(); + if (test__start_subtest("bpf_map")) + test_bpf_map(); + if (test__start_subtest("task")) + test_task(); + if (test__start_subtest("task_file")) + test_task_file(); + if (test__start_subtest("anon")) + test_anon_iter(false); + if (test__start_subtest("anon-read-one-char")) + test_anon_iter(true); + if (test__start_subtest("file")) + test_file_iter(); + 
if (test__start_subtest("overflow")) + test_overflow(false, false); + if (test__start_subtest("overflow-e2big")) + test_overflow(true, false); + if (test__start_subtest("prog-ret-1")) + test_overflow(false, true); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c new file mode 100644 index 000000000000..c71a7c283108 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define START_CHAR 'a' +#include "bpf_iter_test_kern_common.h" diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c new file mode 100644 index 000000000000..8bdc8dc07444 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define START_CHAR 'A' +#include "bpf_iter_test_kern_common.h" diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c new file mode 100644 index 000000000000..636a00fa074d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + int tgid; + + tgid = task->tgid; + bpf_seq_write(seq, &tgid, sizeof(tgid)); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c new file mode 100644 index 000000000000..b18dc0471d07 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: 
GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +__u32 map1_id = 0, map2_id = 0; +__u32 map1_accessed = 0, map2_accessed = 0; +__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; + +static volatile const __u32 print_len; +static volatile const __u32 ret1; + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct bpf_map *map = ctx->map; + __u64 seq_num; + int i, ret = 0; + + if (map == (void *)0) + return 0; + + /* only dump map1_id and map2_id */ + if (map->id != map1_id && map->id != map2_id) + return 0; + + seq_num = ctx->meta->seq_num; + if (map->id == map1_id) { + map1_seqnum = seq_num; + map1_accessed++; + } + + if (map->id == map2_id) { + if (map2_accessed == 0) { + map2_seqnum1 = seq_num; + if (ret1) + ret = 1; + } else { + map2_seqnum2 = seq_num; + } + map2_accessed++; + } + + /* fill seq_file buffer */ + for (i = 0; i < print_len; i++) + bpf_seq_write(seq, &seq_num, sizeof(seq_num)); + + return ret; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h new file mode 100644 index 000000000000..bdd51cf14b54 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; +int count = 0; + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + char c; + + if (count < 4) { + c = START_CHAR + count; + bpf_seq_write(seq, &c, sizeof(c)); + count++; + } + + return 0; +} -- cgit v1.2.3 From b4563facdcae55c83039d5efcc3b45a63da14d2f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 4 May 2020 10:36:26 -0700 Subject: bpf, runqslower: include proper uapi/bpf.h runqslower doesn't specify include path for 
uapi/bpf.h. This causes the following warning: In file included from runqslower.c:10: .../tools/testing/selftests/bpf/tools/include/bpf/bpf.h:234:38: warning: 'enum bpf_stats_type' declared inside parameter list will not be visible outside of this definition or declaration 234 | LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); Fix this by adding -I tools/include/uapi to the Makefile. Reported-by: Alexei Starovoitov Signed-off-by: Song Liu Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/bpf/runqslower/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 8a6f82e56a24..fb1337d69868 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -8,7 +8,8 @@ BPFTOOL ?= $(DEFAULT_BPFTOOL) LIBBPF_SRC := $(abspath ../../lib/bpf) BPFOBJ := $(OUTPUT)/libbpf.a BPF_INCLUDE := $(OUTPUT) -INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) +INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) \ + -I$(abspath ../../include/uapi) CFLAGS := -g -Wall # Try to detect best kernel BTF source -- cgit v1.2.3 From 385bbf7b119a4feb6d6bcf3586f1bb1dd9c5b0a0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:50:57 -0500 Subject: bpf, libbpf: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. 
Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200507185057.GA13981@embeddedor --- kernel/bpf/queue_stack_maps.c | 2 +- tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/libbpf_internal.h | 2 +- tools/testing/selftests/bpf/progs/core_reloc_types.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..30e1373fd437 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -19,7 +19,7 @@ struct bpf_queue_stack { u32 head, tail; u32 size; /* max_entries + 1 */ - char elements[0] __aligned(8); + char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6c2f46908f4d..3da66540b54b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8352,7 +8352,7 @@ error: struct perf_sample_raw { struct perf_event_header header; uint32_t size; - char data[0]; + char data[]; }; 
struct perf_sample_lost { diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 8c3afbd97747..50d70e90d5f1 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -153,7 +153,7 @@ struct btf_ext_info_sec { __u32 sec_name_off; __u32 num_info; /* Followed by num_info * record_size number of bytes */ - __u8 data[0]; + __u8 data[]; }; /* The minimum bpf_func_info checked by the loader */ diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 6d598cfbdb3e..34d84717c946 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -379,7 +379,7 @@ struct core_reloc_arrays___equiv_zero_sz_arr { struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; /* equivalent to flexible array */ - struct core_reloc_arrays_substruct f[0][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___fixed_arr { -- cgit v1.2.3 From 6e7e034e88e8e22cb14765c86da92416017e45b8 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 11 May 2020 17:15:33 +0100 Subject: tools, bpftool: Poison and replace kernel integer typedefs Replace the use of kernel-only integer typedefs (u8, u32, etc.) by their user space counterpart (__u8, __u32, etc.). Similarly to what libbpf does, poison the typedefs to avoid introducing them again in the future. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200511161536.29853-2-quentin@isovalent.com --- tools/bpf/bpftool/btf_dumper.c | 4 ++-- tools/bpf/bpftool/cfg.c | 4 ++-- tools/bpf/bpftool/main.h | 3 +++ tools/bpf/bpftool/map_perf_ring.c | 2 +- tools/bpf/bpftool/prog.c | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 497807bec675..ede162f83eea 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -271,8 +271,8 @@ static void btf_int128_print(json_writer_t *jw, const void *data, } } -static void btf_int128_shift(__u64 *print_num, u16 left_shift_bits, - u16 right_shift_bits) +static void btf_int128_shift(__u64 *print_num, __u16 left_shift_bits, + __u16 right_shift_bits) { __u64 upper_num, lower_num; diff --git a/tools/bpf/bpftool/cfg.c b/tools/bpf/bpftool/cfg.c index 3e21f994f262..1951219a9af7 100644 --- a/tools/bpf/bpftool/cfg.c +++ b/tools/bpf/bpftool/cfg.c @@ -157,7 +157,7 @@ static bool cfg_partition_funcs(struct cfg *cfg, struct bpf_insn *cur, return false; } -static bool is_jmp_insn(u8 code) +static bool is_jmp_insn(__u8 code) { return BPF_CLASS(code) == BPF_JMP || BPF_CLASS(code) == BPF_JMP32; } @@ -176,7 +176,7 @@ static bool func_partition_bb_head(struct func_node *func) for (; cur <= end; cur++) { if (is_jmp_insn(cur->code)) { - u8 opcode = BPF_OP(cur->code); + __u8 opcode = BPF_OP(cur->code); if (opcode == BPF_EXIT || opcode == BPF_CALL) continue; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index a41cefabccaf..f89ac70ef973 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -18,6 +18,9 @@ #include "json_writer.h" +/* Make sure we do not use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) #define NEXT_ARG() ({ argc--; argv++; if (argc < 0) 
usage(); }) diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c index d9b29c17fbb8..825f29f93a57 100644 --- a/tools/bpf/bpftool/map_perf_ring.c +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -39,7 +39,7 @@ struct event_ring_info { struct perf_event_sample { struct perf_event_header header; - u64 time; + __u64 time; __u32 size; unsigned char data[]; }; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f6a5974a7b0a..b6e5ba568f98 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -238,7 +238,7 @@ exit_free: return fd; } -static void show_prog_maps(int fd, u32 num_maps) +static void show_prog_maps(int fd, __u32 num_maps) { struct bpf_prog_info info = {}; __u32 len = sizeof(info); -- cgit v1.2.3 From c8caa0bb4b383a86a77f4c8727a4f7c7f9825260 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 11 May 2020 17:15:34 +0100 Subject: tools, bpftool: Minor fixes for documentation Bring minor improvements to bpftool documentation. Fix or harmonise formatting, update map types (including in interactive help), improve description for "map create", fix a build warning due to a missing line after the double-colon for the "bpftool prog profile" example, complete/harmonise/sort the list of related bpftool man pages in footers. v2: - Remove (instead of changing) mark-up on "value" in bpftool-map.rst, when it does not refer to something passed on the command line. - Fix an additional typo ("hexadeximal") in the same file. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200511161536.29853-3-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 11 +++++-- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 12 ++++--- .../bpf/bpftool/Documentation/bpftool-feature.rst | 12 ++++--- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 21 ++++++------ tools/bpf/bpftool/Documentation/bpftool-iter.rst | 12 +++---- tools/bpf/bpftool/Documentation/bpftool-link.rst | 9 ++++-- tools/bpf/bpftool/Documentation/bpftool-map.rst | 37 ++++++++++++++-------- tools/bpf/bpftool/Documentation/bpftool-net.rst | 12 ++++--- tools/bpf/bpftool/Documentation/bpftool-perf.rst | 12 ++++--- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 23 ++++++++------ .../bpftool/Documentation/bpftool-struct_ops.rst | 11 ++++--- tools/bpf/bpftool/Documentation/bpftool.rst | 11 ++++--- tools/bpf/bpftool/map.c | 3 +- 13 files changed, 116 insertions(+), 70 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 39615f8e145b..ce3a724f50c1 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -230,9 +230,14 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-map**\ (8), - **bpftool-prog**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 06a28b07787d..e4d9da654e84 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -20,7 +20,7 @@ 
SYNOPSIS CGROUP COMMANDS =============== -| **bpftool** **cgroup { show | list }** *CGROUP* [**effective**] +| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] | **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] | **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] | **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* @@ -160,9 +160,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-prog**\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 1fa755f55e0c..8609f06e71de 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -28,7 +28,7 @@ DESCRIPTION =========== **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] Probe the running kernel and dump a number of eBPF-related - parameters, such as availability of the **bpf()** system call, + parameters, such as availability of the **bpf**\ () system call, JIT status, eBPF program types availability, eBPF helper functions availability, and more. 
@@ -93,9 +93,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-prog**\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 94d91322895a..df85dbd962c0 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,7 +14,7 @@ SYNOPSIS *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } - *COMMAND* := { **skeleton | **help** } + *COMMAND* := { **skeleton** | **help** } GEN COMMANDS ============= @@ -36,12 +36,12 @@ DESCRIPTION etc. Skeleton eliminates the need to lookup mentioned components by name. Instead, if skeleton instantiation succeeds, they are populated in skeleton structure as valid - libbpf types (e.g., struct bpf_map pointer) and can be + libbpf types (e.g., **struct bpf_map** pointer) and can be passed to existing generic libbpf APIs. In addition to simple and reliable access to maps and - programs, skeleton provides a storage for BPF links (struct - bpf_link) for each BPF program within BPF object. When + programs, skeleton provides a storage for BPF links (**struct + bpf_link**) for each BPF program within BPF object. When requested, supported BPF programs will be automatically attached and resulting BPF links stored for further use by user in pre-allocated fields in skeleton struct. For BPF @@ -82,14 +82,14 @@ DESCRIPTION - **example__open** and **example__open_opts**. These functions are used to instantiate skeleton. It - corresponds to libbpf's **bpf_object__open()** API. + corresponds to libbpf's **bpf_object__open**\ () API. 
**_opts** variants accepts extra **bpf_object_open_opts** options. - **example__load**. This function creates maps, loads and verifies BPF programs, initializes global data maps. It corresponds to - libppf's **bpf_object__load** API. + libppf's **bpf_object__load**\ () API. - **example__open_and_load** combines **example__open** and **example__load** invocations in one commonly used @@ -296,10 +296,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-map**\ (8), - **bpftool-prog**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 13b173d93890..8dce698eab79 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -22,7 +22,6 @@ ITER COMMANDS | | *OBJ* := /a/file/of/bpf_iter_target.o - DESCRIPTION =========== **bpftool iter pin** *OBJ* *PATH* @@ -65,19 +64,18 @@ EXAMPLES Create a file-based bpf iterator from bpf_iter_netlink.o and pin it to /sys/fs/bpf/my_netlink - SEE ALSO ======== **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-prog**\ (8), - **bpftool-map**\ (8), - **bpftool-link**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) - **bpftool-gen**\ (8) + **bpftool-prog**\ (8), **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index ee6500d6e6e4..0e43d7b06c11 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ 
b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -109,10 +109,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-prog\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index cdeae8ae90ba..31101643e57c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -21,7 +21,7 @@ SYNOPSIS MAP COMMANDS ============= -| **bpftool** **map { show | list }** [*MAP*] +| **bpftool** **map** { **show** | **list** } [*MAP*] | **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ | **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] | **bpftool** **map dump** *MAP* @@ -49,7 +49,7 @@ MAP COMMANDS | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** } +| | **queue** | **stack** | **sk_storage** | **struct_ops** } DESCRIPTION =========== @@ -66,6 +66,13 @@ DESCRIPTION Create a new map with given parameters and pin it to *bpffs* as *FILE*. + *FLAGS* should be an integer which is the combination of + desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h + UAPI header for existing flags). + + Keyword **dev** expects a network interface name, and is used + to request hardware offload for the map. + **bpftool map dump** *MAP* Dump all entries in a given *MAP*. In case of **name**, *MAP* may match several maps which will all be dumped. 
@@ -78,7 +85,7 @@ DESCRIPTION exists; **noexist** update only if entry doesn't exist. If the **hex** keyword is provided in front of the bytes - sequence, the bytes are parsed as hexadeximal values, even if + sequence, the bytes are parsed as hexadecimal values, even if no "0x" prefix is added. If the keyword is not provided, then the bytes are parsed as decimal values, unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is provided. @@ -100,10 +107,10 @@ DESCRIPTION extensions of *bpffs*. **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] - Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. Install perf rings into a perf event array map and dump - output of any bpf_perf_event_output() call in the kernel. + output of any **bpf_perf_event_output**\ () call in the kernel. By default read the number of CPUs on the system and install perf ring for each CPU in the corresponding index in the array. @@ -116,24 +123,24 @@ DESCRIPTION receiving events if it installed its rings earlier. **bpftool map peek** *MAP* - Peek next **value** in the queue or stack. + Peek next value in the queue or stack. **bpftool map push** *MAP* **value** *VALUE* - Push **value** onto the stack. + Push *VALUE* onto the stack. **bpftool map pop** *MAP* - Pop and print **value** from the stack. + Pop and print value from the stack. **bpftool map enqueue** *MAP* **value** *VALUE* - Enqueue **value** into the queue. + Enqueue *VALUE* into the queue. **bpftool map dequeue** *MAP* - Dequeue and print **value** from the queue. + Dequeue and print value from the queue. **bpftool map freeze** *MAP* Freeze the map as read-only from user space. Entries from a frozen map can not longer be updated or deleted with the - **bpf\ ()** system call. This operation is not reversible, + **bpf**\ () system call. This operation is not reversible, and the map remains immutable from user space until its destruction. 
However, read and write permissions for BPF programs to the map remain unchanged. @@ -269,9 +276,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-prog**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 8651b00b81ea..aa7450736179 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -20,7 +20,7 @@ SYNOPSIS NET COMMANDS ============ -| **bpftool** **net { show | list }** [ **dev** *NAME* ] +| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] | **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* | **bpftool** **net help** @@ -194,9 +194,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-prog**\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index e252bd0bc434..9c592b7c6775 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -20,7 +20,7 @@ SYNOPSIS PERF COMMANDS ============= -| **bpftool** **perf { show | list }** +| **bpftool** **perf** { **show** | **list** } | **bpftool** **perf help** DESCRIPTION @@ -85,9 +85,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ 
(8), - **bpftool-prog**\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), - **bpftool-btf**\ (8) + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 9f19404f470e..5948e9d89c8d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -21,11 +21,11 @@ SYNOPSIS PROG COMMANDS ============= -| **bpftool** **prog { show | list }** [*PROG*] +| **bpftool** **prog** { **show** | **list** } [*PROG*] | **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual** | **linum**}] | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes** | **linum**}] | **bpftool** **prog pin** *PROG* *FILE* -| **bpftool** **prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*] +| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*] | **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog tracelog** @@ -49,7 +49,7 @@ PROG COMMANDS | *ATTACH_TYPE* := { | **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } -| *METRIC* := { +| *METRICs* := { | **cycles** | **instructions** | **l1d_loads** | **llc_misses** | } @@ -155,7 +155,7 @@ DESCRIPTION **bpftool prog tracelog** Dump the trace pipe of the system to the console (stdout). Hit <Ctrl+C> to stop printing. BPF programs can write to this - trace pipe at runtime with the **bpf_trace_printk()** helper.
+ trace pipe at runtime with the **bpf_trace_printk**\ () helper. This should be used only for debugging purposes. For streaming data from BPF programs to user space, one can use perf events (see also **bpftool-map**\ (8)). @@ -195,9 +195,9 @@ DESCRIPTION **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* Profile *METRICs* for bpf program *PROG* for *DURATION* - seconds or until user hits Ctrl-C. *DURATION* is optional. + seconds or until user hits <Ctrl+C>. *DURATION* is optional. If *DURATION* is not specified, the profiling will run up to - UINT_MAX seconds. + **UINT_MAX** seconds. **bpftool prog help** Print short help message. @@ -267,7 +267,7 @@ EXAMPLES | | **# bpftool prog dump xlated id 10 file /tmp/t** -| **# ls -l /tmp/t** +| **$ ls -l /tmp/t** :: @@ -325,6 +325,7 @@ EXAMPLES | **# bpftool prog profile id 337 duration 10 cycles instructions llc_misses** :: + 51397 run_cnt 40176203 cycles (83.05%) 42518139 instructions # 1.06 insns per cycle (83.39%) @@ -335,9 +336,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8) + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index f045cc89dd6d..d93cd1cb8b0f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -105,12 +105,13 @@ SEE ALSO **bpf**\ (2), **bpf-helpers**\ (7), **bpftool**\ (8), - **bpftool-prog**\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\
(8), - **bpftool-btf**\ (8) - **bpftool-gen**\ (8) - + **bpftool-prog**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 34239fda69ed..420d4d5df8b6 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -75,11 +75,14 @@ SEE ALSO ======== **bpf**\ (2), **bpf-helpers**\ (7), - **bpftool-prog**\ (8), - **bpftool-map**\ (8), + **bpftool-btf**\ (8), **bpftool-cgroup**\ (8), **bpftool-feature**\ (8), + **bpftool-gen**\ (8), + **bpftool-iter**\ (8), + **bpftool-link**\ (8), + **bpftool-map**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8), - **bpftool-btf**\ (8), - **bpftool-gen**\ (8), + **bpftool-prog**\ (8), + **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 693a632f6813..85cbe9a19170 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1589,7 +1589,8 @@ static int do_help(int argc, char **argv) " percpu_array | stack_trace | cgroup_array | lru_hash |\n" " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" - " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n" + " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" + " queue | stack | sk_storage | struct_ops }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], -- cgit v1.2.3 From ff20460e94af5d11ebffd9d97c1eaa00e520ecbe Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 11 May 2020 17:15:36 +0100 Subject: tools, bpf: Synchronise BPF UAPI header with tools Synchronise the bpf.h header under tools, to report the fixes recently brought to the documentation for the BPF helpers. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200511161536.29853-5-quentin@isovalent.com --- tools/include/uapi/linux/bpf.h | 109 ++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 50 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9d1932e23cec..bfb31c1be219 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -675,8 +675,8 @@ union bpf_attr { * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. * - * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() - * instead. + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. * Return * 0 on success, or a negative error in case of failure. * @@ -684,7 +684,7 @@ union bpf_attr { * Description * Return the time elapsed since system boot, in nanoseconds. * Does not include time the system was suspended. - * See: clock_gettime(CLOCK_MONOTONIC) + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) * Return * Current *ktime*. * @@ -1543,11 +1543,11 @@ union bpf_attr { * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address - * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for * more details. * - * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() - * instead. + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. 
On error, a negative @@ -1575,7 +1575,7 @@ union bpf_attr { * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description - * Equivalent to bpf_get_socket_cookie() helper that accepts + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. @@ -1604,6 +1604,7 @@ union bpf_attr { * The option value of length *optlen* is pointed by *optval*. * * *bpf_socket* should be one of the following: + * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. @@ -1672,12 +1673,12 @@ union bpf_attr { * * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be - * one of the XDP program return codes up to XDP_TX, as chosen by - * the caller. Any higher bits in the *flags* argument must be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. Any higher bits in the *flags* argument must be * unset. * - * See also bpf_redirect(), which only supports redirecting to an - * ifindex, but doesn't require a map to do so. + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. * Return * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. @@ -1785,7 +1786,7 @@ union bpf_attr { * the time running for event since last normalization. The * enabled and running times are accumulated since the perf event * open. To achieve scaling factor between two invocations of an - * eBPF program, users can can use CPU id as the key (which is + * eBPF program, users can use CPU id as the key (which is * typical for perf array usage model) to remember the previous * value and do the calculation inside the eBPF program. 
* Return @@ -1812,6 +1813,7 @@ union bpf_attr { * *opval* and of length *optlen*. * * *bpf_socket* should be one of the following: + * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. @@ -1833,7 +1835,7 @@ union bpf_attr { * The first argument is the context *regs* on which the kprobe * works. * - * This helper works by setting setting the PC (program counter) + * This helper works by setting the PC (program counter) * to an override function which is run in place of the original * probed function. This means the probed function is not run at * all. The replacement function just returns with the required @@ -2300,7 +2302,7 @@ union bpf_attr { * **bpf_rc_keydown**\ () again with the same values, or calling * **bpf_rc_repeat**\ (). * - * Some protocols include a toggle bit, in case the button was + * Some protocols include a toggle bit, in case the button was * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into @@ -2646,7 +2648,6 @@ union bpf_attr { * * *th* points to the start of the TCP header, while *th_len* * contains **sizeof**\ (**struct tcphdr**). - * * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. @@ -2829,7 +2830,6 @@ union bpf_attr { * * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header. - * * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, @@ -2912,7 +2912,7 @@ union bpf_attr { * // size, after checking its boundaries. * } * - * In comparison, using **bpf_probe_read_user()** helper here + * In comparison, using **bpf_probe_read_user**\ () helper here * instead to read the string would require to estimate the length * at compile time, and would often result in copying more memory * than necessary. 
@@ -2930,14 +2930,14 @@ union bpf_attr { * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* - * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. * Return - * On success, the strictly positive length of the string, including + * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description - * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. @@ -2965,19 +2965,19 @@ union bpf_attr { * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the - * branch records (struct perf_branch_entry) associated to *ctx* - * and store it in the buffer pointed by *buf* up to size + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size * *size* bytes. * Return * On success, number of bytes written to *buf*. On error, a * negative value. * * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to - * instead return the number of bytes required to store all the + * instead return the number of bytes required to store all the * branch entries. If this flag is set, *buf* may be NULL. * * **-EINVAL** if arguments invalid or **size** not a multiple - * of sizeof(struct perf_branch_entry). + * of **sizeof**\ (**struct perf_branch_entry**\ ). * * **-ENOENT** if architecture does not support branch records. 
* @@ -2985,8 +2985,8 @@ union bpf_attr { * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. - * - * On failure, the returned value is one of the following: + * Return + * 0 on success, or one of the following in case of failure: * * **-EINVAL** if dev and inum supplied don't match dev_t and inode number * with nsfs of current task, or if dev conversion to dev_t lost high bits. @@ -3025,8 +3025,8 @@ union bpf_attr { * a global identifier that can be assumed unique. If *ctx* is * NULL, then the helper returns the cookie for the initial * network namespace. The cookie itself is very similar to that - * of bpf_get_socket_cookie() helper, but for network namespaces - * instead of sockets. + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. * Return * A 8-byte long opaque number. * @@ -3061,57 +3061,66 @@ union bpf_attr { * * The *flags* argument must be zero. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: * - * * **-EINVAL** Unsupported flags specified. - * * **-ENOENT** Socket is unavailable for assignment. - * * **-ENETUNREACH** Socket is unreachable (wrong netns). - * * **-EOPNOTSUPP** Unsupported operation, for example a - * call from outside of TC ingress. - * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. * Does include the time the system was suspended. 
- * See: clock_gettime(CLOCK_BOOTTIME) + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) * Return * Current *ktime*. * * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description - * seq_printf uses seq_file seq_printf() to print out the format string. + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. * The *m* represents the seq_file. The *fmt* and *fmt_size* are for * the format string itself. The *data* and *data_len* are format string - * arguments. The *data* are a u64 array and corresponding format string + * arguments. The *data* are a **u64** array and corresponding format string * values are stored in the array. For strings and pointers where pointees * are accessed, only the pointer values are stored in the *data* array. - * The *data_len* is the *data* size in term of bytes. + * The *data_len* is the size of *data* in bytes. * * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. * Reading kernel memory may fail due to either invalid address or * valid address but requiring a major memory fault. If reading kernel memory * fails, the string for **%s** will be an empty string, and the ip * address for **%p{i,I}{4,6}** will be 0. Not returning error to - * bpf program is consistent with what bpf_trace_printk() does for now. + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. * - * * **-EBUSY** Percpu memory copy buffer is busy, can try again - * by returning 1 from bpf program. 
- * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. - * * **-E2BIG** Too many format specifiers. - * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description - * seq_write uses seq_file seq_write() to write the data. + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the - * data to write in bytes. + * data to write in bytes. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: * - * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From 309b81f0fdc4209d998bc63f0da52c2e96340d4e Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Wed, 13 May 2020 05:17:22 +0300 Subject: selftests/bpf: Install generated test progs Before commit 74b5a5968fe8 ("selftests/bpf: Replace test_progs and test_maps w/ general rule") selftests/bpf used generic install target from selftests/lib.mk to install generated bpf test progs by mentioning them in TEST_GEN_FILES variable. Take that functionality back. 
Fixes: 74b5a5968fe8 ("selftests/bpf: Replace test_progs and test_maps w/ general rule") Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513021722.7787-1-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8f25966b500b..1f878dcd2bf6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -265,6 +265,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(filter-out $(SKEL_BLACKLIST), \ $$(TRUNNER_BPF_SRCS))) +TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined $$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2)) -- cgit v1.2.3 From fd9eef1a132d1974405c3ebf9d5688ec5c51da94 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 12 May 2020 11:04:40 +0200 Subject: libbpf: Fix probe code to return EPERM if encountered When the probe code was failing for any reason ENOTSUP was returned, even if this was due to not having enough lock space. This patch fixes this by returning EPERM to the user application, so it can respond and increase the RLIMIT_MEMLOCK size. 
Signed-off-by: Eelco Chaudron Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/158927424896.2342.10402475603585742943.stgit@ebuild --- tools/lib/bpf/libbpf.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3da66540b54b..fd882616ab52 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3237,7 +3237,7 @@ int bpf_map__resize(struct bpf_map *map, __u32 max_entries) } static int -bpf_object__probe_name(struct bpf_object *obj) +bpf_object__probe_loading(struct bpf_object *obj) { struct bpf_load_program_attr attr; char *cp, errmsg[STRERR_BUFSIZE]; @@ -3257,15 +3257,36 @@ bpf_object__probe_name(struct bpf_object *obj) ret = bpf_load_program_xattr(&attr, NULL, 0); if (ret < 0) { - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); - pr_warn("Error in %s():%s(%d). Couldn't load basic 'r0 = 0' BPF program.\n", - __func__, cp, errno); - return -errno; + ret = errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF " + "program. 
Make sure your kernel supports BPF " + "(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is " + "set to big enough value.\n", __func__, cp, ret); + return -ret; } close(ret); - /* now try the same program, but with the name */ + return 0; +} + +static int +bpf_object__probe_name(struct bpf_object *obj) +{ + struct bpf_load_program_attr attr; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret; + + /* make sure loading with name works */ + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = insns; + attr.insns_cnt = ARRAY_SIZE(insns); + attr.license = "GPL"; attr.name = "test"; ret = bpf_load_program_xattr(&attr, NULL, 0); if (ret >= 0) { @@ -5636,7 +5657,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) obj->loaded = true; - err = bpf_object__probe_caps(obj); + err = bpf_object__probe_loading(obj); + err = err ? : bpf_object__probe_caps(obj); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); -- cgit v1.2.3 From cd49291ce18aeef3f2ec950bc99bd72d5a05fa86 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 12 May 2020 12:24:42 -0700 Subject: selftests/bpf: Extract parse_num_list into generic testing_helpers.c Add testing_helpers.c, which will contain generic helpers for test runners and tests needing some common generic functionality, like parsing a set of numbers. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200512192445.2351848-2-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/test_progs.c | 67 ++------------------------- tools/testing/selftests/bpf/test_progs.h | 1 + tools/testing/selftests/bpf/testing_helpers.c | 66 ++++++++++++++++++++++++++ tools/testing/selftests/bpf/testing_helpers.h | 5 ++ 5 files changed, 78 insertions(+), 64 deletions(-) create mode 100644 tools/testing/selftests/bpf/testing_helpers.c create mode 100644 tools/testing/selftests/bpf/testing_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1f878dcd2bf6..975b97b85bca 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -355,7 +355,8 @@ endef TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ - network_helpers.c flow_dissector_load.h + network_helpers.c testing_helpers.c \ + flow_dissector_load.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 0f411fdc4f6d..54fa5fa688ce 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -438,67 +438,6 @@ err: return -ENOMEM; } -int parse_num_list(const char *s, struct test_selector *sel) -{ - int i, set_len = 0, new_len, num, start = 0, end = -1; - bool *set = NULL, *tmp, parsing_end = false; - char *next; - - while (s[0]) { - errno = 0; - num = strtol(s, &next, 10); - if (errno) - return -errno; - - if (parsing_end) - end = num; - else - start = num; - - if (!parsing_end && *next == '-') { - s = next + 1; - parsing_end = true; - continue; - } else if (*next 
== ',') { - parsing_end = false; - s = next + 1; - end = num; - } else if (*next == '\0') { - parsing_end = false; - s = next; - end = num; - } else { - return -EINVAL; - } - - if (start > end) - return -EINVAL; - - if (end + 1 > set_len) { - new_len = end + 1; - tmp = realloc(set, new_len); - if (!tmp) { - free(set); - return -ENOMEM; - } - for (i = set_len; i < start; i++) - tmp[i] = false; - set = tmp; - set_len = new_len; - } - for (i = start; i <= end; i++) - set[i] = true; - } - - if (!set) - return -EINVAL; - - sel->num_set = set; - sel->num_set_len = set_len; - - return 0; -} - extern int extra_prog_load_log_flags; static error_t parse_arg(int key, char *arg, struct argp_state *state) @@ -512,13 +451,15 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) if (subtest_str) { *subtest_str = '\0'; if (parse_num_list(subtest_str + 1, - &env->subtest_selector)) { + &env->subtest_selector.num_set, + &env->subtest_selector.num_set_len)) { fprintf(stderr, "Failed to parse subtest numbers.\n"); return -EINVAL; } } - if (parse_num_list(arg, &env->test_selector)) { + if (parse_num_list(arg, &env->test_selector.num_set, + &env->test_selector.num_set_len)) { fprintf(stderr, "Failed to parse test numbers.\n"); return -EINVAL; } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 83287c76332b..f4503c926aca 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -37,6 +37,7 @@ typedef __u16 __sum16; #include "bpf_util.h" #include #include "trace_helpers.h" +#include "testing_helpers.h" #include "flow_dissector_load.h" enum verbosity { diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c new file mode 100644 index 000000000000..0af6337a8962 --- /dev/null +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (C) 2020 
Facebook, Inc. */ +#include +#include +#include "testing_helpers.h" + +int parse_num_list(const char *s, bool **num_set, int *num_set_len) +{ + int i, set_len = 0, new_len, num, start = 0, end = -1; + bool *set = NULL, *tmp, parsing_end = false; + char *next; + + while (s[0]) { + errno = 0; + num = strtol(s, &next, 10); + if (errno) + return -errno; + + if (parsing_end) + end = num; + else + start = num; + + if (!parsing_end && *next == '-') { + s = next + 1; + parsing_end = true; + continue; + } else if (*next == ',') { + parsing_end = false; + s = next + 1; + end = num; + } else if (*next == '\0') { + parsing_end = false; + s = next; + end = num; + } else { + return -EINVAL; + } + + if (start > end) + return -EINVAL; + + if (end + 1 > set_len) { + new_len = end + 1; + tmp = realloc(set, new_len); + if (!tmp) { + free(set); + return -ENOMEM; + } + for (i = set_len; i < start; i++) + tmp[i] = false; + set = tmp; + set_len = new_len; + } + for (i = start; i <= end; i++) + set[i] = true; + } + + if (!set) + return -EINVAL; + + *num_set = set; + *num_set_len = set_len; + + return 0; +} diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h new file mode 100644 index 000000000000..923b51762759 --- /dev/null +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (C) 2020 Facebook, Inc. 
*/ +#include + +int parse_num_list(const char *s, bool **set, int *set_len); -- cgit v1.2.3 From 8e7c2a023ac04e04c72cd7b640329511dda92672 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 12 May 2020 12:24:43 -0700 Subject: selftests/bpf: Add benchmark runner infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on BPF ringbuf implementation, testing, and benchmarking, I've developed a pretty generic and modular benchmark runner, which seems to be generically useful, as I've already used it for one more purpose (testing fastest way to trigger BPF program, to minimize overhead of in-kernel code). This patch adds generic part of benchmark runner and sets up Makefile for extending it with more sets of benchmarks. Benchmarker itself operates by spinning up specified number of producer and consumer threads, setting up interval timer sending SIGALARM signal to application once a second. Every second, current snapshot with hits/drops counters are collected and stored in an array. Drops are useful for producer/consumer benchmarks in which producer might overwhelm consumers. Once test finishes after given amount of warm-up and testing seconds, mean and stddev are calculated (ignoring warm-up results) and is printed out to stdout. This setup seems to give consistent and accurate results. To validate behavior, I added two atomic counting tests: global and local. For global one, all the producer threads are atomically incrementing same counter as fast as possible. This, of course, leads to huge drop of performance once there is more than one producer thread due to CPUs fighting for the same memory location. Local counting, on the other hand, maintains one counter per each producer thread, incremented independently. Once per second, all counters are read and added together to form final "counting throughput" measurement. 
As expected, such setup demonstrates linear scalability with number of producers (as long as there are enough physical CPU cores, of course). See example output below. Also, this setup can nicely demonstrate disastrous effects of false sharing, if care is not taken to take those per-producer counters apart into independent cache lines. Demo output shows global counter first with 1 producer, then with 4. Both total and per-producer performance significantly drop. The last run is local counter with 4 producers, demonstrating near-perfect scalability. $ ./bench -a -w1 -d2 -p1 count-global Setting up benchmark 'count-global'... Benchmark 'count-global' started. Iter 0 ( 24.822us): hits 148.179M/s (148.179M/prod), drops 0.000M/s Iter 1 ( 37.939us): hits 149.308M/s (149.308M/prod), drops 0.000M/s Iter 2 (-10.774us): hits 150.717M/s (150.717M/prod), drops 0.000M/s Iter 3 ( 3.807us): hits 151.435M/s (151.435M/prod), drops 0.000M/s Summary: hits 150.488 ± 1.079M/s (150.488M/prod), drops 0.000 ± 0.000M/s $ ./bench -a -w1 -d2 -p4 count-global Setting up benchmark 'count-global'... Benchmark 'count-global' started. Iter 0 ( 60.659us): hits 53.910M/s ( 13.477M/prod), drops 0.000M/s Iter 1 (-17.658us): hits 53.722M/s ( 13.431M/prod), drops 0.000M/s Iter 2 ( 5.865us): hits 53.495M/s ( 13.374M/prod), drops 0.000M/s Iter 3 ( 0.104us): hits 53.606M/s ( 13.402M/prod), drops 0.000M/s Summary: hits 53.608 ± 0.113M/s ( 13.402M/prod), drops 0.000 ± 0.000M/s $ ./bench -a -w1 -d2 -p4 count-local Setting up benchmark 'count-local'... Benchmark 'count-local' started. Iter 0 ( 23.388us): hits 640.450M/s (160.113M/prod), drops 0.000M/s Iter 1 ( 2.291us): hits 605.661M/s (151.415M/prod), drops 0.000M/s Iter 2 ( -6.415us): hits 607.092M/s (151.773M/prod), drops 0.000M/s Iter 3 ( -1.361us): hits 601.796M/s (150.449M/prod), drops 0.000M/s Summary: hits 604.849 ± 2.739M/s (151.212M/prod), drops 0.000 ± 0.000M/s Benchmark runner supports setting thread affinity for producer and consumer threads. 
You can use -a flag for default CPU selection scheme, where first consumer gets CPU #0, next one gets CPU #1, and so on. Then producer threads pick up next CPU and increment one-by-one as well. But user can also specify a set of CPUs independently for producers and consumers with --prod-affinity 1,2-10,15 and --cons-affinity . The latter allows to force producers and consumers to share same set of CPUs, if necessary. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200512192445.2351848-3-andriin@fb.com --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 13 +- tools/testing/selftests/bpf/bench.c | 423 +++++++++++++++++++++++ tools/testing/selftests/bpf/bench.h | 81 +++++ tools/testing/selftests/bpf/benchs/bench_count.c | 91 +++++ 5 files changed, 608 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/bench.c create mode 100644 tools/testing/selftests/bpf/bench.h create mode 100644 tools/testing/selftests/bpf/benchs/bench_count.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3ff031972975..1bb204cee853 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -38,3 +38,4 @@ test_cpp /bpf_gcc /tools /runqslower +/bench diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 975b97b85bca..f414b2442181 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -77,7 +77,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp runqslower + test_lirc_mode2_user xdping test_cpp runqslower bench TEST_CUSTOM_PROGS = urandom_read @@ -407,6 +407,17 @@ 
$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ +# Benchmark runner +$(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h + $(call msg,CC,,$@) + $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ +$(OUTPUT)/bench.o: bench.h testing_helpers.h +$(OUTPUT)/bench: LDLIBS += -lm +$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ + $(OUTPUT)/bench_count.o + $(call msg,BINARY,,$@) + $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) + EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c new file mode 100644 index 000000000000..3972da8b19e8 --- /dev/null +++ b/tools/testing/selftests/bpf/bench.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bench.h" +#include "testing_helpers.h" + +struct env env = { + .warmup_sec = 1, + .duration_sec = 5, + .affinity = false, + .consumer_cnt = 1, + .producer_cnt = 1, +}; + +static int libbpf_print_fn(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static int bump_memlock_rlimit(void) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + return setrlimit(RLIMIT_MEMLOCK, &rlim_new); +} + +void setup_libbpf() +{ + int err; + + libbpf_set_print(libbpf_print_fn); + + err = bump_memlock_rlimit(); + if (err) + fprintf(stderr, "failed to increase RLIMIT_MEMLOCK: %d", err); +} + +void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns) +{ + double hits_per_sec, drops_per_sec; + double hits_per_prod; + + hits_per_sec = res->hits / 1000000.0 / 
(delta_ns / 1000000000.0); + hits_per_prod = hits_per_sec / env.producer_cnt; + drops_per_sec = res->drops / 1000000.0 / (delta_ns / 1000000000.0); + + printf("Iter %3d (%7.3lfus): ", + iter, (delta_ns - 1000000000) / 1000.0); + + printf("hits %8.3lfM/s (%7.3lfM/prod), drops %8.3lfM/s\n", + hits_per_sec, hits_per_prod, drops_per_sec); +} + +void hits_drops_report_final(struct bench_res res[], int res_cnt) +{ + int i; + double hits_mean = 0.0, drops_mean = 0.0; + double hits_stddev = 0.0, drops_stddev = 0.0; + + for (i = 0; i < res_cnt; i++) { + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); + drops_mean += res[i].drops / 1000000.0 / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * + (hits_mean - res[i].hits / 1000000.0) / + (res_cnt - 1.0); + drops_stddev += (drops_mean - res[i].drops / 1000000.0) * + (drops_mean - res[i].drops / 1000000.0) / + (res_cnt - 1.0); + } + hits_stddev = sqrt(hits_stddev); + drops_stddev = sqrt(drops_stddev); + } + printf("Summary: hits %8.3lf \u00B1 %5.3lfM/s (%7.3lfM/prod), ", + hits_mean, hits_stddev, hits_mean / env.producer_cnt); + printf("drops %8.3lf \u00B1 %5.3lfM/s\n", + drops_mean, drops_stddev); +} + +const char *argp_program_version = "benchmark"; +const char *argp_program_bug_address = ""; +const char argp_program_doc[] = +"benchmark Generic benchmarking framework.\n" +"\n" +"This tool runs benchmarks.\n" +"\n" +"USAGE: benchmark \n" +"\n" +"EXAMPLES:\n" +" # run 'count-local' benchmark with 1 producer and 1 consumer\n" +" benchmark count-local\n" +" # run 'count-local' with 16 producer and 8 consumer thread, pinned to CPUs\n" +" benchmark -p16 -c8 -a count-local\n"; + +enum { + ARG_PROD_AFFINITY_SET = 1000, + ARG_CONS_AFFINITY_SET = 1001, +}; + +static const struct argp_option opts[] = { + { "list", 'l', NULL, 0, "List available benchmarks"}, + { "duration", 'd', "SEC", 0, "Duration of benchmark, seconds"}, + { "warmup", 'w', 
"SEC", 0, "Warm-up period, seconds"}, + { "producers", 'p', "NUM", 0, "Number of producer threads"}, + { "consumers", 'c', "NUM", 0, "Number of consumer threads"}, + { "verbose", 'v', NULL, 0, "Verbose debug output"}, + { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"}, + { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, + "Set of CPUs for producer threads; implies --affinity"}, + { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, + "Set of CPUs for consumer threads; implies --affinity"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + static int pos_args; + + switch (key) { + case 'v': + env.verbose = true; + break; + case 'l': + env.list = true; + break; + case 'd': + env.duration_sec = strtol(arg, NULL, 10); + if (env.duration_sec <= 0) { + fprintf(stderr, "Invalid duration: %s\n", arg); + argp_usage(state); + } + break; + case 'w': + env.warmup_sec = strtol(arg, NULL, 10); + if (env.warmup_sec <= 0) { + fprintf(stderr, "Invalid warm-up duration: %s\n", arg); + argp_usage(state); + } + break; + case 'p': + env.producer_cnt = strtol(arg, NULL, 10); + if (env.producer_cnt <= 0) { + fprintf(stderr, "Invalid producer count: %s\n", arg); + argp_usage(state); + } + break; + case 'c': + env.consumer_cnt = strtol(arg, NULL, 10); + if (env.consumer_cnt <= 0) { + fprintf(stderr, "Invalid consumer count: %s\n", arg); + argp_usage(state); + } + break; + case 'a': + env.affinity = true; + break; + case ARG_PROD_AFFINITY_SET: + env.affinity = true; + if (parse_num_list(arg, &env.prod_cpus.cpus, + &env.prod_cpus.cpus_len)) { + fprintf(stderr, "Invalid format of CPU set for producers."); + argp_usage(state); + } + break; + case ARG_CONS_AFFINITY_SET: + env.affinity = true; + if (parse_num_list(arg, &env.cons_cpus.cpus, + &env.cons_cpus.cpus_len)) { + fprintf(stderr, "Invalid format of CPU set for consumers."); + argp_usage(state); + } + break; + case ARGP_KEY_ARG: + if (pos_args++) { + fprintf(stderr, + 
"Unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + env.bench_name = strdup(arg); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static void parse_cmdline_args(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) + exit(1); + if (!env.list && !env.bench_name) { + argp_help(&argp, stderr, ARGP_HELP_DOC, "bench"); + exit(1); + } +} + +static void collect_measurements(long delta_ns); + +static __u64 last_time_ns; +static void sigalarm_handler(int signo) +{ + long new_time_ns = get_time_ns(); + long delta_ns = new_time_ns - last_time_ns; + + collect_measurements(delta_ns); + + last_time_ns = new_time_ns; +} + +/* set up periodic 1-second timer */ +static void setup_timer() +{ + static struct sigaction sigalarm_action = { + .sa_handler = sigalarm_handler, + }; + struct itimerval timer_settings = {}; + int err; + + last_time_ns = get_time_ns(); + err = sigaction(SIGALRM, &sigalarm_action, NULL); + if (err < 0) { + fprintf(stderr, "failed to install SIGALARM handler: %d\n", -errno); + exit(1); + } + timer_settings.it_interval.tv_sec = 1; + timer_settings.it_value.tv_sec = 1; + err = setitimer(ITIMER_REAL, &timer_settings, NULL); + if (err < 0) { + fprintf(stderr, "failed to arm interval timer: %d\n", -errno); + exit(1); + } +} + +static void set_thread_affinity(pthread_t thread, int cpu) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + if (pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset)) { + fprintf(stderr, "setting affinity to CPU #%d failed: %d\n", + cpu, errno); + exit(1); + } +} + +static int next_cpu(struct cpu_set *cpu_set) +{ + if (cpu_set->cpus) { + int i; + + /* find next available CPU */ + for (i = cpu_set->next_cpu; i < cpu_set->cpus_len; i++) { + if (cpu_set->cpus[i]) { + cpu_set->next_cpu = i + 1; + return i; + } + } + fprintf(stderr, "Not enough CPUs 
specified, need CPU #%d or higher.\n", i); + exit(1); + } + + return cpu_set->next_cpu++; +} + +static struct bench_state { + int res_cnt; + struct bench_res *results; + pthread_t *consumers; + pthread_t *producers; +} state; + +const struct bench *bench = NULL; + +extern const struct bench bench_count_global; +extern const struct bench bench_count_local; + +static const struct bench *benchs[] = { + &bench_count_global, + &bench_count_local, +}; + +static void setup_benchmark() +{ + int i, err; + + if (!env.bench_name) { + fprintf(stderr, "benchmark name is not specified\n"); + exit(1); + } + + for (i = 0; i < ARRAY_SIZE(benchs); i++) { + if (strcmp(benchs[i]->name, env.bench_name) == 0) { + bench = benchs[i]; + break; + } + } + if (!bench) { + fprintf(stderr, "benchmark '%s' not found\n", env.bench_name); + exit(1); + } + + printf("Setting up benchmark '%s'...\n", bench->name); + + state.producers = calloc(env.producer_cnt, sizeof(*state.producers)); + state.consumers = calloc(env.consumer_cnt, sizeof(*state.consumers)); + state.results = calloc(env.duration_sec + env.warmup_sec + 2, + sizeof(*state.results)); + if (!state.producers || !state.consumers || !state.results) + exit(1); + + if (bench->validate) + bench->validate(); + if (bench->setup) + bench->setup(); + + for (i = 0; i < env.consumer_cnt; i++) { + err = pthread_create(&state.consumers[i], NULL, + bench->consumer_thread, (void *)(long)i); + if (err) { + fprintf(stderr, "failed to create consumer thread #%d: %d\n", + i, -errno); + exit(1); + } + if (env.affinity) + set_thread_affinity(state.consumers[i], + next_cpu(&env.cons_cpus)); + } + + /* unless explicit producer CPU list is specified, continue after + * last consumer CPU + */ + if (!env.prod_cpus.cpus) + env.prod_cpus.next_cpu = env.cons_cpus.next_cpu; + + for (i = 0; i < env.producer_cnt; i++) { + err = pthread_create(&state.producers[i], NULL, + bench->producer_thread, (void *)(long)i); + if (err) { + fprintf(stderr, "failed to create producer 
thread #%d: %d\n", + i, -errno); + exit(1); + } + if (env.affinity) + set_thread_affinity(state.producers[i], + next_cpu(&env.prod_cpus)); + } + + printf("Benchmark '%s' started.\n", bench->name); +} + +static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER; + +static void collect_measurements(long delta_ns) { + int iter = state.res_cnt++; + struct bench_res *res = &state.results[iter]; + + bench->measure(res); + + if (bench->report_progress) + bench->report_progress(iter, res, delta_ns); + + if (iter == env.duration_sec + env.warmup_sec) { + pthread_mutex_lock(&bench_done_mtx); + pthread_cond_signal(&bench_done); + pthread_mutex_unlock(&bench_done_mtx); + } +} + +int main(int argc, char **argv) +{ + parse_cmdline_args(argc, argv); + + if (env.list) { + int i; + + printf("Available benchmarks:\n"); + for (i = 0; i < ARRAY_SIZE(benchs); i++) { + printf("- %s\n", benchs[i]->name); + } + return 0; + } + + setup_benchmark(); + + setup_timer(); + + pthread_mutex_lock(&bench_done_mtx); + pthread_cond_wait(&bench_done, &bench_done_mtx); + pthread_mutex_unlock(&bench_done_mtx); + + if (bench->report_final) + /* skip first sample */ + bench->report_final(state.results + env.warmup_sec, + state.res_cnt - env.warmup_sec); + + return 0; +} + diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h new file mode 100644 index 000000000000..c1f48a473b02 --- /dev/null +++ b/tools/testing/selftests/bpf/bench.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cpu_set { + bool *cpus; + int cpus_len; + int next_cpu; +}; + +struct env { + char *bench_name; + int duration_sec; + int warmup_sec; + bool verbose; + bool list; + bool affinity; + int consumer_cnt; + int producer_cnt; + struct cpu_set prod_cpus; + struct cpu_set cons_cpus; +}; + +struct 
bench_res { + long hits; + long drops; +}; + +struct bench { + const char *name; + void (*validate)(); + void (*setup)(); + void *(*producer_thread)(void *ctx); + void *(*consumer_thread)(void *ctx); + void (*measure)(struct bench_res* res); + void (*report_progress)(int iter, struct bench_res* res, long delta_ns); + void (*report_final)(struct bench_res res[], int res_cnt); +}; + +struct counter { + long value; +} __attribute__((aligned(128))); + +extern struct env env; +extern const struct bench *bench; + +void setup_libbpf(); +void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns); +void hits_drops_report_final(struct bench_res res[], int res_cnt); + +static inline __u64 get_time_ns() { + struct timespec t; + + clock_gettime(CLOCK_MONOTONIC, &t); + + return (u64)t.tv_sec * 1000000000 + t.tv_nsec; +} + +static inline void atomic_inc(long *value) +{ + (void)__atomic_add_fetch(value, 1, __ATOMIC_RELAXED); +} + +static inline void atomic_add(long *value, long n) +{ + (void)__atomic_add_fetch(value, n, __ATOMIC_RELAXED); +} + +static inline long atomic_swap(long *value, long n) +{ + return __atomic_exchange_n(value, n, __ATOMIC_RELAXED); +} diff --git a/tools/testing/selftests/bpf/benchs/bench_count.c b/tools/testing/selftests/bpf/benchs/bench_count.c new file mode 100644 index 000000000000..befba7a82643 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_count.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bench.h" + +/* COUNT-GLOBAL benchmark */ + +static struct count_global_ctx { + struct counter hits; +} count_global_ctx; + +static void *count_global_producer(void *input) +{ + struct count_global_ctx *ctx = &count_global_ctx; + + while (true) { + atomic_inc(&ctx->hits.value); + } + return NULL; +} + +static void *count_global_consumer(void *input) +{ + return NULL; +} + +static void count_global_measure(struct bench_res *res) +{ + struct count_global_ctx *ctx = 
&count_global_ctx; + + res->hits = atomic_swap(&ctx->hits.value, 0); +} + +/* COUNT-local benchmark */ + +static struct count_local_ctx { + struct counter *hits; +} count_local_ctx; + +static void count_local_setup() +{ + struct count_local_ctx *ctx = &count_local_ctx; + + ctx->hits = calloc(env.consumer_cnt, sizeof(*ctx->hits)); + if (!ctx->hits) + exit(1); +} + +static void *count_local_producer(void *input) +{ + struct count_local_ctx *ctx = &count_local_ctx; + int idx = (long)input; + + while (true) { + atomic_inc(&ctx->hits[idx].value); + } + return NULL; +} + +static void *count_local_consumer(void *input) +{ + return NULL; +} + +static void count_local_measure(struct bench_res *res) +{ + struct count_local_ctx *ctx = &count_local_ctx; + int i; + + for (i = 0; i < env.producer_cnt; i++) { + res->hits += atomic_swap(&ctx->hits[i].value, 0); + } +} + +const struct bench bench_count_global = { + .name = "count-global", + .producer_thread = count_global_producer, + .consumer_thread = count_global_consumer, + .measure = count_global_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_count_local = { + .name = "count-local", + .setup = count_local_setup, + .producer_thread = count_local_producer, + .consumer_thread = count_local_consumer, + .measure = count_local_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; -- cgit v1.2.3 From 4eaf0b5c5e04c21a866431bd763ab4b1f24c4d16 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 12 May 2020 12:24:44 -0700 Subject: selftest/bpf: Fmod_ret prog and implement test_overhead as part of bench MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add fmod_ret BPF program to existing test_overhead selftest. Also re-implement user-space benchmarking part into benchmark runner to compare results. 
Results with ./bench are consistently somewhat lower than test_overhead's, but relative performance of various types of BPF programs stays consistent (e.g., kretprobe is noticeably slower). This slowdown seems to be coming from the fact that test_overhead is single-threaded, while benchmark always spins off at least one thread for producer. This has been confirmed by hacking multi-threaded test_overhead variant and also single-threaded bench variant. Results are below. run_bench_rename.sh script from benchs/ subdirectory was used to produce results for ./bench. Single-threaded implementations =============================== /* bench: single-threaded, atomics */ base : 4.622 ± 0.049M/s kprobe : 3.673 ± 0.052M/s kretprobe : 2.625 ± 0.052M/s rawtp : 4.369 ± 0.089M/s fentry : 4.201 ± 0.558M/s fexit : 4.309 ± 0.148M/s fmodret : 4.314 ± 0.203M/s /* selftest: single-threaded, no atomics */ task_rename base 4555K events per sec task_rename kprobe 3643K events per sec task_rename kretprobe 2506K events per sec task_rename raw_tp 4303K events per sec task_rename fentry 4307K events per sec task_rename fexit 4010K events per sec task_rename fmod_ret 3984K events per sec Multi-threaded implementations ============================== /* bench: multi-threaded w/ atomics */ base : 3.910 ± 0.023M/s kprobe : 3.048 ± 0.037M/s kretprobe : 2.300 ± 0.015M/s rawtp : 3.687 ± 0.034M/s fentry : 3.740 ± 0.087M/s fexit : 3.510 ± 0.009M/s fmodret : 3.485 ± 0.050M/s /* selftest: multi-threaded w/ atomics */ task_rename base 3872K events per sec task_rename kprobe 3068K events per sec task_rename kretprobe 2350K events per sec task_rename raw_tp 3731K events per sec task_rename fentry 3639K events per sec task_rename fexit 3558K events per sec task_rename fmod_ret 3511K events per sec /* selftest: multi-threaded, no atomics */ task_rename base 3945K events per sec task_rename kprobe 3298K events per sec task_rename kretprobe 2451K events per sec task_rename raw_tp 3718K events per sec task_rename
fentry 3782K events per sec task_rename fexit 3543K events per sec task_rename fmod_ret 3526K events per sec Note that the fact that ./bench benchmark always uses atomic increments for counting, while test_overhead doesn't, doesn't influence test results all that much. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200512192445.2351848-4-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 14 ++ tools/testing/selftests/bpf/benchs/bench_rename.c | 195 +++++++++++++++++++++ .../selftests/bpf/benchs/run_bench_rename.sh | 9 + .../selftests/bpf/prog_tests/test_overhead.c | 14 +- tools/testing/selftests/bpf/progs/test_overhead.c | 6 + 6 files changed, 240 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_rename.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_rename.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f414b2442181..1a079e91482f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -411,10 +411,12 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(call msg,CC,,$@) $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ +$(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ - $(OUTPUT)/bench_count.o + $(OUTPUT)/bench_count.o \ + $(OUTPUT)/bench_rename.o $(call msg,BINARY,,$@) $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 3972da8b19e8..c9e8b7dbaf66 100644 --- a/tools/testing/selftests/bpf/bench.c +++ 
b/tools/testing/selftests/bpf/bench.c @@ -297,10 +297,24 @@ const struct bench *bench = NULL; extern const struct bench bench_count_global; extern const struct bench bench_count_local; +extern const struct bench bench_rename_base; +extern const struct bench bench_rename_kprobe; +extern const struct bench bench_rename_kretprobe; +extern const struct bench bench_rename_rawtp; +extern const struct bench bench_rename_fentry; +extern const struct bench bench_rename_fexit; +extern const struct bench bench_rename_fmodret; static const struct bench *benchs[] = { &bench_count_global, &bench_count_local, + &bench_rename_base, + &bench_rename_kprobe, + &bench_rename_kretprobe, + &bench_rename_rawtp, + &bench_rename_fentry, + &bench_rename_fexit, + &bench_rename_fmodret, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/benchs/bench_rename.c b/tools/testing/selftests/bpf/benchs/bench_rename.c new file mode 100644 index 000000000000..e74cff40f4fe --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_rename.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include "bench.h" +#include "test_overhead.skel.h" + +/* BPF triggering benchmarks */ +static struct ctx { + struct test_overhead *skel; + struct counter hits; + int fd; +} ctx; + +static void validate() +{ + if (env.producer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); + exit(1); + } + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } +} + +static void *producer(void *input) +{ + char buf[] = "test_overhead"; + int err; + + while (true) { + err = write(ctx.fd, buf, sizeof(buf)); + if (err < 0) { + fprintf(stderr, "write failed\n"); + exit(1); + } + atomic_inc(&ctx.hits.value); + } +} + +static void measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.hits.value, 0); +} + +static void setup_ctx() +{ + setup_libbpf(); + + ctx.skel = 
test_overhead__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + ctx.fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); + if (ctx.fd < 0) { + fprintf(stderr, "failed to open /proc/self/comm: %d\n", -errno); + exit(1); + } +} + +static void attach_bpf(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void setup_base() +{ + setup_ctx(); +} + +static void setup_kprobe() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog1); +} + +static void setup_kretprobe() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog2); +} + +static void setup_rawtp() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog3); +} + +static void setup_fentry() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog4); +} + +static void setup_fexit() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog5); +} + +static void setup_fmodret() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog6); +} + +static void *consumer(void *input) +{ + return NULL; +} + +const struct bench bench_rename_base = { + .name = "rename-base", + .validate = validate, + .setup = setup_base, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_kprobe = { + .name = "rename-kprobe", + .validate = validate, + .setup = setup_kprobe, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_kretprobe = { + .name = "rename-kretprobe", + .validate = validate, + .setup = setup_kretprobe, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final 
= hits_drops_report_final, +}; + +const struct bench bench_rename_rawtp = { + .name = "rename-rawtp", + .validate = validate, + .setup = setup_rawtp, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fentry = { + .name = "rename-fentry", + .validate = validate, + .setup = setup_fentry, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fexit = { + .name = "rename-fexit", + .validate = validate, + .setup = setup_fexit, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fmodret = { + .name = "rename-fmodret", + .validate = validate, + .setup = setup_fmodret, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_rename.sh b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh new file mode 100755 index 000000000000..16f774b1cdbe --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -eufo pipefail + +for i in base kprobe kretprobe rawtp fentry fexit fmodret +do + summary=$(sudo ./bench -w2 -d5 -a rename-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-10s: %s\n" $i "$summary" +done diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c index 465b371a561d..2702df2b2343 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c +++ 
b/tools/testing/selftests/bpf/prog_tests/test_overhead.c @@ -61,9 +61,10 @@ void test_test_overhead(void) const char *raw_tp_name = "raw_tp/task_rename"; const char *fentry_name = "fentry/__set_task_comm"; const char *fexit_name = "fexit/__set_task_comm"; + const char *fmodret_name = "fmod_ret/__set_task_comm"; const char *kprobe_func = "__set_task_comm"; struct bpf_program *kprobe_prog, *kretprobe_prog, *raw_tp_prog; - struct bpf_program *fentry_prog, *fexit_prog; + struct bpf_program *fentry_prog, *fexit_prog, *fmodret_prog; struct bpf_object *obj; struct bpf_link *link; int err, duration = 0; @@ -96,6 +97,10 @@ void test_test_overhead(void) if (CHECK(!fexit_prog, "find_probe", "prog '%s' not found\n", fexit_name)) goto cleanup; + fmodret_prog = bpf_object__find_program_by_title(obj, fmodret_name); + if (CHECK(!fmodret_prog, "find_probe", + "prog '%s' not found\n", fmodret_name)) + goto cleanup; err = bpf_object__load(obj); if (CHECK(err, "obj_load", "err %d\n", err)) @@ -142,6 +147,13 @@ void test_test_overhead(void) goto cleanup; test_run("fexit"); bpf_link__destroy(link); + + /* attach fmod_ret */ + link = bpf_program__attach_trace(fmodret_prog); + if (CHECK(IS_ERR(link), "attach fmod_ret", "err %ld\n", PTR_ERR(link))) + goto cleanup; + test_run("fmod_ret"); + bpf_link__destroy(link); cleanup: prctl(PR_SET_NAME, comm, 0L, 0L, 0L); bpf_object__close(obj); diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index 56a50b25cd33..450bf819beac 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -39,4 +39,10 @@ int BPF_PROG(prog5, struct task_struct *tsk, const char *buf, bool exec) return !tsk; } +SEC("fmod_ret/__set_task_comm") +int BPF_PROG(prog6, struct task_struct *tsk, const char *buf, bool exec) +{ + return !tsk; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c5d420c32cb44fdd10d76f0f01bcd0b09383d0b5 Mon Sep 17 
00:00:00 2001 From: Andrii Nakryiko Date: Tue, 12 May 2020 12:24:45 -0700 Subject: selftest/bpf: Add BPF triggering benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is sometimes desirable to be able to trigger a BPF program from user-space with minimal overhead. sys_enter would seem to be a good candidate, yet in a lot of cases there will be a lot of noise from syscalls triggered by other processes on the system. So while searching for a low-overhead alternative, I've stumbled upon the getpgid() syscall, which seems to be specific enough to not suffer from accidental syscalls by other apps. This set of benchmarks compares tp, raw_tp w/ filtering by syscall ID, kprobe, fentry and fmod_ret with returning error (so that syscall would not be executed), to determine the lowest-overhead way. Here are results on my machine (using benchs/run_bench_trigger.sh script): base : 9.200 ± 0.319M/s tp : 6.690 ± 0.125M/s rawtp : 8.571 ± 0.214M/s kprobe : 6.431 ± 0.048M/s fentry : 8.955 ± 0.241M/s fmodret : 8.903 ± 0.135M/s So it seems like fmodret doesn't give much benefit for such a lightweight syscall. Raw tracepoint is pretty decent despite additional filtering logic, but it will be called for any other syscall in the system, which rules it out. Fentry, though, seems to be adding the least amount of overhead and achieves 97.3% of performance of baseline no-BPF-attached syscall. Using getpgid() seems to be preferable to set_task_comm() approach from test_overhead, as it's about 2.35x faster in a baseline performance. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200512192445.2351848-5-andriin@fb.com --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 12 ++ tools/testing/selftests/bpf/benchs/bench_trigger.c | 167 +++++++++++++++++++++ .../selftests/bpf/benchs/run_bench_trigger.sh | 9 ++ tools/testing/selftests/bpf/progs/trigger_bench.c | 47 ++++++ 5 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_trigger.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_trigger.sh create mode 100644 tools/testing/selftests/bpf/progs/trigger_bench.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1a079e91482f..e716e931d0c9 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -412,11 +412,13 @@ $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(call msg,CC,,$@) $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h +$(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ $(OUTPUT)/bench_count.o \ - $(OUTPUT)/bench_rename.o + $(OUTPUT)/bench_rename.o \ + $(OUTPUT)/bench_trigger.o $(call msg,BINARY,,$@) $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index c9e8b7dbaf66..8c0dfbfe6088 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -304,6 +304,12 @@ extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; extern const struct bench bench_rename_fmodret; +extern const struct 
bench bench_trig_base; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; +extern const struct bench bench_trig_kprobe; +extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_fmodret; static const struct bench *benchs[] = { &bench_count_global, @@ -315,6 +321,12 @@ static const struct bench *benchs[] = { &bench_rename_fentry, &bench_rename_fexit, &bench_rename_fmodret, + &bench_trig_base, + &bench_trig_tp, + &bench_trig_rawtp, + &bench_trig_kprobe, + &bench_trig_fentry, + &bench_trig_fmodret, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c new file mode 100644 index 000000000000..49c22832f216 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bench.h" +#include "trigger_bench.skel.h" + +/* BPF triggering benchmarks */ +static struct trigger_ctx { + struct trigger_bench *skel; +} ctx; + +static struct counter base_hits; + +static void trigger_validate() +{ + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } +} + +static void *trigger_base_producer(void *input) +{ + while (true) { + (void)syscall(__NR_getpgid); + atomic_inc(&base_hits.value); + } + return NULL; +} + +static void trigger_base_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&base_hits.value, 0); +} + +static void *trigger_producer(void *input) +{ + while (true) + (void)syscall(__NR_getpgid); + return NULL; +} + +static void trigger_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void setup_ctx() +{ + setup_libbpf(); + + ctx.skel = trigger_bench__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } +} + +static void attach_bpf(struct bpf_program *prog) +{ 
+ struct bpf_link *link; + + link = bpf_program__attach(prog); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void trigger_tp_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void trigger_rawtp_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); +} + +static void trigger_kprobe_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kprobe); +} + +static void trigger_fentry_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fentry); +} + +static void trigger_fmodret_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); +} + +static void *trigger_consumer(void *input) +{ + return NULL; +} + +const struct bench bench_trig_base = { + .name = "trig-base", + .validate = trigger_validate, + .producer_thread = trigger_base_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_base_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_tp = { + .name = "trig-tp", + .validate = trigger_validate, + .setup = trigger_tp_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_rawtp = { + .name = "trig-rawtp", + .validate = trigger_validate, + .setup = trigger_rawtp_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_kprobe = { + .name = "trig-kprobe", + .validate = trigger_validate, + .setup = trigger_kprobe_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + 
.report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_fentry = { + .name = "trig-fentry", + .validate = trigger_validate, + .setup = trigger_fentry_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_fmodret = { + .name = "trig-fmodret", + .validate = trigger_validate, + .setup = trigger_fmodret_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh new file mode 100755 index 000000000000..78e83f243294 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -eufo pipefail + +for i in base tp rawtp kprobe fentry fmodret +do + summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-10s: %s\n" $i "$summary" +done diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c new file mode 100644 index 000000000000..8b36b6640e7e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +long hits = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int bench_trigger_tp(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) +{ + if (id == __NR_getpgid) + __sync_add_and_fetch(&hits, 1); + return 0; +} + 
+SEC("kprobe/__x64_sys_getpgid") +int bench_trigger_kprobe(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fentry/__x64_sys_getpgid") +int bench_trigger_fentry(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fmod_ret/__x64_sys_getpgid") +int bench_trigger_fmodret(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return -22; +} -- cgit v1.2.3 From 99aaf53e2f7c4a1b152b7f300c6b07ffbc2fe192 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:15 -0700 Subject: tools/bpf: selftests : Explain bpf_iter test failures with llvm 10.0.0 Commit 6879c042e105 ("tools/bpf: selftests: Add bpf_iter selftests") added self tests for the bpf_iter feature. But two subtests, ipv6_route and netlink, need the latest llvm 10.x release branch or trunk due to a bug in the llvm BPF backend. This patch adds the file README.rst to document these two failures so people using llvm 10.0.0 can be aware of them. Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180215.2949237-1-yhs@fb.com --- tools/testing/selftests/bpf/README.rst | 43 ++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/README.rst (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst new file mode 100644 index 000000000000..0f67f1b470b0 --- /dev/null +++ b/tools/testing/selftests/bpf/README.rst @@ -0,0 +1,43 @@ +================== +BPF Selftest Notes +================== + +Additional information about selftest failures is +documented here. + +bpf_iter test failures with clang/llvm 10.0.0 +============================================= + +With clang/llvm 10.0.0, the following two bpf_iter tests failed: + * ``bpf_iter/ipv6_route`` + * ``bpf_iter/netlink`` + +The symptom for ``bpf_iter/ipv6_route`` looks like + +.. code-block:: c + + 2: (79) r8 = *(u64 *)(r1 +8) + ... 
+ 14: (bf) r2 = r8 + 15: (0f) r2 += r1 + ; BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); + 16: (7b) *(u64 *)(r8 +64) = r2 + only read is supported + +The symptom for ``bpf_iter/netlink`` looks like + +.. code-block:: c + + ; struct netlink_sock *nlk = ctx->sk; + 2: (79) r7 = *(u64 *)(r1 +8) + ... + 15: (bf) r2 = r7 + 16: (0f) r2 += r1 + ; BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); + 17: (7b) *(u64 *)(r7 +0) = r2 + only read is supported + +This is due to a llvm BPF backend bug. The fix + https://reviews.llvm.org/D78466 +has been pushed to llvm 10.x release branch and will be +available in 10.0.1. The fix is available in llvm 11.0.0 trunk. -- cgit v1.2.3 From 21aef70eade22a656297c28d5da93301915d2ac2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:16 -0700 Subject: bpf: Change btf_iter func proto prefix to "bpf_iter_" This is to be consistent with tracing and lsm programs which have prefix "bpf_trace_" and "bpf_lsm_" respectively. Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180216.2949387-1-yhs@fb.com --- include/linux/bpf.h | 6 +++--- tools/lib/bpf/libbpf.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf4b6e44f2bc..ab94dfd8826f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1131,10 +1131,10 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); -#define BPF_ITER_FUNC_PREFIX "__bpf_iter__" +#define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) 
\ - extern int __bpf_iter__ ## target(args); \ - int __init __bpf_iter__ ## target(args) { return 0; } + extern int bpf_iter_ ## target(args); \ + int __init bpf_iter_ ## target(args) { return 0; } typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fd882616ab52..292257995487 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6919,7 +6919,7 @@ invalid_prog: #define BTF_TRACE_PREFIX "btf_trace_" #define BTF_LSM_PREFIX "bpf_lsm_" -#define BTF_ITER_PREFIX "__bpf_iter__" +#define BTF_ITER_PREFIX "bpf_iter_" #define BTF_MAX_NAME_SIZE 128 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, -- cgit v1.2.3 From 0531b0357ba37464e5c0033e1b7c69bbf5ecd8fb Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 14 May 2020 09:35:52 +0300 Subject: selftests: fix flower parent qdisc Flower tests used to create ingress filter with specified parent qdisc "parent ffff:" but dump them on "ingress". With recent commit that fixed tcm_parent handling in dump those are not considered same parent anymore, which causes iproute2 tc to emit additional "parent ffff:" in first line of filter dump output. The change in output causes filter match in tests to fail. Prevent parent qdisc output when dumping filters in flower tests by always correctly specifying "ingress" parent both when creating and dumping filters. Fixes: a7df4870d79b ("net_sched: fix tcm_parent in tc filter dump") Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- tools/testing/selftests/tc-testing/tc-tests/filters/tests.json | 6 +++--- tools/testing/selftests/tc-testing/tdc_batch.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json index 8877f7b2b809..12aa4bc1f6a0 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json @@ -32,7 +32,7 @@ "setup": [ "$TC qdisc add dev $DEV2 ingress" ], - "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 parent ffff: handle 0xffffffff flower action ok", + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress handle 0xffffffff flower action ok", "expExitCode": "0", "verifyCmd": "$TC filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower.*handle 0xffffffff", @@ -77,9 +77,9 @@ }, "setup": [ "$TC qdisc add dev $DEV2 ingress", - "$TC filter add dev $DEV2 protocol ip prio 1 parent ffff: flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop" + "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop" ], - "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip prio 1 parent ffff: flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop", + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop", "expExitCode": "2", "verifyCmd": "$TC -s filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower chain 0 handle", diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py 
b/tools/testing/selftests/tc-testing/tdc_batch.py index 6a2bd2cf528e..995f66ce43eb 100755 --- a/tools/testing/selftests/tc-testing/tdc_batch.py +++ b/tools/testing/selftests/tc-testing/tdc_batch.py @@ -72,21 +72,21 @@ mac_prefix = args.mac_prefix def format_add_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter add dev {} {} protocol ip parent ffff: handle {} " + return ("filter add dev {} {} protocol ip ingress handle {} " " flower {} src_mac {} dst_mac {} action drop {}".format( device, prio, handle, skip, src_mac, dst_mac, share_action)) def format_rep_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter replace dev {} {} protocol ip parent ffff: handle {} " + return ("filter replace dev {} {} protocol ip ingress handle {} " " flower {} src_mac {} dst_mac {} action drop {}".format( device, prio, handle, skip, src_mac, dst_mac, share_action)) def format_del_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter del dev {} {} protocol ip parent ffff: handle {} " + return ("filter del dev {} {} protocol ip ingress handle {} " "flower".format(device, prio, handle)) -- cgit v1.2.3 From 5a46b062e28f57bffde767437fad3ab1d0cee2c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 May 2020 10:28:22 -0700 Subject: devlink: refactor end checks in devlink_nl_cmd_region_read_dumpit Clean up after recent fixes, move address calculations around and change the variable init, so that we can have just one start_offset == end_offset check. Make the check a little stricter to preserve the -EINVAL error if requested start offset is larger than the region itself. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/core/devlink.c | 41 +++++++++------------- .../selftests/drivers/net/netdevsim/devlink.sh | 15 ++++++++ 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/net/core/devlink.c b/net/core/devlink.c index 20f935fa29f5..7b76e5fffc10 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4215,7 +4215,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, struct nlattr **attrs, u64 start_offset, u64 end_offset, - bool dump, u64 *new_offset) { struct devlink_snapshot *snapshot; @@ -4230,9 +4229,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, if (!snapshot) return -EINVAL; - if (end_offset > region->size || dump) - end_offset = region->size; - while (curr_offset < end_offset) { u32 data_size; u8 *data; @@ -4260,13 +4256,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - u64 ret_offset, start_offset, end_offset = 0; + u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - bool dump = true; void *hdr; int err; @@ -4294,8 +4289,21 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + } + + if (end_offset > region->size) + end_offset = region->size; + /* return 0 if there is no further data to read */ - if (start_offset >= region->size) { + if (start_offset == end_offset) { err = 0; goto out_unlock; } @@ -4322,27 +4330,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct 
sk_buff *skb, goto nla_put_failure; } - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); - dump = false; - - if (start_offset == end_offset) { - err = 0; - goto nla_put_failure; - } - } - err = devlink_nl_region_read_snapshot_fill(skb, devlink, region, attrs, start_offset, - end_offset, dump, - &ret_offset); + end_offset, &ret_offset); if (err && err != -EMSGSIZE) goto nla_put_failure; diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index ad539eccddcb..de4b32fc4223 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -146,6 +146,21 @@ regions_test() check_region_snapshot_count dummy post-first-request 3 + devlink region dump $DL_HANDLE/dummy snapshot 25 >> /dev/null + check_err $? "Failed to dump snapshot with id 25" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 0 len 1 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (1 byte)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len 128 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (128 bytes)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len $((1<<32)) >> /dev/null + check_err $? "Failed to read snapshot with id 25 (oversized)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr $((1<<32)) len 128 >> /dev/null 2>&1 + check_fail $? "Bad read of snapshot with id 25 did not fail" + devlink region del $DL_HANDLE/dummy snapshot 25 check_err $? 
"Failed to delete snapshot with id 25" -- cgit v1.2.3 From 7aebfa1b3885b5aa29fcb4a596d0485ac463bbe8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:27 -0700 Subject: bpf: Support narrow loads from bpf_sock_addr.user_port bpf_sock_addr.user_port supports only 4-byte load and it leads to ugly code in BPF programs, like: volatile __u32 user_port = ctx->user_port; __u16 port = bpf_ntohs(user_port); Since otherwise clang may optimize the load to be 2-byte and it's rejected by verifier. Add support for 1- and 2-byte loads same way as it's supported for other fields in bpf_sock_addr like user_ip4, msg_src_ip4, etc. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/c1e983f4c17573032601d0b2b1f9d1274f24bc16.1589420814.git.rdna@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 15 +++++++-------- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 
* Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ diff --git a/net/core/filter.c b/net/core/filter.c index da0634979f53..1fe8c0c2d408 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7029,6 +7029,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7059,10 +7060,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7958,8 +7955,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7994,9 +7991,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. 
*/ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ -- cgit v1.2.3 From 0645f7eb6f6af78aba2bdd37ae776bd8754bc8f0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:28 -0700 Subject: selftests/bpf: Test narrow loads for bpf_sock_addr.user_port Test 1,2,4-byte loads from bpf_sock_addr.user_port in sock_addr programs. 
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/e5c734a58cca4041ab30cb5471e644246f8cdb5a.1589420814.git.rdna@fb.com --- tools/testing/selftests/bpf/test_sock_addr.c | 38 ++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 61fd95b89af8..0358814c67dc 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -677,7 +677,7 @@ static int bind4_prog_load(const struct sock_addr_test *test) uint8_t u4_addr8[4]; uint16_t u4_addr16[2]; uint32_t u4_addr32; - } ip4; + } ip4, port; struct sockaddr_in addr4_rw; if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { @@ -685,6 +685,8 @@ static int bind4_prog_load(const struct sock_addr_test *test) return -1; } + port.u4_addr32 = htons(SERV4_PORT); + if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) return -1; @@ -696,49 +698,65 @@ static int bind4_prog_load(const struct sock_addr_test *test) /* if (sk.family == AF_INET && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 24), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32), /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, type)), BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), BPF_JMP_A(1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 20), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28), /* 1st_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 18), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26), /* 2nd_byte_of_user_ip4 == expected && */ 
BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 16), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24), /* 3rd_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 14), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22), /* 4th_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 3), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20), /* 1st_half_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 10), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18), /* 2nd_half_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 8), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16), - /* whole_user_ip4 == expected) { */ + /* whole_user_ip4 == expected && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */ + BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12), + + /* 1st_byte_of_user_port == expected && */ + BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10), + + /* 1st_half_of_user_port == expected && */ + BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8), + + /* user_port == expected) { */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. 
*/ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), /* user_ip4 = addr4_rw.sin_addr */ -- cgit v1.2.3 From 5b0004d92b4511c39db0df23aa84395722f1d706 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 14 May 2020 13:15:29 +0100 Subject: selftest/bpf: Fix spelling mistake "SIGALARM" -> "SIGALRM" There is a spelling mistake in an error message, fix it. Signed-off-by: Colin Ian King Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200514121529.259668-1-colin.king@canonical.com --- tools/testing/selftests/bpf/bench.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 8c0dfbfe6088..14390689ef90 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -242,7 +242,7 @@ static void setup_timer() last_time_ns = get_time_ns(); err = sigaction(SIGALRM, &sigalarm_action, NULL); if (err < 0) { - fprintf(stderr, "failed to install SIGALARM handler: %d\n", -errno); + fprintf(stderr, "failed to install SIGALRM handler: %d\n", -errno); exit(1); } timer_settings.it_interval.tv_sec = 1; -- cgit v1.2.3 From f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:47 -0700 Subject: bpf: Introduce bpf_sk_{, ancestor_}cgroup_id helpers With having ability to lookup sockets in cgroup skb programs it becomes useful to access cgroup id of retrieved sockets so that policies can be implemented based on origin cgroup of such socket. For example, a container running in a cgroup can have cgroup skb ingress program that can lookup peer socket that is sending packets to a process inside the container and decide whether those packets should be allowed or denied based on cgroup id of the peer. 
More specifically such ingress program can implement intra-host policy "allow incoming packets only from this same container and not from any other container on same host" w/o relying on source IP addresses since quite often it can be the case that containers share same IP address on the host. Introduce two new helpers for this use-case: bpf_sk_cgroup_id() and bpf_sk_ancestor_cgroup_id(). These helpers are similar to existing bpf_skb_{,ancestor_}cgroup_id helpers with the only difference that sk is used to get cgroup id instead of skb, and share code with them. See documentation in UAPI for more details. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f5884981249ce911f63e9b57ecd5d7d19154ff39.1589486450.git.rdna@fb.com --- include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- net/core/filter.c | 60 +++++++++++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- 3 files changed, 121 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. 
+ * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index a47dc5b9dad4..5815902bb617 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); - struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgroup_id(cgrp); + return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, }; 
-BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, - ancestor_level) +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; - if (!sk || !sk_fullsock(sk)) - return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, return cgroup_id(ancestor); } +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, @@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -6159,6 +6197,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return 
&bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 383724e17ab02d8e440def7792c4e151b13ef4d4 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:48 -0700 Subject: selftests/bpf: Add connect_fd_to_fd, connect_wait net helpers Add two new network helpers. connect_fd_to_fd connects an already created client socket fd to address of server fd. Sometimes it's useful to separate client socket creation and connecting this socket to a server, e.g. if client socket has to be created in a cgroup different from that of server cgroup. Additionally connect_to_fd is now implemented using connect_fd_to_fd, both helpers don't treat EINPROGRESS as an error and let caller decide how to proceed with it. connect_wait is a helper to work with non-blocking client sockets so that if connect_to_fd or connect_fd_to_fd returned -1 with errno == EINPROGRESS, caller can wait for connect to finish or for connection timeout. The helper returns -1 on error, 0 on timeout (1sec, hard-coded), and positive number on success. 
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1403fab72300f379ca97ead4820ae43eac4414ef.1589486450.git.rdna@fb.com --- tools/testing/selftests/bpf/network_helpers.c | 74 ++++++++++++++++++++++----- tools/testing/selftests/bpf/network_helpers.h | 2 + 2 files changed, 63 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 0ff64b70b746..999a775484c1 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -4,10 +4,14 @@ #include #include #include + +#include + #include #include #include +#include "bpf_util.h" #include "network_helpers.h" #define clean_errno() (errno == 0 ? "None" : strerror(errno)) @@ -77,9 +81,7 @@ static const size_t timeo_optlen = sizeof(timeo_sec); int connect_to_fd(int family, int type, int server_fd) { - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; + int fd, save_errno; fd = socket(family, type, 0); if (fd < 0) { @@ -87,24 +89,70 @@ int connect_to_fd(int family, int type, int server_fd) return -1; } - if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, timeo_optlen)) { + if (connect_fd_to_fd(fd, server_fd) < 0 && errno != EINPROGRESS) { + save_errno = errno; + close(fd); + errno = save_errno; + return -1; + } + + return fd; +} + +int connect_fd_to_fd(int client_fd, int server_fd) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int save_errno; + + if (setsockopt(client_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, + timeo_optlen)) { log_err("Failed to set SO_RCVTIMEO"); - goto out; + return -1; } if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { log_err("Failed to get server addr"); - goto out; + return -1; } - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server with family %d", family); - goto out; + if 
(connect(client_fd, (const struct sockaddr *)&addr, len) < 0) { + if (errno != EINPROGRESS) { + save_errno = errno; + log_err("Failed to connect to server"); + errno = save_errno; + } + return -1; } - return fd; + return 0; +} + +int connect_wait(int fd) +{ + struct epoll_event ev = {}, events[2]; + int timeout_ms = 1000; + int efd, nfd; + + efd = epoll_create1(EPOLL_CLOEXEC); + if (efd < 0) { + log_err("Failed to open epoll fd"); + return -1; + } + + ev.events = EPOLLRDHUP | EPOLLOUT; + ev.data.fd = fd; + + if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) < 0) { + log_err("Failed to register fd=%d on epoll fd=%d", fd, efd); + close(efd); + return -1; + } + + nfd = epoll_wait(efd, events, ARRAY_SIZE(events), timeout_ms); + if (nfd < 0) + log_err("Failed to wait for I/O event on epoll fd=%d", efd); -out: - close(fd); - return -1; + close(efd); + return nfd; } diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index a0be7db4f67d..86914e6e7b53 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -35,5 +35,7 @@ extern struct ipv6_packet pkt_v6; int start_server(int family, int type); int connect_to_fd(int family, int type, int server_fd); +int connect_fd_to_fd(int client_fd, int server_fd); +int connect_wait(int client_fd); #endif -- cgit v1.2.3 From 68e916bc8d3211ffe0b4c418184ab1b57398200c Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:49 -0700 Subject: selftests/bpf: Test for sk helpers in cgroup skb Test bpf_sk_lookup_tcp, bpf_sk_release, bpf_sk_cgroup_id and bpf_sk_ancestor_cgroup_id helpers from cgroup skb program. The test creates a testing cgroup, starts a TCPv6 server inside the cgroup and creates two client sockets: one inside testing cgroup and one outside. Then it attaches cgroup skb program to the cgroup that checks all TCP segments coming to the server and allows only those coming from the cgroup of the server. 
If a segment comes from a peer outside of the cgroup, it'll be dropped. Finally the test checks that client from inside testing cgroup can successfully connect to the server, but client outside the cgroup fails to connect by timeout. The main goal of the test is to check newly introduced bpf_sk_{,ancestor_}cgroup_id helpers. It also checks a couple of socket lookup helpers (tcp & release), but lookup helpers were introduced much earlier and covered by other tests. Here it's mostly checked that they can be called from cgroup skb. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/171f4c5d75e8ff4fe1c4e8c1c12288b5240a4549.1589486450.git.rdna@fb.com --- .../bpf/prog_tests/cgroup_skb_sk_lookup.c | 95 +++++++++++++++++++++ .../bpf/progs/cgroup_skb_sk_lookup_kern.c | 97 ++++++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c new file mode 100644 index 000000000000..059047af7df3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include + +#include "network_helpers.h" +#include "cgroup_skb_sk_lookup_kern.skel.h" + +static void run_lookup_test(__u16 *g_serv_port, int out_sk) +{ + int serv_sk = -1, in_sk = -1, serv_in_sk = -1, err; + struct sockaddr_in6 addr = {}; + socklen_t addr_len = sizeof(addr); + __u32 duration = 0; + + serv_sk = start_server(AF_INET6, SOCK_STREAM); + if (CHECK(serv_sk < 0, "start_server", "failed to start server\n")) + return; + + err = getsockname(serv_sk, (struct sockaddr *)&addr, &addr_len); + if (CHECK(err, "getsockname", "errno %d\n", 
errno)) + goto cleanup; + + *g_serv_port = addr.sin6_port; + + /* Client outside of test cgroup should fail to connect by timeout. */ + err = connect_fd_to_fd(out_sk, serv_sk); + if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd", + "unexpected result err %d errno %d\n", err, errno)) + goto cleanup; + + err = connect_wait(out_sk); + if (CHECK(err, "connect_wait", "unexpected result %d\n", err)) + goto cleanup; + + /* Client inside test cgroup should connect just fine. */ + in_sk = connect_to_fd(AF_INET6, SOCK_STREAM, serv_sk); + if (CHECK(in_sk < 0, "connect_to_fd", "errno %d\n", errno)) + goto cleanup; + + serv_in_sk = accept(serv_sk, NULL, NULL); + if (CHECK(serv_in_sk < 0, "accept", "errno %d\n", errno)) + goto cleanup; + +cleanup: + close(serv_in_sk); + close(in_sk); + close(serv_sk); +} + +static void run_cgroup_bpf_test(const char *cg_path, int out_sk) +{ + struct cgroup_skb_sk_lookup_kern *skel; + struct bpf_link *link; + __u32 duration = 0; + int cgfd = -1; + + skel = cgroup_skb_sk_lookup_kern__open_and_load(); + if (CHECK(!skel, "skel_open_load", "open_load failed\n")) + return; + + cgfd = test__join_cgroup(cg_path); + if (CHECK(cgfd < 0, "cgroup_join", "cgroup setup failed\n")) + goto cleanup; + + link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd); + if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link))) + goto cleanup; + + run_lookup_test(&skel->bss->g_serv_port, out_sk); + + bpf_link__destroy(link); + +cleanup: + close(cgfd); + cgroup_skb_sk_lookup_kern__destroy(skel); +} + +void test_cgroup_skb_sk_lookup(void) +{ + const char *cg_path = "/foo"; + int out_sk; + + /* Create a socket before joining testing cgroup so that its cgroup id + * differs from that of testing cgroup. Moving selftests process to + * testing cgroup won't change cgroup id of an already created socket. 
+ */ + out_sk = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0); + if (CHECK_FAIL(out_sk < 0)) + return; + + run_cgroup_bpf_test(cg_path, out_sk); + + close(out_sk); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c new file mode 100644 index 000000000000..3f757e30d7a0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +__u16 g_serv_port = 0; + +static inline void set_ip(__u32 *dst, const struct in6_addr *src) +{ + dst[0] = src->in6_u.u6_addr32[0]; + dst[1] = src->in6_u.u6_addr32[1]; + dst[2] = src->in6_u.u6_addr32[2]; + dst[3] = src->in6_u.u6_addr32[3]; +} + +static inline void set_tuple(struct bpf_sock_tuple *tuple, + const struct ipv6hdr *ip6h, + const struct tcphdr *tcph) +{ + set_ip(tuple->ipv6.saddr, &ip6h->daddr); + set_ip(tuple->ipv6.daddr, &ip6h->saddr); + tuple->ipv6.sport = tcph->dest; + tuple->ipv6.dport = tcph->source; +} + +static inline int is_allowed_peer_cg(struct __sk_buff *skb, + const struct ipv6hdr *ip6h, + const struct tcphdr *tcph) +{ + __u64 cgid, acgid, peer_cgid, peer_acgid; + struct bpf_sock_tuple tuple; + size_t tuple_len = sizeof(tuple.ipv6); + struct bpf_sock *peer_sk; + + set_tuple(&tuple, ip6h, tcph); + + peer_sk = bpf_sk_lookup_tcp(skb, &tuple, tuple_len, + BPF_F_CURRENT_NETNS, 0); + if (!peer_sk) + return 0; + + cgid = bpf_skb_cgroup_id(skb); + peer_cgid = bpf_sk_cgroup_id(peer_sk); + + acgid = bpf_skb_ancestor_cgroup_id(skb, 2); + peer_acgid = bpf_sk_ancestor_cgroup_id(peer_sk, 2); + + bpf_sk_release(peer_sk); + + return cgid && cgid == peer_cgid && acgid && acgid == peer_acgid; +} + +SEC("cgroup_skb/ingress") +int ingress_lookup(struct 
__sk_buff *skb) +{ + __u32 serv_port_key = 0; + struct ipv6hdr ip6h; + struct tcphdr tcph; + + if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return 1; + + /* For SYN packets coming to listening socket skb->remote_port will be + * zero, so IPv6/TCP headers are loaded to identify remote peer + * instead. + */ + if (bpf_skb_load_bytes(skb, 0, &ip6h, sizeof(ip6h))) + return 1; + + if (ip6h.nexthdr != IPPROTO_TCP) + return 1; + + if (bpf_skb_load_bytes(skb, sizeof(ip6h), &tcph, sizeof(tcph))) + return 1; + + if (!g_serv_port) + return 0; + + if (tcph.dest != g_serv_port) + return 1; + + return is_allowed_peer_cg(skb, &ip6h, &tcph); +} -- cgit v1.2.3 From 68545fb6f2ff621de26d96a3f15868abfb6897b0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:40 +0200 Subject: selftests/bpf: Adjust BPF selftest for xdp_adjust_tail The current selftest for BPF-helper xdp_adjust_tail only shrinks the tail. Make it more clear that this is a shrink test case. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945350058.97035.17280775016196207372.stgit@firesoul --- .../selftests/bpf/prog_tests/xdp_adjust_tail.c | 9 +++++-- .../testing/selftests/bpf/progs/test_adjust_tail.c | 30 ---------------------- .../bpf/progs/test_xdp_adjust_tail_shrink.c | 30 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 32 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/test_adjust_tail.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 6c8ca1c93f9b..a76dd81dfce9 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -2,9 +2,9 @@ #include #include -void test_xdp_adjust_tail(void) +void test_xdp_adjust_tail_shrink(void) { - const char *file = 
"./test_adjust_tail.o"; + const char *file = "./test_xdp_adjust_tail_shrink.o"; struct bpf_object *obj; char buf[128]; __u32 duration, retval, size; @@ -28,3 +28,8 @@ void test_xdp_adjust_tail(void) err, errno, retval, size); bpf_object__close(obj); } + +void test_xdp_adjust_tail(void) +{ + test_xdp_adjust_tail_shrink(); +} diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_adjust_tail.c deleted file mode 100644 index b7fc85769bdc..000000000000 --- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright (c) 2018 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include -#include -#include - -int _version SEC("version") = 1; - -SEC("xdp_adjust_tail") -int _xdp_adjust_tail(struct xdp_md *xdp) -{ - void *data_end = (void *)(long)xdp->data_end; - void *data = (void *)(long)xdp->data; - int offset = 0; - - if (data_end - data == 54) - offset = 256; - else - offset = 20; - if (bpf_xdp_adjust_tail(xdp, 0 - offset)) - return XDP_DROP; - return XDP_TX; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c new file mode 100644 index 000000000000..22065a9cfb25 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include +#include +#include + +int _version SEC("version") = 1; + +SEC("xdp_adjust_tail_shrink") +int _xdp_adjust_tail_shrink(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + int offset = 0; + + if (data_end - data == 54) /* sizeof(pkt_v4) */ + offset = 256; /* shrink too much */ + else + offset = 20; + if (bpf_xdp_adjust_tail(xdp, 0 - offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 7ae2e00e8fc23f10169079fadd388317d81012be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:45 +0200 Subject: selftests/bpf: Xdp_adjust_tail add grow tail tests Extend BPF selftest xdp_adjust_tail with grow tail tests, which are added as subtests. The first grow test stays in the same form as the original shrink test. The second grow test uses the newer bpf_prog_test_run_xattr() calls, and does extra checking of data contents. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945350567.97035.9632611946765811876.stgit@firesoul --- .../selftests/bpf/prog_tests/xdp_adjust_tail.c | 116 ++++++++++++++++++++- .../bpf/progs/test_xdp_adjust_tail_grow.c | 33 ++++++ 2 files changed, 144 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index a76dd81dfce9..d5c98f2cb12f 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -5,10 +5,10 @@ void test_xdp_adjust_tail_shrink(void) { const char *file = "./test_xdp_adjust_tail_shrink.o"; + __u32 duration, retval, size, expect_sz; struct bpf_object *obj; - char buf[128]; - __u32 duration, retval, size; int err, prog_fd; + char buf[128]; err = 
bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); if (CHECK_FAIL(err)) @@ -21,15 +21,121 @@ void test_xdp_adjust_tail_shrink(void) "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || retval != XDP_TX || size != 54, - "ipv6", "err %d errno %d retval %d size %d\n", + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + struct bpf_object *obj; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + __u32 duration, retval, size, expect_sz; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_DROP, + "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + + expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */ + err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */, + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow2(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/; + struct bpf_object *obj; + int err, cnt, i; + int max_grow; + + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &buf, + .data_out = &buf, + 
.data_size_in = 0, /* Per test */ + .data_size_out = 0, /* Per test */ + }; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + + /* Test case-64 */ + memset(buf, 1, sizeof(buf)); + tattr.data_size_in = 64; /* Determine test case via pkt size */ + tattr.data_size_out = 128; /* Limit copy_size */ + /* Kernel side alloc packet memory area that is zero init */ + err = bpf_prog_test_run_xattr(&tattr); + + CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */ + || tattr.retval != XDP_TX + || tattr.data_size_out != 192, /* Expected grow size */ + "case-64", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Extra checks for data contents */ + CHECK_ATTR(tattr.data_size_out != 192 + || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */ + || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */ + || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */ + "case-64-data", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Test case-128 */ + memset(buf, 2, sizeof(buf)); + tattr.data_size_in = 128; /* Determine test case via pkt size */ + tattr.data_size_out = sizeof(buf); /* Copy everything */ + err = bpf_prog_test_run_xattr(&tattr); + + max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */ + CHECK_ATTR(err + || tattr.retval != XDP_TX + || tattr.data_size_out != max_grow,/* Expect max grow size */ + "case-128", + "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, max_grow); + + /* Extra checks for data content: Count grow size, will contain zeros */ + for (i = 0, cnt = 0; i < sizeof(buf); i++) { + if (buf[i] == 0) + cnt++; + } + CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */ + || tattr.data_size_out != max_grow, /* Total grow size */ + "case-128-data", + "err %d errno %d retval %d size 
%d grow-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, cnt); + bpf_object__close(obj); } void test_xdp_adjust_tail(void) { - test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_shrink")) + test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_grow")) + test_xdp_adjust_tail_grow(); + if (test__start_subtest("xdp_adjust_tail_grow2")) + test_xdp_adjust_tail_grow2(); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c new file mode 100644 index 000000000000..3d66599eee2e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +SEC("xdp_adjust_tail_grow") +int _xdp_adjust_tail_grow(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + unsigned int data_len; + int offset = 0; + + /* Data length determine test case */ + data_len = data_end - data; + + if (data_len == 54) { /* sizeof(pkt_v4) */ + offset = 4096; /* test too large offset */ + } else if (data_len == 74) { /* sizeof(pkt_v6) */ + offset = 40; + } else if (data_len == 64) { + offset = 128; + } else if (data_len == 128) { + offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */ + } else { + return XDP_ABORTED; /* No matching test */ + } + + if (bpf_xdp_adjust_tail(xdp, offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 0ee52c0f6c67e187ff1906f6048af7c96df320c7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 13 May 2020 09:58:49 +0200 Subject: bpf, bpftool: Allow probing for CONFIG_HZ from kernel config In Cilium we've recently switched to make use of bpf_jiffies64() for parts of our tc and XDP datapath since bpf_ktime_get_ns() is more expensive and high-precision is not needed for our timeouts we have anyway. 
Our agent has a probe manager which picks up the json of bpftool's feature probe and we also use the macro output in our C programs e.g. to have workarounds when helpers are not available on older kernels. Extend the kernel config info dump to also include the kernel's CONFIG_HZ, and rework the probe_kernel_image_config() for allowing a macro dump such that CONFIG_HZ can be propagated to BPF C code as a simple define if available via config. Latter allows to have _compile- time_ resolution of jiffies <-> sec conversion in our code since all are propagated as known constants. Given we cannot generally assume availability of kconfig everywhere, we also have a kernel hz probe [0] as a fallback. Potentially, bpftool could have an integrated probe fallback as well, although to derive it, we might need to place it under 'bpftool feature probe full' or similar given it would slow down the probing process overall. Yet 'full' doesn't fit either for us since we don't want to pollute the kernel log with warning messages from bpf_probe_write_user() and bpf_trace_printk() on agent startup; I've left it out for the time being. [0] https://github.com/cilium/cilium/blob/master/bpf/cilium-probe-kernel-hz.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200513075849.20868-1-daniel@iogearbox.net --- tools/bpf/bpftool/feature.c | 120 +++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 53 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index f54347f55ee0..1b73e63274b5 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -80,13 +80,12 @@ print_bool_feature(const char *feat_name, const char *plain_name, printf("%s is %savailable\n", plain_name, res ? 
"" : "NOT "); } -static void print_kernel_option(const char *name, const char *value) +static void print_kernel_option(const char *name, const char *value, + const char *define_prefix) { char *endptr; int res; - /* No support for C-style ouptut */ - if (json_output) { if (!value) { jsonw_null_field(json_wtr, name); @@ -98,6 +97,12 @@ static void print_kernel_option(const char *name, const char *value) jsonw_int_field(json_wtr, name, res); else jsonw_string_field(json_wtr, name, value); + } else if (define_prefix) { + if (value) + printf("#define %s%s %s\n", define_prefix, + name, value); + else + printf("/* %s%s is not set */\n", define_prefix, name); } else { if (value) printf("%s is set to %s\n", name, value); @@ -315,77 +320,84 @@ static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, return false; } -static void probe_kernel_image_config(void) +static void probe_kernel_image_config(const char *define_prefix) { - static const char * const options[] = { + static const struct { + const char * const name; + bool macro_dump; + } options[] = { /* Enable BPF */ - "CONFIG_BPF", + { "CONFIG_BPF", }, /* Enable bpf() syscall */ - "CONFIG_BPF_SYSCALL", + { "CONFIG_BPF_SYSCALL", }, /* Does selected architecture support eBPF JIT compiler */ - "CONFIG_HAVE_EBPF_JIT", + { "CONFIG_HAVE_EBPF_JIT", }, /* Compile eBPF JIT compiler */ - "CONFIG_BPF_JIT", + { "CONFIG_BPF_JIT", }, /* Avoid compiling eBPF interpreter (use JIT only) */ - "CONFIG_BPF_JIT_ALWAYS_ON", + { "CONFIG_BPF_JIT_ALWAYS_ON", }, /* cgroups */ - "CONFIG_CGROUPS", + { "CONFIG_CGROUPS", }, /* BPF programs attached to cgroups */ - "CONFIG_CGROUP_BPF", + { "CONFIG_CGROUP_BPF", }, /* bpf_get_cgroup_classid() helper */ - "CONFIG_CGROUP_NET_CLASSID", + { "CONFIG_CGROUP_NET_CLASSID", }, /* bpf_skb_{,ancestor_}cgroup_id() helpers */ - "CONFIG_SOCK_CGROUP_DATA", + { "CONFIG_SOCK_CGROUP_DATA", }, /* Tracing: attach BPF to kprobes, tracepoints, etc. 
*/ - "CONFIG_BPF_EVENTS", + { "CONFIG_BPF_EVENTS", }, /* Kprobes */ - "CONFIG_KPROBE_EVENTS", + { "CONFIG_KPROBE_EVENTS", }, /* Uprobes */ - "CONFIG_UPROBE_EVENTS", + { "CONFIG_UPROBE_EVENTS", }, /* Tracepoints */ - "CONFIG_TRACING", + { "CONFIG_TRACING", }, /* Syscall tracepoints */ - "CONFIG_FTRACE_SYSCALLS", + { "CONFIG_FTRACE_SYSCALLS", }, /* bpf_override_return() helper support for selected arch */ - "CONFIG_FUNCTION_ERROR_INJECTION", + { "CONFIG_FUNCTION_ERROR_INJECTION", }, /* bpf_override_return() helper */ - "CONFIG_BPF_KPROBE_OVERRIDE", + { "CONFIG_BPF_KPROBE_OVERRIDE", }, /* Network */ - "CONFIG_NET", + { "CONFIG_NET", }, /* AF_XDP sockets */ - "CONFIG_XDP_SOCKETS", + { "CONFIG_XDP_SOCKETS", }, /* BPF_PROG_TYPE_LWT_* and related helpers */ - "CONFIG_LWTUNNEL_BPF", + { "CONFIG_LWTUNNEL_BPF", }, /* BPF_PROG_TYPE_SCHED_ACT, TC (traffic control) actions */ - "CONFIG_NET_ACT_BPF", + { "CONFIG_NET_ACT_BPF", }, /* BPF_PROG_TYPE_SCHED_CLS, TC filters */ - "CONFIG_NET_CLS_BPF", + { "CONFIG_NET_CLS_BPF", }, /* TC clsact qdisc */ - "CONFIG_NET_CLS_ACT", + { "CONFIG_NET_CLS_ACT", }, /* Ingress filtering with TC */ - "CONFIG_NET_SCH_INGRESS", + { "CONFIG_NET_SCH_INGRESS", }, /* bpf_skb_get_xfrm_state() helper */ - "CONFIG_XFRM", + { "CONFIG_XFRM", }, /* bpf_get_route_realm() helper */ - "CONFIG_IP_ROUTE_CLASSID", + { "CONFIG_IP_ROUTE_CLASSID", }, /* BPF_PROG_TYPE_LWT_SEG6_LOCAL and related helpers */ - "CONFIG_IPV6_SEG6_BPF", + { "CONFIG_IPV6_SEG6_BPF", }, /* BPF_PROG_TYPE_LIRC_MODE2 and related helpers */ - "CONFIG_BPF_LIRC_MODE2", + { "CONFIG_BPF_LIRC_MODE2", }, /* BPF stream parser and BPF socket maps */ - "CONFIG_BPF_STREAM_PARSER", + { "CONFIG_BPF_STREAM_PARSER", }, /* xt_bpf module for passing BPF programs to netfilter */ - "CONFIG_NETFILTER_XT_MATCH_BPF", + { "CONFIG_NETFILTER_XT_MATCH_BPF", }, /* bpfilter back-end for iptables */ - "CONFIG_BPFILTER", + { "CONFIG_BPFILTER", }, /* bpftilter module with "user mode helper" */ - "CONFIG_BPFILTER_UMH", + { 
"CONFIG_BPFILTER_UMH", }, /* test_bpf module for BPF tests */ - "CONFIG_TEST_BPF", + { "CONFIG_TEST_BPF", }, + + /* Misc configs useful in BPF C programs */ + /* jiffies <-> sec conversion for bpf_jiffies64() helper */ + { "CONFIG_HZ", true, } }; char *values[ARRAY_SIZE(options)] = { }; struct utsname utsn; @@ -427,7 +439,8 @@ static void probe_kernel_image_config(void) while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { for (i = 0; i < ARRAY_SIZE(options); i++) { - if (values[i] || strcmp(buf, options[i])) + if ((define_prefix && !options[i].macro_dump) || + values[i] || strcmp(buf, options[i].name)) continue; values[i] = strdup(value); @@ -439,7 +452,9 @@ end_parse: gzclose(file); for (i = 0; i < ARRAY_SIZE(options); i++) { - print_kernel_option(options[i], values[i]); + if (define_prefix && !options[i].macro_dump) + continue; + print_kernel_option(options[i].name, values[i], define_prefix); free(values[i]); } } @@ -632,23 +647,22 @@ section_system_config(enum probe_component target, const char *define_prefix) switch (target) { case COMPONENT_KERNEL: case COMPONENT_UNSPEC: - if (define_prefix) - break; - print_start_section("system_config", "Scanning system configuration...", - NULL, /* define_comment never used here */ - NULL); /* define_prefix always NULL here */ - if (check_procfs()) { - probe_unprivileged_disabled(); - probe_jit_enable(); - probe_jit_harden(); - probe_jit_kallsyms(); - probe_jit_limit(); - } else { - p_info("/* procfs not mounted, skipping related probes */"); + "/*** Misc kernel config items ***/", + define_prefix); + if (!define_prefix) { + if (check_procfs()) { + probe_unprivileged_disabled(); + probe_jit_enable(); + probe_jit_harden(); + probe_jit_kallsyms(); + probe_jit_limit(); + } else { + p_info("/* procfs not mounted, skipping related probes */"); + } } - probe_kernel_image_config(); + probe_kernel_image_config(define_prefix); print_end_section(); break; default: -- cgit v1.2.3 From 
81626001187609b9c49696a5b48d5abcf0e5f9be Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:55 -0700 Subject: selftests/bpf: Use CAP_BPF and CAP_PERFMON in tests Make all test_verifier test exercise CAP_BPF and CAP_PERFMON Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-4-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/test_verifier.c | 44 +++++++++++++++++++----- tools/testing/selftests/bpf/verifier/calls.c | 16 ++++----- tools/testing/selftests/bpf/verifier/dead_code.c | 10 +++--- 3 files changed, 49 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 21a1ce219c1c..78a6bae56ea6 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -818,10 +818,18 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, } } +struct libcap { + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data[2]; +}; + static int set_admin(bool admin) { cap_t caps; - const cap_value_t cap_val = CAP_SYS_ADMIN; + /* need CAP_BPF, CAP_NET_ADMIN, CAP_PERFMON to load progs */ + const cap_value_t cap_net_admin = CAP_NET_ADMIN; + const cap_value_t cap_sys_admin = CAP_SYS_ADMIN; + struct libcap *cap; int ret = -1; caps = cap_get_proc(); @@ -829,11 +837,26 @@ static int set_admin(bool admin) perror("cap_get_proc"); return -1; } - if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, + cap = (struct libcap *)caps; + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) { + perror("cap_set_flag clear admin"); + goto out; + } + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin, admin ? 
CAP_SET : CAP_CLEAR)) { - perror("cap_set_flag"); + perror("cap_set_flag set_or_clear net"); goto out; } + /* libcap is likely old and simply ignores CAP_BPF and CAP_PERFMON, + * so update effective bits manually + */ + if (admin) { + cap->data[1].effective |= 1 << (38 /* CAP_PERFMON */ - 32); + cap->data[1].effective |= 1 << (39 /* CAP_BPF */ - 32); + } else { + cap->data[1].effective &= ~(1 << (38 - 32)); + cap->data[1].effective &= ~(1 << (39 - 32)); + } if (cap_set_proc(caps)) { perror("cap_set_proc"); goto out; @@ -1067,9 +1090,11 @@ fail_log: static bool is_admin(void) { + cap_flag_value_t net_priv = CAP_CLEAR; + bool perfmon_priv = false; + bool bpf_priv = false; + struct libcap *cap; cap_t caps; - cap_flag_value_t sysadmin = CAP_CLEAR; - const cap_value_t cap_val = CAP_SYS_ADMIN; #ifdef CAP_IS_SUPPORTED if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) { @@ -1082,11 +1107,14 @@ static bool is_admin(void) perror("cap_get_proc"); return false; } - if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin)) - perror("cap_get_flag"); + cap = (struct libcap *)caps; + bpf_priv = cap->data[1].effective & (1 << (39/* CAP_BPF */ - 32)); + perfmon_priv = cap->data[1].effective & (1 << (38/* CAP_PERFMON */ - 32)); + if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv)) + perror("cap_get_flag NET"); if (cap_free(caps)) perror("cap_free"); - return (sysadmin == CAP_SET); + return bpf_priv && perfmon_priv && net_priv == CAP_SET; } static void get_unpriv_disabled() diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 2d752c4f8d9d..7629a0cebb9b 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, 
.retval = 1, @@ -315,7 +315,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = POINTER_VALUE, @@ -346,7 +346,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1064,7 +1064,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .errstr = "R0 !read_ok", .result = REJECT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c 
b/tools/testing/selftests/bpf/verifier/dead_code.c index 50a8a63be4ac..5cf361d8eb1c 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, -- cgit v1.2.3 From e7534fd42a99f2dcca022d2c9a37adf82ad07998 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:14 +0300 Subject: selftests: implement flower classifier terse dump tests Implement two basic tests to verify terse dump functionality of flower classifier: - Test that verifies that terse dump 
works. - Test that verifies that terse dump doesn't print filter key. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/filters/tests.json | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json index 12aa4bc1f6a0..bb543bf69d69 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json @@ -87,5 +87,43 @@ "teardown": [ "$TC qdisc del dev $DEV2 ingress" ] + }, + { + "id": "7c65", + "name": "Add flower filter and then terse dump it", + "category": [ + "filter", + "flower" + ], + "setup": [ + "$TC qdisc add dev $DEV2 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", + "expExitCode": "0", + "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "matchPattern": "filter protocol ip pref 1 flower.*handle", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV2 ingress" + ] + }, + { + "id": "d45e", + "name": "Add flower filter and verify that terse dump doesn't output filter key", + "category": [ + "filter", + "flower" + ], + "setup": [ + "$TC qdisc add dev $DEV2 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", + "expExitCode": "0", + "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "matchPattern": " dst_mac e4:11:22:11:4a:51", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV2 ingress" + ] } ] -- cgit v1.2.3 From f516acd5397fdbb77ef0aad0798d9ef7c3001d72 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 15 May 2020 09:50:02 -0700 Subject: libbpf, hashmap: Remove unused #include Remove #include of libbpf_internal.h that is unused. 
Discussed in this thread: https://lore.kernel.org/lkml/CAEf4BzZRmiEds_8R8g4vaAeWvJzPb4xYLnpF0X2VNY8oTzkphQ@mail.gmail.com/ Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200515165007.217120-3-irogers@google.com --- tools/lib/bpf/hashmap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index bae8879cdf58..e823b35e7371 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -15,7 +15,6 @@ #else #include #endif -#include "libbpf_internal.h" static inline size_t hash_bits(size_t h, int bits) { -- cgit v1.2.3 From 8d35d74f52ae07689e575ea21f7dc2e07dd1392f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 15 May 2020 09:50:03 -0700 Subject: libbpf, hashmap: Fix signedness warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following warnings: hashmap.c: In function ‘hashmap__clear’: hashmap.h:150:20: error: comparison of integer expressions of different signedness: ‘int’ and ‘size_t’ {aka ‘long unsigned int’} [-Werror=sign-compare] 150 | for (bkt = 0; bkt < map->cap; bkt++) \ hashmap.c: In function ‘hashmap_grow’: hashmap.h:150:20: error: comparison of integer expressions of different signedness: ‘int’ and ‘size_t’ {aka ‘long unsigned int’} [-Werror=sign-compare] 150 | for (bkt = 0; bkt < map->cap; bkt++) \ Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200515165007.217120-4-irogers@google.com --- tools/lib/bpf/hashmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c index cffb96202e0d..a405dad068f5 100644 --- a/tools/lib/bpf/hashmap.c +++ b/tools/lib/bpf/hashmap.c @@ -60,7 +60,7 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, void hashmap__clear(struct hashmap *map) { struct 
hashmap_entry *cur, *tmp; - int bkt; + size_t bkt; hashmap__for_each_entry_safe(map, cur, tmp, bkt) { free(cur); @@ -100,8 +100,7 @@ static int hashmap_grow(struct hashmap *map) struct hashmap_entry **new_buckets; struct hashmap_entry *cur, *tmp; size_t new_cap_bits, new_cap; - size_t h; - int bkt; + size_t h, bkt; new_cap_bits = map->cap_bits + 1; if (new_cap_bits < HASHMAP_MIN_CAP_BITS) -- cgit v1.2.3 From 5366d2269139ba8eb6a906d73a0819947e3e4e0a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 May 2020 12:49:03 -0700 Subject: selftests/bpf: Fix test_align verifier log patterns Commit 294f2fc6da27 ("bpf: Verifer, adjust_scalar_min_max_vals to always call update_reg_bounds()") changed the way verifier logs some of its state, adjust the test_align accordingly. Where possible, I tried to not copy-paste the entire log line and resorted to dropping the last closing brace instead. Fixes: 294f2fc6da27 ("bpf: Verifer, adjust_scalar_min_max_vals to always call update_reg_bounds()") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200515194904.229296-1-sdf@google.com --- tools/testing/selftests/bpf/test_align.c | 41 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index 0262f7b374f9..c9c9bdce9d6d 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -359,15 +359,15 @@ static struct bpf_align_test tests[] = { * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. 
Then the variable offset is (4n), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, - {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, + {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, }, }, { @@ -410,15 +410,15 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, - {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ @@ -426,15 +426,15 @@ static struct bpf_align_test tests[] = { /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). 
*/ - {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, - {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, }, }, { @@ -469,16 +469,16 @@ static struct bpf_align_test tests[] = { .matches = { {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. 
*/ - {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, + {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -486,7 +486,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. 
*/ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, } }, { @@ -528,7 +528,7 @@ static struct bpf_align_test tests[] = { /* New unknown value in R7 is (4n) */ {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Subtracting it from R6 blows our unsigned bounds */ - {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"}, + {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* At the time the word size load is performed from R5, @@ -537,7 +537,8 @@ static struct bpf_align_test tests[] = { * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + }, }, { @@ -579,18 +580,18 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"}, + {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the 
time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, }, }, }; -- cgit v1.2.3 From 3b09d27cc93d584f49bc18f1e1696ba19d43233a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 May 2020 12:49:04 -0700 Subject: selftests/bpf: Move test_align under test_progs There is a much higher chance we can see the regressions if the test is part of test_progs. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200515194904.229296-2-sdf@google.com --- tools/testing/selftests/bpf/prog_tests/align.c | 666 +++++++++++++++++++++++ tools/testing/selftests/bpf/test_align.c | 720 ------------------------- 2 files changed, 666 insertions(+), 720 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/align.c delete mode 100644 tools/testing/selftests/bpf/test_align.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c new file mode 100644 index 000000000000..c548aded6585 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -0,0 +1,666 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#define MAX_INSNS 512 +#define MAX_MATCHES 16 + +struct bpf_reg_match { + unsigned int line; + const char *match; +}; + +struct bpf_align_test { + const char *descr; + struct bpf_insn insns[MAX_INSNS]; + enum { + UNDEF, + ACCEPT, + REJECT + } result; + enum bpf_prog_type prog_type; + /* Matches must be in order of increasing line */ + struct bpf_reg_match matches[MAX_MATCHES]; +}; + +static struct bpf_align_test tests[] = { + /* Four tests of known constants. 
These aren't staggeringly + * interesting since we track exact values now. + */ + { + .descr = "mov", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_3, 8), + BPF_MOV64_IMM(BPF_REG_3, 16), + BPF_MOV64_IMM(BPF_REG_3, 32), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + {1, "R1=ctx(id=0,off=0,imm=0)"}, + {1, "R10=fp0"}, + {1, "R3_w=inv2"}, + {2, "R3_w=inv4"}, + {3, "R3_w=inv8"}, + {4, "R3_w=inv16"}, + {5, "R3_w=inv32"}, + }, + }, + { + .descr = "shift", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 32), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + {1, "R1=ctx(id=0,off=0,imm=0)"}, + {1, "R10=fp0"}, + {1, "R3_w=inv1"}, + {2, "R3_w=inv2"}, + {3, "R3_w=inv4"}, + {4, "R3_w=inv8"}, + {5, "R3_w=inv16"}, + {6, "R3_w=inv1"}, + {7, "R4_w=inv32"}, + {8, "R4_w=inv16"}, + {9, "R4_w=inv8"}, + {10, "R4_w=inv4"}, + {11, "R4_w=inv2"}, + }, + }, + { + .descr = "addsub", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 4), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_4, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + {1, "R1=ctx(id=0,off=0,imm=0)"}, + {1, "R10=fp0"}, + {1, "R3_w=inv4"}, + {2, "R3_w=inv8"}, + {3, "R3_w=inv10"}, + {4, "R4_w=inv8"}, + {5, "R4_w=inv12"}, + {6, "R4_w=inv14"}, + }, + }, + { + .descr = "mul", + .insns = { + 
BPF_MOV64_IMM(BPF_REG_3, 7), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 2), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + {1, "R1=ctx(id=0,off=0,imm=0)"}, + {1, "R10=fp0"}, + {1, "R3_w=inv7"}, + {2, "R3_w=inv7"}, + {3, "R3_w=inv14"}, + {4, "R3_w=inv56"}, + }, + }, + + /* Tests using unknown values */ +#define PREP_PKT_POINTERS \ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \ + offsetof(struct __sk_buff, data)), \ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \ + offsetof(struct __sk_buff, data_end)) + +#define LOAD_UNKNOWN(DST_REG) \ + PREP_PKT_POINTERS, \ + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), \ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), \ + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 1), \ + BPF_EXIT_INSN(), \ + BPF_LDX_MEM(BPF_B, DST_REG, BPF_REG_2, 0) + + { + .descr = "unknown shift", + .insns = { + LOAD_UNKNOWN(BPF_REG_3), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), + LOAD_UNKNOWN(BPF_REG_4), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"}, + {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, + {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, + {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, + {18, "R3=pkt_end(id=0,off=0,imm=0)"}, + {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"}, + {20, 
"R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, + {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, + {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, + }, + }, + { + .descr = "unknown mul", + .insns = { + LOAD_UNKNOWN(BPF_REG_3), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 1), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 4), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 8), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, + {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, + {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, + }, + }, + { + .descr = "packet const offset", + .insns = { + PREP_PKT_POINTERS, + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + + BPF_MOV64_IMM(BPF_REG_0, 0), + + /* Skip over ethernet header. 
*/ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 2), + BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 3), + BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 0), + BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, + {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"}, + {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"}, + {10, "R2=pkt(id=0,off=0,r=18,imm=0)"}, + {10, "R5=pkt(id=0,off=14,r=18,imm=0)"}, + {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, + {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, + }, + }, + { + .descr = "packet variable offset", + .insns = { + LOAD_UNKNOWN(BPF_REG_6), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), + + /* First, add a constant to the R5 packet pointer, + * then a variable with a known alignment. + */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), + + /* Now, test in the other direction. Adding first + * the variable offset to R5, then the constant. 
+ */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), + + /* Test multiple accumulations of unknown values + * into a packet pointer. + */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + /* Calculated offset in R6 has unknown value, but known + * alignment of 4. + */ + {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Offset is added to packet pointer R5, resulting in + * known fixed offset, and variable offset from R6. + */ + {11, "R5_w=pkt(id=1,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* At the time the word size load is performed from R5, + * it's total offset is NET_IP_ALIGN + reg->off (0) + + * reg->aux_off (14) which is 16. Then the variable + * offset is considered using reg->aux_off_align which + * is 4 and meets the load's requirements. + */ + {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Variable offset is added to R5 packet pointer, + * resulting in auxiliary alignment of 4. + */ + {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Constant offset is added to R5, resulting in + * reg->off of 14. 
+ */ + {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off + * (14) which is 16. Then the variable offset is 4-byte + * aligned, so the total offset is 4-byte aligned and + * meets the load's requirements. + */ + {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Constant offset is added to R5 packet pointer, + * resulting in reg->off value of 14. + */ + {26, "R5_w=pkt(id=0,off=14,r=8"}, + /* Variable offset is added to R5, resulting in a + * variable offset of (4n). + */ + {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Constant is added to R5 again, setting reg->off to 18. */ + {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* And once more we add a variable; resulting var_off + * is still (4n), fixed offset is not changed. + * Also, we create a new reg->id. + */ + {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off (18) + * which is 20. Then the variable offset is (4n), so + * the total offset is 4-byte aligned and meets the + * load's requirements. 
+ */ + {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, + {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, + }, + }, + { + .descr = "packet variable offset 2", + .insns = { + /* Create an unknown offset, (4n+2)-aligned */ + LOAD_UNKNOWN(BPF_REG_6), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14), + /* Add it to the packet pointer */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + /* Check bounds and perform a read */ + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), + /* Make a (4n) offset from the value we just read */ + BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), + /* Add it to the packet pointer */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + /* Check bounds and perform a read */ + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + /* Calculated offset in R6 has unknown value, but known + * alignment of 4. + */ + {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Adding 14 makes R6 be (4n+2) */ + {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + /* Packet pointer has (4n+2) offset */ + {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off (0) + * which is 2. 
Then the variable offset is (4n+2), so + * the total offset is 4-byte aligned and meets the + * load's requirements. + */ + {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + /* Newly read value in R6 was shifted left by 2, so has + * known alignment of 4. + */ + {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Added (4n) to packet pointer's (4n+2) var_off, giving + * another (4n+2). + */ + {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off (0) + * which is 2. Then the variable offset is (4n+2), so + * the total offset is 4-byte aligned and meets the + * load's requirements. + */ + {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + }, + }, + { + .descr = "dubious pointer arithmetic", + .insns = { + PREP_PKT_POINTERS, + BPF_MOV64_IMM(BPF_REG_0, 0), + /* (ptr - ptr) << 2 */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2), + /* We have a (4n) value. Let's make a packet offset + * out of it. 
First add 14, to make it a (4n+2) + */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), + /* Then make sure it's nonnegative */ + BPF_JMP_IMM(BPF_JSGE, BPF_REG_5, 0, 1), + BPF_EXIT_INSN(), + /* Add it to packet pointer */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + /* Check bounds and perform a read */ + BPF_MOV64_REG(BPF_REG_4, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_6, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .matches = { + {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, + /* (ptr - ptr) << 2 == unknown, (4n) */ + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, + /* (4n) + 14 == (4n+2). We blow our bounds, because + * the add could overflow. + */ + {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, + /* Checked s>=0 */ + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + /* packet pointer + nonnegative (4n+2) */ + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. + * We checked the bounds, but it might have been able + * to overflow if the packet pointer started in the + * upper half of the address space. + * So we did not get a 'range' on R6, and the access + * attempt will fail. 
+ */ + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + } + }, + { + .descr = "variable subtraction", + .insns = { + /* Create an unknown offset, (4n+2)-aligned */ + LOAD_UNKNOWN(BPF_REG_6), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14), + /* Create another unknown, (4n)-aligned, and subtract + * it from the first one + */ + BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2), + BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_7), + /* Bounds-check the result */ + BPF_JMP_IMM(BPF_JSGE, BPF_REG_6, 0, 1), + BPF_EXIT_INSN(), + /* Add it to the packet pointer */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), + /* Check bounds and perform a read */ + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + /* Calculated offset in R6 has unknown value, but known + * alignment of 4. + */ + {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Adding 14 makes R6 be (4n+2) */ + {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + /* New unknown value in R7 is (4n) */ + {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + /* Subtracting it from R6 blows our unsigned bounds */ + {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, + /* Checked s>= 0 */ + {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off (0) + * which is 2. 
Then the variable offset is (4n+2), so + * the total offset is 4-byte aligned and meets the + * load's requirements. + */ + {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + + }, + }, + { + .descr = "pointer variable subtraction", + .insns = { + /* Create an unknown offset, (4n+2)-aligned and bounded + * to [14,74] + */ + LOAD_UNKNOWN(BPF_REG_6), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xf), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14), + /* Subtract it from the packet pointer */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_6), + /* Create another unknown, (4n)-aligned and >= 74. + * That in fact means >= 76, since 74 % 4 == 2 + */ + BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 76), + /* Add it to the packet pointer */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_7), + /* Check bounds and perform a read */ + BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .matches = { + /* Calculated offset in R6 has unknown value, but known + * alignment of 4. 
+ */ + {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"}, + /* Adding 14 makes R6 be (4n+2) */ + {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, + /* Subtracting from packet pointer overflows ubounds */ + {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, + /* New unknown value in R7 is (4n), >= 76 */ + {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, + /* Adding it to packet pointer gives nice bounds again */ + {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, + /* At the time the word size load is performed from R5, + * its total fixed offset is NET_IP_ALIGN + reg->off (0) + * which is 2. Then the variable offset is (4n+2), so + * the total offset is 4-byte aligned and meets the + * load's requirements. + */ + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, + }, + }, +}; + +static int probe_filter_length(const struct bpf_insn *fp) +{ + int len; + + for (len = MAX_INSNS - 1; len > 0; --len) + if (fp[len].code != 0 || fp[len].imm != 0) + break; + return len + 1; +} + +static char bpf_vlog[32768]; + +static int do_test_single(struct bpf_align_test *test) +{ + struct bpf_insn *prog = test->insns; + int prog_type = test->prog_type; + char bpf_vlog_copy[32768]; + const char *line_ptr; + int cur_line = -1; + int prog_len, i; + int fd_prog; + int ret; + + prog_len = probe_filter_length(prog); + fd_prog = bpf_verify_program(prog_type ? 
: BPF_PROG_TYPE_SOCKET_FILTER, + prog, prog_len, BPF_F_STRICT_ALIGNMENT, + "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2); + if (fd_prog < 0 && test->result != REJECT) { + printf("Failed to load program.\n"); + printf("%s", bpf_vlog); + ret = 1; + } else if (fd_prog >= 0 && test->result == REJECT) { + printf("Unexpected success to load!\n"); + printf("%s", bpf_vlog); + ret = 1; + close(fd_prog); + } else { + ret = 0; + /* We make a local copy so that we can strtok() it */ + strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy)); + line_ptr = strtok(bpf_vlog_copy, "\n"); + for (i = 0; i < MAX_MATCHES; i++) { + struct bpf_reg_match m = test->matches[i]; + + if (!m.match) + break; + while (line_ptr) { + cur_line = -1; + sscanf(line_ptr, "%u: ", &cur_line); + if (cur_line == m.line) + break; + line_ptr = strtok(NULL, "\n"); + } + if (!line_ptr) { + printf("Failed to find line %u for match: %s\n", + m.line, m.match); + ret = 1; + printf("%s", bpf_vlog); + break; + } + if (!strstr(line_ptr, m.match)) { + printf("Failed to find match %u: %s\n", + m.line, m.match); + ret = 1; + printf("%s", bpf_vlog); + break; + } + } + if (fd_prog >= 0) + close(fd_prog); + } + return ret; +} + +void test_align(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct bpf_align_test *test = &tests[i]; + + if (!test__start_subtest(test->descr)) + continue; + + CHECK_FAIL(do_test_single(test)); + } +} diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c deleted file mode 100644 index c9c9bdce9d6d..000000000000 --- a/tools/testing/selftests/bpf/test_align.c +++ /dev/null @@ -1,720 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include "../../../include/linux/filter.h" -#include "bpf_rlimit.h" -#include "bpf_util.h" - -#define MAX_INSNS 512 -#define MAX_MATCHES 16 - -struct bpf_reg_match { - unsigned int line; - 
const char *match; -}; - -struct bpf_align_test { - const char *descr; - struct bpf_insn insns[MAX_INSNS]; - enum { - UNDEF, - ACCEPT, - REJECT - } result; - enum bpf_prog_type prog_type; - /* Matches must be in order of increasing line */ - struct bpf_reg_match matches[MAX_MATCHES]; -}; - -static struct bpf_align_test tests[] = { - /* Four tests of known constants. These aren't staggeringly - * interesting since we track exact values now. - */ - { - .descr = "mov", - .insns = { - BPF_MOV64_IMM(BPF_REG_3, 2), - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_3, 8), - BPF_MOV64_IMM(BPF_REG_3, 16), - BPF_MOV64_IMM(BPF_REG_3, 32), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, - {1, "R3_w=inv2"}, - {2, "R3_w=inv4"}, - {3, "R3_w=inv8"}, - {4, "R3_w=inv16"}, - {5, "R3_w=inv32"}, - }, - }, - { - .descr = "shift", - .insns = { - BPF_MOV64_IMM(BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_4, 32), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, - {1, "R3_w=inv1"}, - {2, "R3_w=inv2"}, - {3, "R3_w=inv4"}, - {4, "R3_w=inv8"}, - {5, "R3_w=inv16"}, - {6, "R3_w=inv1"}, - {7, "R4_w=inv32"}, - {8, "R4_w=inv16"}, - {9, "R4_w=inv8"}, - {10, "R4_w=inv4"}, - {11, "R4_w=inv2"}, - }, - }, - { - .descr = "addsub", - .insns = { - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 4), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 2), - BPF_MOV64_IMM(BPF_REG_4, 8), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - 
BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, - {1, "R3_w=inv4"}, - {2, "R3_w=inv8"}, - {3, "R3_w=inv10"}, - {4, "R4_w=inv8"}, - {5, "R4_w=inv12"}, - {6, "R4_w=inv14"}, - }, - }, - { - .descr = "mul", - .insns = { - BPF_MOV64_IMM(BPF_REG_3, 7), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 2), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, - {1, "R3_w=inv7"}, - {2, "R3_w=inv7"}, - {3, "R3_w=inv14"}, - {4, "R3_w=inv56"}, - }, - }, - - /* Tests using unknown values */ -#define PREP_PKT_POINTERS \ - BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \ - offsetof(struct __sk_buff, data)), \ - BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \ - offsetof(struct __sk_buff, data_end)) - -#define LOAD_UNKNOWN(DST_REG) \ - PREP_PKT_POINTERS, \ - BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), \ - BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), \ - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 1), \ - BPF_EXIT_INSN(), \ - BPF_LDX_MEM(BPF_B, DST_REG, BPF_REG_2, 0) - - { - .descr = "unknown shift", - .insns = { - LOAD_UNKNOWN(BPF_REG_3), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1), - LOAD_UNKNOWN(BPF_REG_4), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"}, - {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, 
- {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, - {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, - {18, "R3=pkt_end(id=0,off=0,imm=0)"}, - {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"}, - {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, - {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, - {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, - }, - }, - { - .descr = "unknown mul", - .insns = { - LOAD_UNKNOWN(BPF_REG_3), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 1), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 4), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_3), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 8), - BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, - {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, - {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, - }, - }, - { - .descr = "packet const offset", - .insns = { - PREP_PKT_POINTERS, - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - - BPF_MOV64_IMM(BPF_REG_0, 0), - - /* Skip over ethernet header. 
*/ - BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - - BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 0), - BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 1), - BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 2), - BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 3), - BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 0), - BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 2), - BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), - - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, - {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"}, - {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"}, - {10, "R2=pkt(id=0,off=0,r=18,imm=0)"}, - {10, "R5=pkt(id=0,off=14,r=18,imm=0)"}, - {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, - {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, - }, - }, - { - .descr = "packet variable offset", - .insns = { - LOAD_UNKNOWN(BPF_REG_6), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), - - /* First, add a constant to the R5 packet pointer, - * then a variable with a known alignment. - */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), - - /* Now, test in the other direction. Adding first - * the variable offset to R5, then the constant. 
- */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), - - /* Test multiple accumulations of unknown values - * into a packet pointer. - */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4), - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0), - - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - /* Calculated offset in R6 has unknown value, but known - * alignment of 4. - */ - {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Offset is added to packet pointer R5, resulting in - * known fixed offset, and variable offset from R6. - */ - {11, "R5_w=pkt(id=1,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* At the time the word size load is performed from R5, - * it's total offset is NET_IP_ALIGN + reg->off (0) + - * reg->aux_off (14) which is 16. Then the variable - * offset is considered using reg->aux_off_align which - * is 4 and meets the load's requirements. - */ - {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Variable offset is added to R5 packet pointer, - * resulting in auxiliary alignment of 4. - */ - {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Constant offset is added to R5, resulting in - * reg->off of 14. 
- */ - {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* At the time the word size load is performed from R5, - * its total fixed offset is NET_IP_ALIGN + reg->off - * (14) which is 16. Then the variable offset is 4-byte - * aligned, so the total offset is 4-byte aligned and - * meets the load's requirements. - */ - {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Constant offset is added to R5 packet pointer, - * resulting in reg->off value of 14. - */ - {26, "R5_w=pkt(id=0,off=14,r=8"}, - /* Variable offset is added to R5, resulting in a - * variable offset of (4n). - */ - {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Constant is added to R5 again, setting reg->off to 18. */ - {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* And once more we add a variable; resulting var_off - * is still (4n), fixed offset is not changed. - * Also, we create a new reg->id. - */ - {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, - /* At the time the word size load is performed from R5, - * its total fixed offset is NET_IP_ALIGN + reg->off (18) - * which is 20. Then the variable offset is (4n), so - * the total offset is 4-byte aligned and meets the - * load's requirements. 
- */ - {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, - {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, - }, - }, - { - .descr = "packet variable offset 2", - .insns = { - /* Create an unknown offset, (4n+2)-aligned */ - LOAD_UNKNOWN(BPF_REG_6), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14), - /* Add it to the packet pointer */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), - /* Check bounds and perform a read */ - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), - /* Make a (4n) offset from the value we just read */ - BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), - /* Add it to the packet pointer */ - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), - /* Check bounds and perform a read */ - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - /* Calculated offset in R6 has unknown value, but known - * alignment of 4. - */ - {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Adding 14 makes R6 be (4n+2) */ - {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, - /* Packet pointer has (4n+2) offset */ - {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, - {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, - /* At the time the word size load is performed from R5, - * its total fixed offset is NET_IP_ALIGN + reg->off (0) - * which is 2. 
Then the variable offset is (4n+2), so - * the total offset is 4-byte aligned and meets the - * load's requirements. - */ - {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, - /* Newly read value in R6 was shifted left by 2, so has - * known alignment of 4. - */ - {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Added (4n) to packet pointer's (4n+2) var_off, giving - * another (4n+2). - */ - {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, - {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, - /* At the time the word size load is performed from R5, - * its total fixed offset is NET_IP_ALIGN + reg->off (0) - * which is 2. Then the variable offset is (4n+2), so - * the total offset is 4-byte aligned and meets the - * load's requirements. - */ - {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, - }, - }, - { - .descr = "dubious pointer arithmetic", - .insns = { - PREP_PKT_POINTERS, - BPF_MOV64_IMM(BPF_REG_0, 0), - /* (ptr - ptr) << 2 */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), - BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2), - /* We have a (4n) value. Let's make a packet offset - * out of it. 
First add 14, to make it a (4n+2) - */ - BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14), - /* Then make sure it's nonnegative */ - BPF_JMP_IMM(BPF_JSGE, BPF_REG_5, 0, 1), - BPF_EXIT_INSN(), - /* Add it to packet pointer */ - BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), - BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), - /* Check bounds and perform a read */ - BPF_MOV64_REG(BPF_REG_4, BPF_REG_6), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_6, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .result = REJECT, - .matches = { - {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, - /* (ptr - ptr) << 2 == unknown, (4n) */ - {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, - /* (4n) + 14 == (4n+2). We blow our bounds, because - * the add could overflow. - */ - {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, - /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, - /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, - /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. - * We checked the bounds, but it might have been able - * to overflow if the packet pointer started in the - * upper half of the address space. - * So we did not get a 'range' on R6, and the access - * attempt will fail. 
- */ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, - } - }, - { - .descr = "variable subtraction", - .insns = { - /* Create an unknown offset, (4n+2)-aligned */ - LOAD_UNKNOWN(BPF_REG_6), - BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14), - /* Create another unknown, (4n)-aligned, and subtract - * it from the first one - */ - BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2), - BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_7), - /* Bounds-check the result */ - BPF_JMP_IMM(BPF_JSGE, BPF_REG_6, 0, 1), - BPF_EXIT_INSN(), - /* Add it to the packet pointer */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6), - /* Check bounds and perform a read */ - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - /* Calculated offset in R6 has unknown value, but known - * alignment of 4. - */ - {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Adding 14 makes R6 be (4n+2) */ - {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, - /* New unknown value in R7 is (4n) */ - {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - /* Subtracting it from R6 blows our unsigned bounds */ - {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, - /* Checked s>= 0 */ - {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, - /* At the time the word size load is performed from R5, - * its total fixed offset is NET_IP_ALIGN + reg->off (0) - * which is 2. 
Then the variable offset is (4n+2), so - * the total offset is 4-byte aligned and meets the - * load's requirements. - */ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, - - }, - }, - { - .descr = "pointer variable subtraction", - .insns = { - /* Create an unknown offset, (4n+2)-aligned and bounded - * to [14,74] - */ - LOAD_UNKNOWN(BPF_REG_6), - BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), - BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xf), - BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14), - /* Subtract it from the packet pointer */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_6), - /* Create another unknown, (4n)-aligned and >= 74. - * That in fact means >= 76, since 74 % 4 == 2 - */ - BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 76), - /* Add it to the packet pointer */ - BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_7), - /* Check bounds and perform a read */ - BPF_MOV64_REG(BPF_REG_4, BPF_REG_5), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4), - BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1), - BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0), - BPF_EXIT_INSN(), - }, - .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .matches = { - /* Calculated offset in R6 has unknown value, but known - * alignment of 4. 
- */ - {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"}, - /* Adding 14 makes R6 be (4n+2) */ - {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, - /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, - /* New unknown value in R7 is (4n), >= 76 */ - {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, - /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, - /* At the time the word size load is performed from R5, - * its total fixed offset is NET_IP_ALIGN + reg->off (0) - * which is 2. Then the variable offset is (4n+2), so - * the total offset is 4-byte aligned and meets the - * load's requirements. - */ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, - }, - }, -}; - -static int probe_filter_length(const struct bpf_insn *fp) -{ - int len; - - for (len = MAX_INSNS - 1; len > 0; --len) - if (fp[len].code != 0 || fp[len].imm != 0) - break; - return len + 1; -} - -static char bpf_vlog[32768]; - -static int do_test_single(struct bpf_align_test *test) -{ - struct bpf_insn *prog = test->insns; - int prog_type = test->prog_type; - char bpf_vlog_copy[32768]; - const char *line_ptr; - int cur_line = -1; - int prog_len, i; - int fd_prog; - int ret; - - prog_len = probe_filter_length(prog); - fd_prog = bpf_verify_program(prog_type ? 
: BPF_PROG_TYPE_SOCKET_FILTER, - prog, prog_len, BPF_F_STRICT_ALIGNMENT, - "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2); - if (fd_prog < 0 && test->result != REJECT) { - printf("Failed to load program.\n"); - printf("%s", bpf_vlog); - ret = 1; - } else if (fd_prog >= 0 && test->result == REJECT) { - printf("Unexpected success to load!\n"); - printf("%s", bpf_vlog); - ret = 1; - close(fd_prog); - } else { - ret = 0; - /* We make a local copy so that we can strtok() it */ - strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy)); - line_ptr = strtok(bpf_vlog_copy, "\n"); - for (i = 0; i < MAX_MATCHES; i++) { - struct bpf_reg_match m = test->matches[i]; - - if (!m.match) - break; - while (line_ptr) { - cur_line = -1; - sscanf(line_ptr, "%u: ", &cur_line); - if (cur_line == m.line) - break; - line_ptr = strtok(NULL, "\n"); - } - if (!line_ptr) { - printf("Failed to find line %u for match: %s\n", - m.line, m.match); - ret = 1; - printf("%s", bpf_vlog); - break; - } - if (!strstr(line_ptr, m.match)) { - printf("Failed to find match %u: %s\n", - m.line, m.match); - ret = 1; - printf("%s", bpf_vlog); - break; - } - } - if (fd_prog >= 0) - close(fd_prog); - } - return ret; -} - -static int do_test(unsigned int from, unsigned int to) -{ - int all_pass = 0; - int all_fail = 0; - unsigned int i; - - for (i = from; i < to; i++) { - struct bpf_align_test *test = &tests[i]; - int fail; - - printf("Test %3d: %s ... ", - i, test->descr); - fail = do_test_single(test); - if (fail) { - all_fail++; - printf("FAIL\n"); - } else { - all_pass++; - printf("PASS\n"); - } - } - printf("Results: %d pass %d fail\n", - all_pass, all_fail); - return all_fail ? 
EXIT_FAILURE : EXIT_SUCCESS; -} - -int main(int argc, char **argv) -{ - unsigned int from = 0, to = ARRAY_SIZE(tests); - - if (argc == 3) { - unsigned int l = atoi(argv[argc - 2]); - unsigned int u = atoi(argv[argc - 1]); - - if (l < to && u < to) { - from = l; - to = u + 1; - } - } else if (argc == 2) { - unsigned int t = atoi(argv[argc - 1]); - - if (t < to) { - from = t; - to = t + 1; - } - } - return do_test(from, to); -} -- cgit v1.2.3 From 991e35eebe1e90ffc1c75105286a50e627b56dd1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:13:09 -0700 Subject: bpf: Selftests, move sockmap bpf prog header into progs Moves test_sockmap_kern.h into progs directory but does not change code at all. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939718921.15176.5766299102332077086.stgit@john-Precision-5820-Tower --- .../selftests/bpf/progs/test_sockmap_kern.h | 451 +++++++++++++++++++++ tools/testing/selftests/bpf/test_sockmap_kern.h | 451 --------------------- 2 files changed, 451 insertions(+), 451 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_kern.h delete mode 100644 tools/testing/selftests/bpf/test_sockmap_kern.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h new file mode 100644 index 000000000000..9b4d3a68a91a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -0,0 +1,451 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */ +#include <stddef.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/pkt_cls.h> +#include <sys/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +/* Sockmap sample program connects a client and a backend together + * using cgroups. 
+ * + * client:X <---> frontend:80 client:X <---> backend:80 + * + * For simplicity we hard code values here and bind 1:1. The hard + * coded values are part of the setup in sockmap.sh script that + * is associated with this BPF program. + * + * The bpf_printk is verbose and prints information as connections + * are established and verdicts are decided. + */ + +struct { + __uint(type, TEST_MAP_TYPE); + __uint(max_entries, 20); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} sock_map SEC(".maps"); + +struct { + __uint(type, TEST_MAP_TYPE); + __uint(max_entries, 20); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} sock_map_txmsg SEC(".maps"); + +struct { + __uint(type, TEST_MAP_TYPE); + __uint(max_entries, 20); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} sock_map_redir SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_apply_bytes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_cork_bytes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 6); + __type(key, int); + __type(value, int); +} sock_bytes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_redir_flags SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_skb_opts SEC(".maps"); + +SEC("sk_skb1") +int bpf_prog1(struct __sk_buff *skb) +{ + return skb->len; +} + +SEC("sk_skb2") +int bpf_prog2(struct __sk_buff *skb) +{ + __u32 lport = skb->local_port; + __u32 rport = skb->remote_port; + int len, *f, ret, zero = 0; + __u64 flags = 0; + + if (lport == 10000) + ret = 10; + else + ret = 1; + + len = (__u32)skb->data_end - (__u32)skb->data; + f = 
bpf_map_lookup_elem(&sock_skb_opts, &zero); + if (f && *f) { + ret = 3; + flags = *f; + } + + bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", + len, flags); +#ifdef SOCKMAP + return bpf_sk_redirect_map(skb, &sock_map, ret, flags); +#else + return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags); +#endif + +} + +SEC("sockops") +int bpf_sockmap(struct bpf_sock_ops *skops) +{ + __u32 lport, rport; + int op, err = 0, index, key, ret; + + + op = (int) skops->op; + + switch (op) { + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (lport == 10000) { + ret = 1; +#ifdef SOCKMAP + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#else + err = bpf_sock_hash_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#endif + bpf_printk("passive(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (bpf_ntohl(rport) == 10001) { + ret = 10; +#ifdef SOCKMAP + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#else + err = bpf_sock_hash_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#endif + bpf_printk("active(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + default: + break; + } + + return 0; +} + +SEC("sk_msg1") +int bpf_prog4(struct sk_msg_md *msg) +{ + int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; + int *start, *end, *start_push, *end_push, *start_pop, *pop; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_bytes, &zero); + end = bpf_map_lookup_elem(&sock_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + start_push = bpf_map_lookup_elem(&sock_bytes, &two); + end_push = 
bpf_map_lookup_elem(&sock_bytes, &three); + if (start_push && end_push) + bpf_msg_push_data(msg, *start_push, *end_push, 0); + start_pop = bpf_map_lookup_elem(&sock_bytes, &four); + pop = bpf_map_lookup_elem(&sock_bytes, &five); + if (start_pop && pop) + bpf_msg_pop_data(msg, *start_pop, *pop, 0); + return SK_PASS; +} + +SEC("sk_msg2") +int bpf_prog5(struct sk_msg_md *msg) +{ + int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; + int *start, *end, *start_push, *end_push, *start_pop, *pop; + int *bytes, len1, len2 = 0, len3, len4; + int err1 = -1, err2 = -1; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + len1 = (__u64)msg->data_end - (__u64)msg->data; + start = bpf_map_lookup_elem(&sock_bytes, &zero); + end = bpf_map_lookup_elem(&sock_bytes, &one); + if (start && end) { + int err; + + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? *end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } + + start_push = bpf_map_lookup_elem(&sock_bytes, &two); + end_push = bpf_map_lookup_elem(&sock_bytes, &three); + if (start_push && end_push) { + int err; + + bpf_printk("sk_msg2: push(%i:%i)\n", + start_push ? *start_push : 0, + end_push ? *end_push : 0); + err = bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (err) + bpf_printk("sk_msg2: push_data err %i\n", err); + len3 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length push_update %i->%i\n", + len2 ? 
len2 : len1, len3); + } + start_pop = bpf_map_lookup_elem(&sock_bytes, &four); + pop = bpf_map_lookup_elem(&sock_bytes, &five); + if (start_pop && pop) { + int err; + + bpf_printk("sk_msg2: pop(%i@%i)\n", + start_pop, pop); + err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); + if (err) + bpf_printk("sk_msg2: pop_data err %i\n", err); + len4 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length pop_data %i->%i\n", + len1 ? len1 : 0, len4); + } + + bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", + len1, err1, err2); + return SK_PASS; +} + +SEC("sk_msg3") +int bpf_prog6(struct sk_msg_md *msg) +{ + int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; + int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; + __u64 flags = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + + start = bpf_map_lookup_elem(&sock_bytes, &zero); + end = bpf_map_lookup_elem(&sock_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + + start_push = bpf_map_lookup_elem(&sock_bytes, &two); + end_push = bpf_map_lookup_elem(&sock_bytes, &three); + if (start_push && end_push) + bpf_msg_push_data(msg, *start_push, *end_push, 0); + + start_pop = bpf_map_lookup_elem(&sock_bytes, &four); + pop = bpf_map_lookup_elem(&sock_bytes, &five); + if (start_pop && pop) + bpf_msg_pop_data(msg, *start_pop, *pop, 0); + + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } +#ifdef SOCKMAP + return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); +#else + return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); +#endif +} + +SEC("sk_msg4") +int bpf_prog7(struct sk_msg_md *msg) +{ + int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; + int zero = 0, one = 1, two = 2, three = 3, four = 4, 
five = 5; + int len1, len2 = 0, len3, len4; + int err1 = 0, err2 = 0, key = 0; + __u64 flags = 0; + + int err; + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + len1 = (__u64)msg->data_end - (__u64)msg->data; + + start = bpf_map_lookup_elem(&sock_bytes, &zero); + end = bpf_map_lookup_elem(&sock_bytes, &one); + if (start && end) { + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? *end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } + + start_push = bpf_map_lookup_elem(&sock_bytes, &two); + end_push = bpf_map_lookup_elem(&sock_bytes, &three); + if (start_push && end_push) { + bpf_printk("sk_msg4: push(%i:%i)\n", + start_push ? *start_push : 0, + end_push ? *end_push : 0); + err = bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (err) + bpf_printk("sk_msg4: push_data err %i\n", + err); + len3 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg4: length push_update %i->%i\n", + len2 ? len2 : len1, len3); + } + + start_pop = bpf_map_lookup_elem(&sock_bytes, &four); + pop = bpf_map_lookup_elem(&sock_bytes, &five); + if (start_pop && pop) { + int err; + + bpf_printk("sk_msg4: pop(%i@%i)\n", + start_pop, pop); + err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); + if (err) + bpf_printk("sk_msg4: pop_data err %i\n", err); + len4 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg4: length pop_data %i->%i\n", + len1 ? len1 : 0, len4); + } + + + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } + bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", + len1, flags, err1 ? 
err1 : err2); +#ifdef SOCKMAP + err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); +#else + err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); +#endif + bpf_printk("sk_msg3: err %i\n", err); + return err; +} + +SEC("sk_msg5") +int bpf_prog8(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) { + ret = bpf_msg_apply_bytes(msg, *bytes); + if (ret) + return SK_DROP; + } else { + return SK_DROP; + } + return SK_PASS; +} +SEC("sk_msg6") +int bpf_prog9(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) { + if (((__u64)data_end - (__u64)data) >= *bytes) + return SK_PASS; + ret = bpf_msg_cork_bytes(msg, *bytes); + if (ret) + return SK_DROP; + } + return SK_PASS; +} + +SEC("sk_msg7") +int bpf_prog10(struct sk_msg_md *msg) +{ + int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; + int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_bytes, &zero); + end = bpf_map_lookup_elem(&sock_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + start_push = bpf_map_lookup_elem(&sock_bytes, &two); + end_push = bpf_map_lookup_elem(&sock_bytes, &three); + if (start_push && end_push) + bpf_msg_push_data(msg, *start_push, *end_push, 0); + start_pop = bpf_map_lookup_elem(&sock_bytes, &four); + pop = bpf_map_lookup_elem(&sock_bytes, &five); + if (start_pop && pop) + bpf_msg_pop_data(msg, *start_pop, *pop, 0); + bpf_printk("return sk drop\n"); + 
return SK_DROP; +} + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h deleted file mode 100644 index 9b4d3a68a91a..000000000000 --- a/tools/testing/selftests/bpf/test_sockmap_kern.h +++ /dev/null @@ -1,451 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */ -#include <stddef.h> -#include <string.h> -#include <linux/bpf.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/in.h> -#include <linux/udp.h> -#include <linux/tcp.h> -#include <linux/pkt_cls.h> -#include <sys/socket.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_endian.h> - -/* Sockmap sample program connects a client and a backend together - * using cgroups. - * - * client:X <---> frontend:80 client:X <---> backend:80 - * - * For simplicity we hard code values here and bind 1:1. The hard - * coded values are part of the setup in sockmap.sh script that - * is associated with this BPF program. - * - * The bpf_printk is verbose and prints information as connections - * are established and verdicts are decided. 
- */ - -struct { - __uint(type, TEST_MAP_TYPE); - __uint(max_entries, 20); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); -} sock_map SEC(".maps"); - -struct { - __uint(type, TEST_MAP_TYPE); - __uint(max_entries, 20); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); -} sock_map_txmsg SEC(".maps"); - -struct { - __uint(type, TEST_MAP_TYPE); - __uint(max_entries, 20); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); -} sock_map_redir SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, int); - __type(value, int); -} sock_apply_bytes SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, int); - __type(value, int); -} sock_cork_bytes SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 6); - __type(key, int); - __type(value, int); -} sock_bytes SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, int); - __type(value, int); -} sock_redir_flags SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, int); - __type(value, int); -} sock_skb_opts SEC(".maps"); - -SEC("sk_skb1") -int bpf_prog1(struct __sk_buff *skb) -{ - return skb->len; -} - -SEC("sk_skb2") -int bpf_prog2(struct __sk_buff *skb) -{ - __u32 lport = skb->local_port; - __u32 rport = skb->remote_port; - int len, *f, ret, zero = 0; - __u64 flags = 0; - - if (lport == 10000) - ret = 10; - else - ret = 1; - - len = (__u32)skb->data_end - (__u32)skb->data; - f = bpf_map_lookup_elem(&sock_skb_opts, &zero); - if (f && *f) { - ret = 3; - flags = *f; - } - - bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", - len, flags); -#ifdef SOCKMAP - return bpf_sk_redirect_map(skb, &sock_map, ret, flags); -#else - return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags); -#endif - -} - -SEC("sockops") -int bpf_sockmap(struct bpf_sock_ops *skops) -{ - 
__u32 lport, rport; - int op, err = 0, index, key, ret; - - - op = (int) skops->op; - - switch (op) { - case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: - lport = skops->local_port; - rport = skops->remote_port; - - if (lport == 10000) { - ret = 1; -#ifdef SOCKMAP - err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST); -#else - err = bpf_sock_hash_update(skops, &sock_map, &ret, - BPF_NOEXIST); -#endif - bpf_printk("passive(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); - } - break; - case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: - lport = skops->local_port; - rport = skops->remote_port; - - if (bpf_ntohl(rport) == 10001) { - ret = 10; -#ifdef SOCKMAP - err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST); -#else - err = bpf_sock_hash_update(skops, &sock_map, &ret, - BPF_NOEXIST); -#endif - bpf_printk("active(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); - } - break; - default: - break; - } - - return 0; -} - -SEC("sk_msg1") -int bpf_prog4(struct sk_msg_md *msg) -{ - int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - bpf_msg_cork_bytes(msg, *bytes); - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) - bpf_msg_pull_data(msg, *start, *end, 0); - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) - bpf_msg_push_data(msg, *start_push, *end_push, 0); - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) - bpf_msg_pop_data(msg, *start_pop, *pop, 0); - return SK_PASS; -} - -SEC("sk_msg2") -int bpf_prog5(struct sk_msg_md *msg) -{ - int 
zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; - int *bytes, len1, len2 = 0, len3, len4; - int err1 = -1, err2 = -1; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - int err; - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - int err; - - bpf_printk("sk_msg2: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg2: push_data err %i\n", err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg2: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg2: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length pop_data %i->%i\n", - len1 ? 
len1 : 0, len4); - } - - bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - len1, err1, err2); - return SK_PASS; -} - -SEC("sk_msg3") -int bpf_prog6(struct sk_msg_md *msg) -{ - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; - __u64 flags = 0; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - bpf_msg_cork_bytes(msg, *bytes); - - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) - bpf_msg_pull_data(msg, *start, *end, 0); - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) - bpf_msg_push_data(msg, *start_push, *end_push, 0); - - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) - bpf_msg_pop_data(msg, *start_pop, *pop, 0); - - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } -#ifdef SOCKMAP - return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -#else - return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); -#endif -} - -SEC("sk_msg4") -int bpf_prog7(struct sk_msg_md *msg) -{ - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int len1, len2 = 0, len3, len4; - int err1 = 0, err2 = 0, key = 0; - __u64 flags = 0; - - int err; - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = 
bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - bpf_printk("sk_msg4: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg4: push_data err %i\n", - err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg4: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg4: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", - len1, flags, err1 ? 
err1 : err2); -#ifdef SOCKMAP - err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -#else - err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); -#endif - bpf_printk("sk_msg3: err %i\n", err); - return err; -} - -SEC("sk_msg5") -int bpf_prog8(struct sk_msg_md *msg) -{ - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - int ret = 0, *bytes, zero = 0; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) { - ret = bpf_msg_apply_bytes(msg, *bytes); - if (ret) - return SK_DROP; - } else { - return SK_DROP; - } - return SK_PASS; -} -SEC("sk_msg6") -int bpf_prog9(struct sk_msg_md *msg) -{ - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - int ret = 0, *bytes, zero = 0; - - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) { - if (((__u64)data_end - (__u64)data) >= *bytes) - return SK_PASS; - ret = bpf_msg_cork_bytes(msg, *bytes); - if (ret) - return SK_DROP; - } - return SK_PASS; -} - -SEC("sk_msg7") -int bpf_prog10(struct sk_msg_md *msg) -{ - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - bpf_msg_cork_bytes(msg, *bytes); - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) - bpf_msg_pull_data(msg, *start, *end, 0); - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) - bpf_msg_push_data(msg, *start_push, *end_push, 0); - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) - bpf_msg_pop_data(msg, *start_pop, *pop, 0); - bpf_printk("return sk drop\n"); - 
return SK_DROP; -} - -int _version SEC("version") = 1; -char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From d79a32129b21296f1dce1bd9d703826853bb63a6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:13:27 -0700 Subject: bpf: Selftests, remove prints from sockmap tests The prints in the test_sockmap programs were only useful when we didn't have enough control over test infrastructure to know from user program what was being pushed into kernel side. Now that we have or will shortly have better test controls let's remove the printers. This means we can remove half the programs and clean up the bpf side. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939720756.15176.9806965887313279429.stgit@john-Precision-5820-Tower --- .../selftests/bpf/progs/test_sockmap_kern.h | 158 +-------------------- tools/testing/selftests/bpf/test_sockmap.c | 25 +--- 2 files changed, 9 insertions(+), 174 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 9b4d3a68a91a..a443d3637db3 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -110,8 +110,6 @@ int bpf_prog2(struct __sk_buff *skb) flags = *f; } - bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", - len, flags); #ifdef SOCKMAP return bpf_sk_redirect_map(skb, &sock_map, ret, flags); #else @@ -143,8 +141,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("passive(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: @@ -160,8 +156,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("active(%i -> %i) map ctx update err: %d\n", - 
lport, bpf_ntohl(rport), err); } break; default: @@ -199,72 +193,6 @@ int bpf_prog4(struct sk_msg_md *msg) } SEC("sk_msg2") -int bpf_prog5(struct sk_msg_md *msg) -{ - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; - int *bytes, len1, len2 = 0, len3, len4; - int err1 = -1, err2 = -1; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - int err; - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - int err; - - bpf_printk("sk_msg2: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg2: push_data err %i\n", err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length push_update %i->%i\n", - len2 ? 
len2 : len1, len3); - } - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg2: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg2: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - len1, err1, err2); - return SK_PASS; -} - -SEC("sk_msg3") int bpf_prog6(struct sk_msg_md *msg) { int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; @@ -305,86 +233,7 @@ int bpf_prog6(struct sk_msg_md *msg) #endif } -SEC("sk_msg4") -int bpf_prog7(struct sk_msg_md *msg) -{ - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int len1, len2 = 0, len3, len4; - int err1 = 0, err2 = 0, key = 0; - __u64 flags = 0; - - int err; - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - bpf_printk("sk_msg4: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? 
*end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg4: push_data err %i\n", - err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg4: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg4: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", - len1, flags, err1 ? err1 : err2); -#ifdef SOCKMAP - err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -#else - err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); -#endif - bpf_printk("sk_msg3: err %i\n", err); - return err; -} - -SEC("sk_msg5") +SEC("sk_msg3") int bpf_prog8(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -401,7 +250,7 @@ int bpf_prog8(struct sk_msg_md *msg) } return SK_PASS; } -SEC("sk_msg6") +SEC("sk_msg4") int bpf_prog9(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -419,7 +268,7 @@ int bpf_prog9(struct sk_msg_md *msg) return SK_PASS; } -SEC("sk_msg7") +SEC("sk_msg5") int bpf_prog10(struct sk_msg_md *msg) { int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; @@ -443,7 +292,6 @@ int bpf_prog10(struct sk_msg_md *msg) pop = bpf_map_lookup_elem(&sock_bytes, &five); if (start_pop && pop) bpf_msg_pop_data(msg, *start_pop, *pop, 0); - bpf_printk("return sk drop\n"); return SK_DROP; } diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 
779e11da979c..6bdacc4f04d8 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -68,9 +68,7 @@ struct bpf_map *maps[8]; int prog_fd[11]; int txmsg_pass; -int txmsg_noisy; int txmsg_redir; -int txmsg_redir_noisy; int txmsg_drop; int txmsg_apply; int txmsg_cork; @@ -95,9 +93,7 @@ static const struct option long_options[] = { {"test", required_argument, NULL, 't' }, {"data_test", no_argument, NULL, 'd' }, {"txmsg", no_argument, &txmsg_pass, 1 }, - {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, - {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {"txmsg_drop", no_argument, &txmsg_drop, 1 }, {"txmsg_apply", required_argument, NULL, 'a'}, {"txmsg_cork", required_argument, NULL, 'k'}, @@ -834,19 +830,14 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) tx_prog_fd = prog_fd[3]; - else if (txmsg_noisy) - tx_prog_fd = prog_fd[4]; else if (txmsg_redir) + tx_prog_fd = prog_fd[4]; + else if (txmsg_apply) tx_prog_fd = prog_fd[5]; - else if (txmsg_redir_noisy) + else if (txmsg_cork) tx_prog_fd = prog_fd[6]; else if (txmsg_drop) - tx_prog_fd = prog_fd[9]; - /* apply and cork must be last */ - else if (txmsg_apply) tx_prog_fd = prog_fd[7]; - else if (txmsg_cork) - tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -870,7 +861,7 @@ run: goto out; } - if (txmsg_redir || txmsg_redir_noisy) + if (txmsg_redir) redir_fd = c2; else redir_fd = c1; @@ -1112,12 +1103,8 @@ static void test_options(char *options) if (txmsg_pass) strncat(options, "pass,", OPTSTRING); - if (txmsg_noisy) - strncat(options, "pass_noisy,", OPTSTRING); if (txmsg_redir) strncat(options, "redir,", OPTSTRING); - if (txmsg_redir_noisy) - strncat(options, "redir_noisy,", OPTSTRING); if (txmsg_drop) strncat(options, "drop,", OPTSTRING); if (txmsg_apply) { @@ -1228,7 +1215,7 @@ static int test_txmsg(int cgrp) { int err; - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; + txmsg_pass = 
txmsg_drop = 0; txmsg_apply = txmsg_cork = 0; txmsg_ingress = txmsg_skb = 0; @@ -1319,7 +1306,7 @@ static int test_mixed(int cgrp) struct sockmap_options opt = {0}; int err; - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; + txmsg_pass = txmsg_drop = 0; txmsg_apply = txmsg_cork = 0; txmsg_start = txmsg_end = 0; txmsg_start_push = txmsg_end_push = 0; -- cgit v1.2.3 From 13a5f3ffd202f73f1d0c2ed36dd66a0cd891e61a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:13:46 -0700 Subject: bpf: Selftests, sockmap test prog run without setting cgroup Running test_sockmap with arguments to specify a test pattern requires including a cgroup argument. Instead of requiring this, create one if the option is not provided. This is not used by selftest runs but I use it when I want to test a specific test. Most useful when developing new code and/or tests. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939722675.15176.6294210959489131688.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 6bdacc4f04d8..5ef71feb65ce 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1725,6 +1725,7 @@ int main(int argc, char **argv) int opt, longindex, err, cg_fd = 0; char *bpf_file = BPF_SOCKMAP_FILENAME; int test = PING_PONG; + bool cg_created = 0; if (argc < 2) return test_suite(-1); @@ -1805,13 +1806,25 @@ int main(int argc, char **argv) } } - if (argc <= 3 && cg_fd) - return test_suite(cg_fd); - if (!cg_fd) { - fprintf(stderr, "%s requires cgroup option: --cgroup \n", - argv[0]); - return -1; + if (setup_cgroup_environment()) { + fprintf(stderr, "ERROR: cgroup env failed\n"); + return -EINVAL; + 
} + + cg_fd = create_and_get_cgroup(CG_PATH); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, strerror(errno)); + return cg_fd; + } + + if (join_cgroup(CG_PATH)) { + fprintf(stderr, "ERROR: failed to join cgroup\n"); + return -EINVAL; + } + cg_created = 1; } err = populate_progs(bpf_file); @@ -1830,6 +1843,9 @@ int main(int argc, char **argv) options.rate = rate; err = run_options(&options, cg_fd, test); + + if (cg_created) + cleanup_cgroup_environment(); close(cg_fd); return err; } -- cgit v1.2.3 From 248aba1d526e052ee9aba6dd9c5a198e30839cbd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:14:05 -0700 Subject: bpf: Selftests, print error in test_sockmap error cases It's helpful to know the error value if an error occurs. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939724566.15176.12079885932643225626.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 5ef71feb65ce..7f45a8fd8f02 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -341,14 +341,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendfile(fd, fp, NULL, iov_length); + int sent; + + errno = 0; + sent = sendfile(fd, fp, NULL, iov_length); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendpage loop error"); fclose(file); return sent; } else if (drop && sent >= 0) { - printf("sendpage loop error expected: %i\n", sent); + printf("sendpage loop error expected: %i errno %i\n", + sent, errno); fclose(file); return -EIO; } @@ -460,13 +464,18 @@ static int msg_loop(int 
fd, int iov_count, int iov_length, int cnt, if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendmsg(fd, &msg, flags); + int sent; + + errno = 0; + sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendmsg loop error"); goto out_errno; } else if (drop && sent >= 0) { - printf("send loop error expected: %i\n", sent); + fprintf(stderr, + "sendmsg loop error expected: %i errno %i\n", + sent, errno); errno = -EIO; goto out_errno; } @@ -690,14 +699,14 @@ static int sendmsg_test(struct sockmap_options *opt) if (WIFEXITED(rx_status)) { err = WEXITSTATUS(rx_status); if (err) { - fprintf(stderr, "rx thread exited with err %d. ", err); + fprintf(stderr, "rx thread exited with err %d.\n", err); goto out; } } if (WIFEXITED(tx_status)) { err = WEXITSTATUS(tx_status); if (err) - fprintf(stderr, "tx thread exited with err %d. ", err); + fprintf(stderr, "tx thread exited with err %d.\n", err); } out: return err; -- cgit v1.2.3 From 18d4e900a4500c54af56b9ad39f4d3b378eb0661 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:14:25 -0700 Subject: bpf: Selftests, improve test_sockmap total bytes counter The recv thread in test_sockmap waits to receive all bytes from sender but in the case we use pop data it may wait for more bytes than are actually being sent. This stalls the test harness for multiple seconds. Because this happens in multiple tests it slows time to run the selftest. Fix by doing a better job of accounting for total bytes when pop helpers are used. 
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939726542.15176.5964532245173539540.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 7f45a8fd8f02..9a7e10424584 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -502,9 +502,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, * paths. */ total_bytes = (float)iov_count * (float)iov_length * (float)cnt; - txmsg_pop_total = txmsg_pop; if (txmsg_apply) - txmsg_pop_total *= (total_bytes / txmsg_apply); + txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply); + else + txmsg_pop_total = txmsg_pop * cnt; total_bytes -= txmsg_pop_total; err = clock_gettime(CLOCK_MONOTONIC, &s->start); if (err < 0) @@ -638,8 +639,12 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { + iov_buf -= (txmsg_pop - txmsg_start_pop + 1); if (opt->drop_expected) - exit(0); + _exit(0); + + if (!iov_buf) /* zero bytes sent case */ + _exit(0); if (opt->sendpage) iov_count = 1; -- cgit v1.2.3 From 328aa08a081bb94f9aba506363186de6ec3382ec Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:14:44 -0700 Subject: bpf: Selftests, break down test_sockmap into subtests At the moment test_sockmap runs all 800+ tests ungrouped which is not ideal because it makes it hard to see what is failing but also more importantly it's hard to confirm all cases are tested. Additionally, after inspecting we noticed the runtime is bloated because we run many duplicate tests. Worse some of these tests are known error cases that wait for the recvmsg handler to time out which creates long delays. 
Also we noted some tests were not clearing their options and as a result the following tests would run with extra and incorrect options. Fix this by reorganizing test code so it's clear what tests are running and when. Then it becomes easy to remove duplication and run tests with only the set of send/recv patterns that are relevant. To accomplish this break test_sockmap into subtests and remove unnecessary duplication. The output is more readable now and the runtime reduced. Now default output prints subtests like this, $ ./test_sockmap # 1/ 6 sockmap:txmsg test passthrough:OK ... #22/ 1 sockhash:txmsg test push/pop data:OK Pass: 22 Fail: 0 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939728384.15176.13601520183665880762.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 723 ++++++++++++++--------------- 1 file changed, 348 insertions(+), 375 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 9a7e10424584..ad0540acc0eb 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -54,7 +54,7 @@ static void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" #define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o" #define CG_PATH "/sockmap" @@ -110,6 +110,76 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +struct test_env { + const char *type; + const char *subtest; + + int test_num; + int subtest_num; + + int succ_cnt; + int fail_cnt; + int fail_last; +}; + +struct test_env env; + +static void test_start(void) +{ + env.subtest_num++; +} + +static void test_fail(void) +{ + env.fail_cnt++; +} + +static void test_pass(void) +{ + env.succ_cnt++; +} + +static void test_reset(void) +{ + 
txmsg_start = txmsg_end = 0; + txmsg_start_pop = txmsg_pop = 0; + txmsg_start_push = txmsg_end_push = 0; + txmsg_pass = txmsg_drop = txmsg_redir = 0; + txmsg_apply = txmsg_cork = 0; + txmsg_ingress = txmsg_skb = 0; +} + +static int test_start_subtest(const char *name, const char *type) +{ + env.type = type; + env.subtest = name; + env.test_num++; + env.subtest_num = 0; + env.fail_last = env.fail_cnt; + test_reset(); + return 0; +} + +static void test_end_subtest(void) +{ + int error = env.fail_cnt - env.fail_last; + int type = strcmp(env.type, BPF_SOCKMAP_FILENAME); + + if (!error) + test_pass(); + + fprintf(stdout, "#%2d/%2d %8s:%s:%s\n", + env.test_num, env.subtest_num, + !type ? "sockmap" : "sockhash", + env.subtest, error ? "FAIL" : "OK"); +} + +static void test_print_results(void) +{ + fprintf(stdout, "Pass: %d Fail: %d\n", + env.succ_cnt, env.fail_cnt); +} + static void usage(char *argv[]) { int i; @@ -316,6 +386,7 @@ struct sockmap_options { int iov_count; int iov_length; int rate; + char *map; }; static int msg_loop_sendpage(int fd, int iov_length, int cnt, @@ -1169,416 +1240,305 @@ static int __test_exec(int cgrp, int test, struct sockmap_options *opt) test_options(options); - fprintf(stdout, - "[TEST %i]: (%i, %i, %i, %s, %s): ", - test_cnt, opt->rate, opt->iov_count, opt->iov_length, - test_to_str(test), options); - fflush(stdout); + if (opt->verbose) { + fprintf(stdout, + "[TEST %i]: (%i, %i, %i, %s, %s): ", + test_cnt, opt->rate, opt->iov_count, opt->iov_length, + test_to_str(test), options); + fflush(stdout); + } err = run_options(opt, cgrp, test); - fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED"); + if (opt->verbose) + fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED"); test_cnt++; !err ? 
passed++ : failed++; free(options); return err; } -static int test_exec(int cgrp, struct sockmap_options *opt) -{ - int err = __test_exec(cgrp, SENDMSG, opt); - - if (err) - goto out; - - err = __test_exec(cgrp, SENDPAGE, opt); -out: - return err; -} - -static int test_loop(int cgrp) -{ - struct sockmap_options opt; - - int err, i, l, r; - - opt.verbose = 0; - opt.base = false; - opt.sendpage = false; - opt.data_test = false; - opt.drop_expected = false; - opt.iov_count = 0; - opt.iov_length = 0; - opt.rate = 0; - - r = 1; - for (i = 1; i < 100; i += 33) { - for (l = 1; l < 100; l += 33) { - opt.rate = r; - opt.iov_count = i; - opt.iov_length = l; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - } - sched_yield(); -out: - return err; -} - -static int test_txmsg(int cgrp) +static void test_exec(int cgrp, struct sockmap_options *opt) { + int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME); int err; - txmsg_pass = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_skb = 0; - - txmsg_pass = 1; - err = test_loop(cgrp); - txmsg_pass = 0; - if (err) - goto out; - - txmsg_redir = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - if (err) - goto out; - - txmsg_drop = 1; - err = test_loop(cgrp); - txmsg_drop = 0; - if (err) - goto out; - - txmsg_redir = 1; - txmsg_ingress = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - txmsg_ingress = 0; - if (err) - goto out; -out: - txmsg_pass = 0; - txmsg_redir = 0; - txmsg_drop = 0; - return err; + if (type == 0) { + test_start(); + err = __test_exec(cgrp, SENDMSG, opt); + if (err) + test_fail(); + } else { + test_start(); + err = __test_exec(cgrp, SENDPAGE, opt); + if (err) + test_fail(); + } } -static int test_send(struct sockmap_options *opt, int cgrp) +static void test_send_one(struct sockmap_options *opt, int cgrp) { - int err; - opt->iov_length = 1; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1; opt->iov_count = 
1024; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1024; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); - opt->iov_length = 1; +} + +static void test_send_many(struct sockmap_options *opt, int cgrp) +{ + opt->iov_length = 3; opt->iov_count = 1; opt->rate = 512; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); + + opt->rate = 100; + opt->iov_count = 1; + opt->iov_length = 5; + test_exec(cgrp, opt); +} +static void test_send_large(struct sockmap_options *opt, int cgrp) +{ opt->iov_length = 256; opt->iov_count = 1024; opt->rate = 2; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); +} - opt->rate = 100; - opt->iov_count = 1; - opt->iov_length = 5; - err = test_exec(cgrp, opt); - if (err) - goto out; -out: +static void test_send(struct sockmap_options *opt, int cgrp) +{ + test_send_one(opt, cgrp); + test_send_many(opt, cgrp); + test_send_large(opt, cgrp); sched_yield(); - return err; } -static int test_mixed(int cgrp) +static void test_txmsg_pass(int cgrp, char *map) { - struct sockmap_options opt = {0}; - int err; - - txmsg_pass = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_start = txmsg_end = 0; - txmsg_start_push = txmsg_end_push = 0; - txmsg_start_pop = txmsg_pop = 0; + struct sockmap_options opt = {.map = map}; /* Test small and large iov_count values with pass/redir/apply/cork */ txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + test_send(&opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_redir(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - 
goto out; + txmsg_redir = 1; + test_send(&opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_drop(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_drop = 1; + test_send(&opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_ingress_redir(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; + + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 1; + test_send(&opt, cgrp); +} + +/* Test cork with hung data. This tests poor usage patterns where + * cork can leave data on the ring if user program is buggy and + * doesn't flush them somehow. They do take some time however + * because they wait for a timeout. Test pass, redir and cork with + * apply logic. Use cork size of 4097 with send_large to avoid + * aligning cork size with send size. 
+ */ +static void test_txmsg_cork_hangs(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; txmsg_pass = 1; txmsg_redir = 0; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; - - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 4097; + txmsg_apply = 4097; + test_send_large(&opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 4097; + test_send_large(&opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_apply = 4097; + txmsg_cork = 4097; + test_send_large(&opt, cgrp); +} - txmsg_pass = 0; +static void test_txmsg_pull(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; + + /* Test basic start/end */ + txmsg_start = 1; + txmsg_end = 2; + test_send(&opt, cgrp); + + /* Test >4k pull */ + txmsg_start = 4096; + txmsg_end = 9182; + test_send_large(&opt, cgrp); + + /* Test pull + redirect */ + txmsg_redir = 0; + txmsg_start = 1; + txmsg_end = 2; + test_send(&opt, cgrp); + + /* Test pull + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(&opt, cgrp); + + /* Test pull + cork + redirect */ txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(&opt, cgrp); +} - txmsg_pass = 0; +static void test_txmsg_pop(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; + + /* Test basic pop */ + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(&opt, cgrp); + + /* Test pop with >4k */ + txmsg_start_pop = 4096; + txmsg_pop = 4096; + test_send_large(&opt, cgrp); + + /* Test pop + redirect */ txmsg_redir = 1; - txmsg_apply = 1024; - 
txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(&opt, cgrp); - txmsg_pass = 0; + /* Test pop + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(&opt, cgrp); + + /* Test pop + redirect + cork */ txmsg_redir = 1; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; -out: - return err; + txmsg_cork = 4; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(&opt, cgrp); } -static int test_start_end(int cgrp) +static void test_txmsg_push(int cgrp, char *map) { - struct sockmap_options opt = {0}; - int err, i; + struct sockmap_options opt = {.map = map}; - /* Test basic start/end with lots of iov_count and iov_lengths */ - txmsg_start = 1; - txmsg_end = 2; + /* Test basic push */ + txmsg_start_push = 1; + txmsg_end_push = 1; + test_send(&opt, cgrp); + + /* Test push 4kB >4k */ + txmsg_start_push = 4096; + txmsg_end_push = 4096; + test_send_large(&opt, cgrp); + + /* Test push + redirect */ + txmsg_redir = 1; txmsg_start_push = 1; txmsg_end_push = 2; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + test_send_many(&opt, cgrp); - /* Cut a byte of pushed data but leave reamining in place */ - txmsg_start = 1; - txmsg_end = 2; + /* Test push + cork */ + txmsg_redir = 0; + txmsg_cork = 512; txmsg_start_push = 1; - txmsg_end_push = 3; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + txmsg_end_push = 2; + test_send_many(&opt, cgrp); +} - /* Test start/end with cork */ - opt.rate = 16; - opt.iov_count = 1; - opt.iov_length = 100; - txmsg_cork = 1600; - - txmsg_start_pop = 0; - txmsg_pop = 0; - - for (i = 99; i <= 1600; i += 500) { - txmsg_start = 0; - txmsg_end = i; - txmsg_start_push = 0; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } +static void test_txmsg_push_pop(int cgrp, char 
*map) +{ + struct sockmap_options opt = {.map = map}; - /* Test pop data in middle of cork */ - for (i = 99; i <= 1600; i += 500) { - txmsg_start_pop = 10; - txmsg_pop = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end with cork but pull data in middle */ - for (i = 199; i <= 1600; i += 500) { - txmsg_start = 100; - txmsg_end = i; - txmsg_start_push = 100; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } + txmsg_start_push = 1; + txmsg_end_push = 10; + txmsg_start_pop = 5; + txmsg_pop = 4; + test_send_large(&opt, cgrp); +} - /* Test start/end with cork pulling last sg entry */ - txmsg_start = 1500; - txmsg_end = 1600; - txmsg_start_push = 1500; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_apply(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; - /* Test pop with cork pulling last sg entry */ - txmsg_start_pop = 1500; - txmsg_pop = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end pull of single byte in last page */ - txmsg_start = 1111; - txmsg_end = 1112; - txmsg_start_push = 1111; - txmsg_end_push = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(&opt, cgrp); - /* Test pop of single byte in last page */ - txmsg_start_pop = 1111; - txmsg_pop = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(&opt, cgrp); - /* Test start/end with end < start */ - txmsg_start = 1111; - txmsg_end = 0; - txmsg_start_push = 1111; - txmsg_end_push = 0; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(&opt, cgrp); - /* Test start/end with 
end > data */ - txmsg_start = 0; - txmsg_end = 1601; - txmsg_start_push = 0; - txmsg_end_push = 1601; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(&opt, cgrp); +} - /* Test start/end with start > data */ - txmsg_start = 1601; - txmsg_end = 1600; - txmsg_start_push = 1601; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_cork(int cgrp, char *map) +{ + struct sockmap_options opt = {.map = map}; - /* Test pop with start > data */ - txmsg_start_pop = 1601; - txmsg_pop = 1; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 0; + txmsg_cork = 1; + test_send(&opt, cgrp); - /* Test pop with pop range > data */ - txmsg_start_pop = 1599; - txmsg_pop = 10; - err = test_exec(cgrp, &opt); -out: - txmsg_start = 0; - txmsg_end = 0; - sched_yield(); - return err; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 1; + test_send(&opt, cgrp); } char *map_names[] = { @@ -1663,16 +1623,59 @@ static int populate_progs(char *bpf_file) return 0; } -static int __test_suite(int cg_fd, char *bpf_file) +struct _test { + char *title; + void (*tester)(int cg_fd, char *map); +}; + +struct _test test[] = { + {"txmsg test passthrough", test_txmsg_pass}, + {"txmsg test redirect", test_txmsg_redir}, + {"txmsg test drop", test_txmsg_drop}, + {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test apply", test_txmsg_apply}, + {"txmsg test cork", test_txmsg_cork}, + {"txmsg test hanging corks", test_txmsg_cork_hangs}, + {"txmsg test push_data", test_txmsg_push}, + {"txmsg test pull-data", test_txmsg_pull}, + {"txmsg test pop-data", test_txmsg_pop}, + {"txmsg test push/pop data", test_txmsg_push_pop}, +}; + +static int __test_selftests(int cg_fd, char *map) { - int err, cleanup = cg_fd; + int i, err; - err = populate_progs(bpf_file); + err = 
populate_progs(map); if (err < 0) { fprintf(stderr, "ERROR: (%i) load bpf failed\n", err); return err; } + /* Tests basic commands and APIs */ + for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { + struct _test t = test[i]; + + test_start_subtest(t.title, map); + t.tester(cg_fd, map); + test_end_subtest(); + } + + return err; +} + +static void test_selftests_sockmap(int cg_fd) +{ + __test_selftests(cg_fd, BPF_SOCKMAP_FILENAME); +} + +static void test_selftests_sockhash(int cg_fd) +{ + __test_selftests(cg_fd, BPF_SOCKHASH_FILENAME); +} + +static int test_selftest(int cg_fd) +{ if (cg_fd < 0) { if (setup_cgroup_environment()) { fprintf(stderr, "ERROR: cgroup env failed\n"); @@ -1693,43 +1696,12 @@ static int __test_suite(int cg_fd, char *bpf_file) } } - /* Tests basic commands and APIs with range of iov values */ - txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0; - err = test_txmsg(cg_fd); - if (err) - goto out; - - /* Tests interesting combinations of APIs used together */ - err = test_mixed(cg_fd); - if (err) - goto out; - - /* Tests pull_data API using start/end API */ - err = test_start_end(cg_fd); - if (err) - goto out; - -out: - printf("Summary: %i PASSED %i FAILED\n", passed, failed); - if (cleanup < 0) { - cleanup_cgroup_environment(); - close(cg_fd); - } - return err; -} - -static int test_suite(int cg_fd) -{ - int err; - - err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME); - if (err) - goto out; - err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME); -out: - if (cg_fd > -1) - close(cg_fd); - return err; + test_selftests_sockmap(cg_fd); + test_selftests_sockhash(cg_fd); + cleanup_cgroup_environment(); + close(cg_fd); + test_print_results(); + return 0; } int main(int argc, char **argv) @@ -1741,8 +1713,9 @@ int main(int argc, char **argv) int test = PING_PONG; bool cg_created = 0; - if (argc < 2) - return test_suite(-1); + if (argc < 2) { + return test_selftest(-1); + } while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:", 
long_options, &longindex)) != -1) { -- cgit v1.2.3 From b98ca90c56ee498c751ff5c20b9db8cb64c13fc5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:15:04 -0700 Subject: bpf: Selftests, provide verbose option for selftests execution Pass options from command line args into individual tests which allows us to use verbose option from command line with selftests. Now when verbose option is set individual subtest details will be printed. Also we can consolidate cgroup bring up and tear down. Additionally just setting verbose is very noisy so introduce verbose=1 and verbose=2. Really verbose=2 is only useful when developing tests or debugging some specific issue. For example now we get output like this with --verbose, #20/17 sockhash:txmsg test pull-data:OK [TEST 160]: (512, 1, 3, sendpage, pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 1 cnt 512 err 0 [TEST 161]: (100, 1, 5, sendpage, pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 3 cnt 100 err 0 [TEST 162]: (2, 1024, 256, sendpage, pop (4096,8192),): msg_loop_rx: iov_count 1 iov_buf 255 cnt 2 err 0 [TEST 163]: (512, 1, 3, sendpage, redir,pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 1 cnt 512 err 0 [TEST 164]: (100, 1, 5, sendpage, redir,pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 3 cnt 100 err 0 [TEST 165]: (512, 1, 3, sendpage, cork 512,pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 1 cnt 512 err 0 [TEST 166]: (100, 1, 5, sendpage, cork 512,pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 3 cnt 100 err 0 [TEST 167]: (512, 1, 3, sendpage, redir,cork 4,pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 1 cnt 512 err 0 [TEST 168]: (100, 1, 5, sendpage, redir,cork 4,pop (1,3),): msg_loop_rx: iov_count 1 iov_buf 3 cnt 100 err 0 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939730412.15176.1975675235035143367.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 179 ++++++++++++----------------- 1 file 
changed, 71 insertions(+), 108 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index ad0540acc0eb..2be8d9df152a 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -87,7 +87,7 @@ static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"cgroup", required_argument, NULL, 'c' }, {"rate", required_argument, NULL, 'r' }, - {"verbose", no_argument, NULL, 'v' }, + {"verbose", optional_argument, NULL, 'v' }, {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, @@ -362,7 +362,7 @@ static int sockmap_init_sockets(int verbose) return errno; } - if (verbose) { + if (verbose > 1) { printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); @@ -721,7 +721,7 @@ static int sendmsg_test(struct sockmap_options *opt) iov_count = 1; err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false, opt); - if (opt->verbose) + if (opt->verbose > 1) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); @@ -729,7 +729,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -759,7 +759,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -864,6 +864,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) } enum { + SELFTESTS, PING_PONG, SENDMSG, BASE, @@ -1242,14 +1243,14 @@ static 
int __test_exec(int cgrp, int test, struct sockmap_options *opt) if (opt->verbose) { fprintf(stdout, - "[TEST %i]: (%i, %i, %i, %s, %s): ", + " [TEST %i]: (%i, %i, %i, %s, %s): ", test_cnt, opt->rate, opt->iov_count, opt->iov_length, test_to_str(test), options); fflush(stdout); } err = run_options(opt, cgrp, test); if (opt->verbose) - fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED"); + fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED"); test_cnt++; !err ? passed++ : failed++; free(options); @@ -1322,38 +1323,30 @@ static void test_send(struct sockmap_options *opt, int cgrp) sched_yield(); } -static void test_txmsg_pass(int cgrp, char *map) +static void test_txmsg_pass(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - /* Test small and large iov_count values with pass/redir/apply/cork */ txmsg_pass = 1; - test_send(&opt, cgrp); + test_send(opt, cgrp); } -static void test_txmsg_redir(int cgrp, char *map) +static void test_txmsg_redir(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - txmsg_redir = 1; - test_send(&opt, cgrp); + test_send(opt, cgrp); } -static void test_txmsg_drop(int cgrp, char *map) +static void test_txmsg_drop(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - txmsg_drop = 1; - test_send(&opt, cgrp); + test_send(opt, cgrp); } -static void test_txmsg_ingress_redir(int cgrp, char *map) +static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - txmsg_pass = txmsg_drop = 0; txmsg_ingress = txmsg_redir = 1; - test_send(&opt, cgrp); + test_send(opt, cgrp); } /* Test cork with hung data. This tests poor usage patterns where @@ -1363,182 +1356,168 @@ static void test_txmsg_ingress_redir(int cgrp, char *map) * apply logic. Use cork size of 4097 with send_large to avoid * aligning cork size with send size. 
*/ -static void test_txmsg_cork_hangs(int cgrp, char *map) +static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - txmsg_pass = 1; txmsg_redir = 0; txmsg_cork = 4097; txmsg_apply = 4097; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; txmsg_apply = 0; txmsg_cork = 4097; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; txmsg_apply = 4097; txmsg_cork = 4097; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); } -static void test_txmsg_pull(int cgrp, char *map) +static void test_txmsg_pull(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - /* Test basic start/end */ txmsg_start = 1; txmsg_end = 2; - test_send(&opt, cgrp); + test_send(opt, cgrp); /* Test >4k pull */ txmsg_start = 4096; txmsg_end = 9182; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); /* Test pull + redirect */ txmsg_redir = 0; txmsg_start = 1; txmsg_end = 2; - test_send(&opt, cgrp); + test_send(opt, cgrp); /* Test pull + cork */ txmsg_redir = 0; txmsg_cork = 512; txmsg_start = 1; txmsg_end = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); /* Test pull + cork + redirect */ txmsg_redir = 1; txmsg_cork = 512; txmsg_start = 1; txmsg_end = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); } -static void test_txmsg_pop(int cgrp, char *map) +static void test_txmsg_pop(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - /* Test basic pop */ txmsg_start_pop = 1; txmsg_pop = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); /* Test pop with >4k */ txmsg_start_pop = 4096; txmsg_pop = 4096; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); /* Test pop + redirect */ txmsg_redir = 1; txmsg_start_pop = 1; txmsg_pop = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); /* Test pop + cork */ txmsg_redir = 0; 
txmsg_cork = 512; txmsg_start_pop = 1; txmsg_pop = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); /* Test pop + redirect + cork */ txmsg_redir = 1; txmsg_cork = 4; txmsg_start_pop = 1; txmsg_pop = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); } -static void test_txmsg_push(int cgrp, char *map) +static void test_txmsg_push(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - /* Test basic push */ txmsg_start_push = 1; txmsg_end_push = 1; - test_send(&opt, cgrp); + test_send(opt, cgrp); /* Test push 4kB >4k */ txmsg_start_push = 4096; txmsg_end_push = 4096; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); /* Test push + redirect */ txmsg_redir = 1; txmsg_start_push = 1; txmsg_end_push = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); /* Test push + cork */ txmsg_redir = 0; txmsg_cork = 512; txmsg_start_push = 1; txmsg_end_push = 2; - test_send_many(&opt, cgrp); + test_send_many(opt, cgrp); } -static void test_txmsg_push_pop(int cgrp, char *map) +static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - txmsg_start_push = 1; txmsg_end_push = 10; txmsg_start_pop = 5; txmsg_pop = 4; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); } -static void test_txmsg_apply(int cgrp, char *map) +static void test_txmsg_apply(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - txmsg_pass = 1; txmsg_redir = 0; txmsg_apply = 1; txmsg_cork = 0; - test_send_one(&opt, cgrp); + test_send_one(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; txmsg_apply = 1; txmsg_cork = 0; - test_send_one(&opt, cgrp); + test_send_one(opt, cgrp); txmsg_pass = 1; txmsg_redir = 0; txmsg_apply = 1024; txmsg_cork = 0; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; txmsg_apply = 1024; txmsg_cork = 0; - test_send_large(&opt, cgrp); + test_send_large(opt, cgrp); } -static 
void test_txmsg_cork(int cgrp, char *map) +static void test_txmsg_cork(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {.map = map}; - txmsg_pass = 1; txmsg_redir = 0; txmsg_apply = 0; txmsg_cork = 1; - test_send(&opt, cgrp); + test_send(opt, cgrp); txmsg_pass = 1; txmsg_redir = 0; txmsg_apply = 1; txmsg_cork = 1; - test_send(&opt, cgrp); + test_send(opt, cgrp); } char *map_names[] = { @@ -1625,7 +1604,7 @@ static int populate_progs(char *bpf_file) struct _test { char *title; - void (*tester)(int cg_fd, char *map); + void (*tester)(int cg_fd, struct sockmap_options *opt); }; struct _test test[] = { @@ -1642,11 +1621,11 @@ struct _test test[] = { {"txmsg test push/pop data", test_txmsg_push_pop}, }; -static int __test_selftests(int cg_fd, char *map) +static int __test_selftests(int cg_fd, struct sockmap_options *opt) { int i, err; - err = populate_progs(map); + err = populate_progs(opt->map); if (err < 0) { fprintf(stderr, "ERROR: (%i) load bpf failed\n", err); return err; @@ -1656,50 +1635,31 @@ static int __test_selftests(int cg_fd, char *map) for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { struct _test t = test[i]; - test_start_subtest(t.title, map); - t.tester(cg_fd, map); + test_start_subtest(t.title, opt->map); + t.tester(cg_fd, opt); test_end_subtest(); } return err; } -static void test_selftests_sockmap(int cg_fd) +static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt) { - __test_selftests(cg_fd, BPF_SOCKMAP_FILENAME); + opt->map = BPF_SOCKMAP_FILENAME; + __test_selftests(cg_fd, opt); } -static void test_selftests_sockhash(int cg_fd) +static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt) { - __test_selftests(cg_fd, BPF_SOCKHASH_FILENAME); + opt->map = BPF_SOCKHASH_FILENAME; + __test_selftests(cg_fd, opt); } -static int test_selftest(int cg_fd) +static int test_selftest(int cg_fd, struct sockmap_options *opt) { - if (cg_fd < 0) { - if (setup_cgroup_environment()) { - fprintf(stderr, -
"ERROR: cgroup env failed\n"); - return -EINVAL; - } - - cg_fd = create_and_get_cgroup(CG_PATH); - if (cg_fd < 0) { - fprintf(stderr, - "ERROR: (%i) open cg path failed: %s\n", - cg_fd, optarg); - return cg_fd; - } - if (join_cgroup(CG_PATH)) { - fprintf(stderr, "ERROR: failed to join cgroup\n"); - return -EINVAL; - } - } - - test_selftests_sockmap(cg_fd); - test_selftests_sockhash(cg_fd); - cleanup_cgroup_environment(); - close(cg_fd); + test_selftests_sockmap(cg_fd, opt); + test_selftests_sockhash(cg_fd, opt); test_print_results(); return 0; } @@ -1710,14 +1670,10 @@ int main(int argc, char **argv) struct sockmap_options options = {0}; int opt, longindex, err, cg_fd = 0; char *bpf_file = BPF_SOCKMAP_FILENAME; - int test = PING_PONG; + int test = SELFTESTS; bool cg_created = 0; - if (argc < 2) { - return test_selftest(-1); - } - - while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:", + while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:", long_options, &longindex)) != -1) { switch (opt) { case 's': @@ -1758,6 +1714,8 @@ int main(int argc, char **argv) break; case 'v': options.verbose = 1; + if (optarg) + options.verbose = atoi(optarg); break; case 'i': iov_count = atoi(optarg); @@ -1814,6 +1772,11 @@ int main(int argc, char **argv) cg_created = 1; } + if (test == SELFTESTS) { + err = test_selftest(cg_fd, &options); + goto out; + } + err = populate_progs(bpf_file); if (err) { fprintf(stderr, "populate program: (%s) %s\n", @@ -1830,7 +1793,7 @@ int main(int argc, char **argv) options.rate = rate; err = run_options(&options, cg_fd, test); - +out: if (cg_created) cleanup_cgroup_environment(); close(cg_fd); -- cgit v1.2.3 From 065a74cbd0d0bd7115846d630e141a95a95e1ce1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:15:24 -0700 Subject: bpf: Selftests, add whitelist option to test_sockmap Allow running specific tests with a comma-delimited whitelist. For example, to run all apply and cork tests:
$ ./test_sockmap --whitelist="cork,apply" Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939732464.15176.1959113294944564542.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 31 +++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 2be8d9df152a..1b98e9210d13 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -107,6 +107,7 @@ static const struct option long_options[] = { {"txmsg_skb", no_argument, &txmsg_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"whitelist", required_argument, NULL, 'n' }, {0, 0, NULL, 0 } }; @@ -387,6 +388,7 @@ struct sockmap_options { int iov_length; int rate; char *map; + char *whitelist; }; static int msg_loop_sendpage(int fd, int iov_length, int cnt, @@ -1621,6 +1623,24 @@ struct _test test[] = { {"txmsg test push/pop data", test_txmsg_push_pop}, }; +static int check_whitelist(struct _test *t, struct sockmap_options *opt) +{ + char *entry, *ptr; + + if (!opt->whitelist) + return 0; + ptr = strdup(opt->whitelist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if (strstr(opt->map, entry) != 0 || strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} + static int __test_selftests(int cg_fd, struct sockmap_options *opt) { int i, err; @@ -1635,6 +1655,9 @@ static int __test_selftests(int cg_fd, struct sockmap_options *opt) for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { struct _test t = test[i]; + if (check_whitelist(&t, opt) < 0) + continue; + test_start_subtest(t.title, opt->map); t.tester(cg_fd, opt); test_end_subtest(); @@ -1673,7 +1696,7 @@ int main(int argc, char **argv) int test = SELFTESTS; bool 
cg_created = 0; - while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:", + while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:", long_options, &longindex)) != -1) { switch (opt) { case 's': @@ -1742,6 +1765,10 @@ int main(int argc, char **argv) return -1; } break; + case 'n': + options.whitelist = strdup(optarg); + if (!options.whitelist) + return -ENOMEM; case 0: break; case 'h': @@ -1794,6 +1821,8 @@ int main(int argc, char **argv) err = run_options(&options, cg_fd, test); out: + if (options.whitelist) + free(options.whitelist); if (cg_created) cleanup_cgroup_environment(); close(cg_fd); -- cgit v1.2.3 From a7238f7c79dda1c484f92478c42408e1a3d418c6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:15:43 -0700 Subject: bpf: Selftests, add blacklist to test_sockmap This adds a blacklist to test_sockmap. For example, now we can run all apply and cork tests except those with timeouts by doing, $ ./test_sockmap --whitelist "apply,cork" --blacklist "hang" Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939734350.15176.6643981099665208826.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 33 ++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 1b98e9210d13..2ed2db625371 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -108,6 +108,7 @@ static const struct option long_options[] = { {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, {"whitelist", required_argument, NULL, 'n' }, + {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } }; @@ -389,6 +390,7 @@ struct sockmap_options { int rate; char *map; char *whitelist; + char *blacklist; }; static int msg_loop_sendpage(int fd, int iov_length, 
int cnt, @@ -1641,6 +1643,24 @@ static int check_whitelist(struct _test *t, struct sockmap_options *opt) return -EINVAL; } +static int check_blacklist(struct _test *t, struct sockmap_options *opt) +{ + char *entry, *ptr; + + if (!opt->blacklist) + return -EINVAL; + ptr = strdup(opt->blacklist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if (strstr(opt->map, entry) != 0 || strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} + static int __test_selftests(int cg_fd, struct sockmap_options *opt) { int i, err; @@ -1655,7 +1675,9 @@ static int __test_selftests(int cg_fd, struct sockmap_options *opt) for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { struct _test t = test[i]; - if (check_whitelist(&t, opt) < 0) + if (check_whitelist(&t, opt) != 0) + continue; + if (check_blacklist(&t, opt) == 0) continue; test_start_subtest(t.title, opt->map); @@ -1696,7 +1718,7 @@ int main(int argc, char **argv) int test = SELFTESTS; bool cg_created = 0; - while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:", + while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:", long_options, &longindex)) != -1) { switch (opt) { case 's': @@ -1769,6 +1791,11 @@ int main(int argc, char **argv) options.whitelist = strdup(optarg); if (!options.whitelist) return -ENOMEM; + break; + case 'b': + options.blacklist = strdup(optarg); + if (!options.blacklist) + return -ENOMEM; case 0: break; case 'h': @@ -1823,6 +1850,8 @@ int main(int argc, char **argv) out: if (options.whitelist) free(options.whitelist); + if (options.blacklist) + free(options.blacklist); if (cg_created) cleanup_cgroup_environment(); close(cg_fd); -- cgit v1.2.3 From 96586dd9268d26b278a1dd9110080001a6acbb0f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 13 May 2020 12:16:02 -0700 Subject: bpf: Selftests, add ktls tests to test_sockmap Until now we have only had minimal ktls+sockmap testing when being used with helpers and 
different sendmsg/sendpage patterns. Add a pass with ktls here. To run just ktls tests, $ ./test_sockmap --whitelist="ktls" Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158939736278.15176.5435314315563203761.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 70 +++++++++++++++++++----------- 1 file changed, 44 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 2ed2db625371..c80643828b82 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -115,6 +115,7 @@ static const struct option long_options[] = { struct test_env { const char *type; const char *subtest; + const char *prepend; int test_num; int subtest_num; @@ -126,6 +127,26 @@ struct test_env { struct test_env env; +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; + bool drop_expected; + int iov_count; + int iov_length; + int rate; + char *map; + char *whitelist; + char *blacklist; + char *prepend; +}; + +struct _test { + char *title; + void (*tester)(int cg_fd, struct sockmap_options *opt); +}; + static void test_start(void) { env.subtest_num++; @@ -151,10 +172,11 @@ static void test_reset(void) txmsg_ingress = txmsg_skb = 0; } -static int test_start_subtest(const char *name, const char *type) +static int test_start_subtest(const struct _test *t, struct sockmap_options *o) { - env.type = type; - env.subtest = name; + env.type = o->map; + env.subtest = t->title; + env.prepend = o->prepend; env.test_num++; env.subtest_num = 0; env.fail_last = env.fail_cnt; @@ -170,9 +192,10 @@ static void test_end_subtest(void) if (!error) test_pass(); - fprintf(stdout, "#%2d/%2d %8s:%s:%s\n", + fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n", env.test_num, env.subtest_num, !type ? "sockmap" : "sockhash", + env.prepend ? 
: "", env.subtest, error ? "FAIL" : "OK"); } @@ -379,20 +402,6 @@ struct msg_stats { struct timespec end; }; -struct sockmap_options { - int verbose; - bool base; - bool sendpage; - bool data_test; - bool drop_expected; - int iov_count; - int iov_length; - int rate; - char *map; - char *whitelist; - char *blacklist; -}; - static int msg_loop_sendpage(int fd, int iov_length, int cnt, struct msg_stats *s, struct sockmap_options *opt) @@ -1606,11 +1615,6 @@ static int populate_progs(char *bpf_file) return 0; } -struct _test { - char *title; - void (*tester)(int cg_fd, struct sockmap_options *opt); -}; - struct _test test[] = { {"txmsg test passthrough", test_txmsg_pass}, {"txmsg test redirect", test_txmsg_redir}, @@ -1636,7 +1640,9 @@ static int check_whitelist(struct _test *t, struct sockmap_options *opt) return -ENOMEM; entry = strtok(ptr, ","); while (entry) { - if (strstr(opt->map, entry) != 0 || strstr(t->title, entry) != 0) + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) return 0; entry = strtok(NULL, ","); } @@ -1654,7 +1660,9 @@ static int check_blacklist(struct _test *t, struct sockmap_options *opt) return -ENOMEM; entry = strtok(ptr, ","); while (entry) { - if (strstr(opt->map, entry) != 0 || strstr(t->title, entry) != 0) + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) return 0; entry = strtok(NULL, ","); } @@ -1680,7 +1688,7 @@ static int __test_selftests(int cg_fd, struct sockmap_options *opt) if (check_blacklist(&t, opt) == 0) continue; - test_start_subtest(t.title, opt->map); + test_start_subtest(&t, opt); t.tester(cg_fd, opt); test_end_subtest(); } @@ -1700,11 +1708,21 @@ static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt) __test_selftests(cg_fd, opt); } +static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + 
opt->prepend = "ktls"; + ktls = 1; + __test_selftests(cg_fd, opt); + ktls = 0; +} + static int test_selftest(int cg_fd, struct sockmap_options *opt) { test_selftests_sockmap(cg_fd, opt); test_selftests_sockhash(cg_fd, opt); + test_selftests_ktls(cg_fd, opt); test_print_results(); return 0; } -- cgit v1.2.3 From 84e0d83567df4597b1b624b495d689104227a551 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 May 2020 01:43:09 +0300 Subject: selftests: devlink_lib: Remove double blank line One blank line is enough. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/devlink_lib.sh | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index 155d48bd4d9e..7b6390aea50b 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -390,7 +390,6 @@ devlink_trap_drop_test() devlink_trap_group_stats_idle_test $group_name check_err $? "Trap group stats not idle with initial drop action" - devlink_trap_action_set $trap_name "trap" devlink_trap_stats_idle_test $trap_name check_fail $? "Trap stats idle after setting action to trap" -- cgit v1.2.3 From 04cc99d9bdb1119172e21c121950a0253f5c659f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 May 2020 01:43:10 +0300 Subject: selftests: mlxsw: Do not hard code trap group name It can be derived dynamically from the trap's name, so drop it. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../drivers/net/mlxsw/devlink_trap_acl_drops.sh | 4 +-- .../drivers/net/mlxsw/devlink_trap_l2_drops.sh | 33 ++++++++------------ .../drivers/net/mlxsw/devlink_trap_l3_drops.sh | 35 +++++++--------------- .../net/mlxsw/devlink_trap_l3_exceptions.sh | 20 ++++--------- .../drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh | 6 ++-- .../drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh | 9 ++---- .../selftests/net/forwarding/devlink_lib.sh | 8 +++-- 7 files changed, 43 insertions(+), 72 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh index 26044e397157..b32ba5fec59d 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh @@ -107,7 +107,7 @@ ingress_flow_action_drop_test() RET=0 - devlink_trap_drop_test ingress_flow_action_drop acl_drops $swp2 101 + devlink_trap_drop_test ingress_flow_action_drop $swp2 101 log_test "ingress_flow_action_drop" @@ -132,7 +132,7 @@ egress_flow_action_drop_test() RET=0 - devlink_trap_drop_test egress_flow_action_drop acl_drops $swp2 102 + devlink_trap_drop_test egress_flow_action_drop $swp2 102 log_test "egress_flow_action_drop" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh index e7aecb065409..a4c2812e9807 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh @@ -96,7 +96,6 @@ source_mac_is_multicast_test() { local trap_name="source_mac_is_multicast" local smac=01:02:03:04:05:06 - local group_name="l2_drops" local mz_pid tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \ @@ -107,7 +106,7 @@ source_mac_is_multicast_test() RET=0 - devlink_trap_drop_test $trap_name $group_name $swp2 101 + 
devlink_trap_drop_test $trap_name $swp2 101 log_test "Source MAC is multicast" @@ -118,7 +117,6 @@ __vlan_tag_mismatch_test() { local trap_name="vlan_tag_mismatch" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local opt=$1; shift local mz_pid @@ -132,7 +130,7 @@ __vlan_tag_mismatch_test() $MZ $h1 "$opt" -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Add PVID and make sure packets are no longer dropped. bridge vlan add vid 1 dev $swp1 pvid untagged master @@ -140,7 +138,7 @@ __vlan_tag_mismatch_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -179,7 +177,6 @@ ingress_vlan_filter_test() { local trap_name="ingress_vlan_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid local vid=10 @@ -193,7 +190,7 @@ ingress_vlan_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Add the VLAN on the bridge port and make sure packets are no longer # dropped. @@ -202,7 +199,7 @@ ingress_vlan_filter_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? 
"Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -222,7 +219,6 @@ __ingress_stp_filter_test() { local trap_name="ingress_spanning_tree_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local state=$1; shift local mz_pid local vid=20 @@ -237,7 +233,7 @@ __ingress_stp_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Change STP state to forwarding and make sure packets are no longer # dropped. @@ -246,7 +242,7 @@ __ingress_stp_filter_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -292,7 +288,6 @@ port_list_is_empty_uc_test() { local trap_name="port_list_is_empty" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid # Disable unicast flooding on both ports, so that packets cannot egress @@ -308,7 +303,7 @@ port_list_is_empty_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave flood on @@ -316,7 +311,7 @@ port_list_is_empty_uc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? 
"Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -335,7 +330,6 @@ port_list_is_empty_mc_test() { local trap_name="port_list_is_empty" local dmac=01:00:5e:00:00:01 - local group_name="l2_drops" local dip=239.0.0.1 local mz_pid @@ -354,7 +348,7 @@ port_list_is_empty_mc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave mcast_flood on @@ -362,7 +356,7 @@ port_list_is_empty_mc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -387,7 +381,6 @@ port_loopback_filter_uc_test() { local trap_name="port_loopback_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid # Make sure packets can only egress the input port. @@ -401,7 +394,7 @@ port_loopback_filter_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded. ip link set dev $swp2 type bridge_slave flood on @@ -409,7 +402,7 @@ port_loopback_filter_uc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? 
"Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index 616f47d86a61..f5abb1ebd392 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -161,7 +161,6 @@ ping_check() non_ip_test() { local trap_name="non_ip" - local group_name="l3_drops" local mz_pid RET=0 @@ -176,7 +175,7 @@ non_ip_test() 00:00 de:ad:be:ef" & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Non IP" @@ -190,7 +189,6 @@ __uc_dip_over_mc_dmac_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="uc_dip_over_mc_dmac" - local group_name="l3_drops" local dmac=01:02:03:04:05:06 local mz_pid @@ -206,7 +204,7 @@ __uc_dip_over_mc_dmac_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Unicast destination IP over multicast destination MAC: $desc" @@ -227,7 +225,6 @@ __sip_is_loopback_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="sip_is_loopback_address" - local group_name="l3_drops" local mz_pid RET=0 @@ -242,7 +239,7 @@ __sip_is_loopback_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Source IP is loopback address: $desc" @@ -262,7 +259,6 @@ __dip_is_loopback_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="dip_is_loopback_address" - local group_name="l3_drops" local mz_pid RET=0 @@ -277,7 +273,7 @@ __dip_is_loopback_test() -B $dip -d 1msec -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Destination IP is loopback address: $desc" @@ -298,7 +294,6 @@ __sip_is_mc_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="sip_is_mc" - local group_name="l3_drops" local mz_pid RET=0 @@ -313,7 +308,7 @@ __sip_is_mc_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Source IP is multicast: $desc" @@ -329,7 +324,6 @@ sip_is_mc_test() ipv4_sip_is_limited_bc_test() { local trap_name="ipv4_sip_is_limited_bc" - local group_name="l3_drops" local sip=255.255.255.255 local mz_pid @@ -345,7 +339,7 @@ ipv4_sip_is_limited_bc_test() -B $h2_ipv4 -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv4 source IP is limited broadcast" @@ -382,7 +376,6 @@ __ipv4_header_corrupted_test() local ihl=$1; shift local checksum=$1; shift local trap_name="ip_header_corrupted" - local group_name="l3_drops" local payload local mz_pid @@ -399,7 +392,7 @@ __ipv4_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IP header corrupted: $desc: IPv4" @@ -429,7 +422,6 @@ __ipv6_header_corrupted_test() local desc=$1; shift local ipver=$1; shift local trap_name="ip_header_corrupted" - local group_name="l3_drops" local payload local mz_pid @@ -446,7 +438,7 @@ __ipv6_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IP header corrupted: $desc: IPv6" @@ -469,7 +461,6 @@ ip_header_corrupted_test() ipv6_mc_dip_reserved_scope_test() { local trap_name="ipv6_mc_dip_reserved_scope" - local group_name="l3_drops" local dip=FF00:: local mz_pid @@ -485,7 +476,7 @@ ipv6_mc_dip_reserved_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv6 multicast destination IP reserved scope" @@ -495,7 +486,6 @@ ipv6_mc_dip_reserved_scope_test() ipv6_mc_dip_interface_local_scope_test() { local trap_name="ipv6_mc_dip_interface_local_scope" - local group_name="l3_drops" local dip=FF01:: local mz_pid @@ -511,7 +501,7 @@ ipv6_mc_dip_interface_local_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv6 multicast destination IP interface-local scope" @@ -526,7 +516,6 @@ __blackhole_route_test() local dip=$1; shift local ip_proto=${1:-"icmp"}; shift local trap_name="blackhole_route" - local group_name="l3_drops" local mz_pid RET=0 @@ -542,7 +531,7 @@ __blackhole_route_test() -B $dip -d 1msec -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Blackhole route: IPv$flags" devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 @@ -558,7 +547,6 @@ blackhole_route_test() irif_disabled_test() { local trap_name="irif_disabled" - local group_name="l3_drops" local t0_packets t0_bytes local t1_packets t1_bytes local mz_pid @@ -613,7 +601,6 @@ irif_disabled_test() erif_disabled_test() { local trap_name="erif_disabled" - local group_name="l3_drops" local t0_packets t0_bytes local t1_packets t1_bytes local mz_pid diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 2bc6df42d597..1fedfc9da434 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -169,7 +169,6 @@ trap_action_check() mtu_value_is_too_small_test() { local trap_name="mtu_value_is_too_small" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -191,7 +190,7 @@ mtu_value_is_too_small_test() -B 198.51.100.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? "Packets were not received to h1" @@ -208,7 +207,6 @@ __ttl_value_is_too_small_test() { local ttl_val=$1; shift local trap_name="ttl_value_is_too_small" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -227,7 +225,7 @@ __ttl_value_is_too_small_test() -b $rp1mac -B 198.51.100.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? 
"Packets were not received to h1" @@ -271,7 +269,6 @@ __mc_reverse_path_forwarding_test() local proto=$1; shift local flags=${1:-""}; shift local trap_name="mc_reverse_path_forwarding" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -292,7 +289,7 @@ __mc_reverse_path_forwarding_test() mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $rp2 egress" 101 0 check_err $? "Packets were not dropped" @@ -322,7 +319,6 @@ __reject_route_test() local unreachable=$1; shift local flags=${1:-""}; shift local trap_name="reject_route" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -341,7 +337,7 @@ __reject_route_test() -B $dst_ip -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? "ICMP packet was not received to h1" @@ -370,7 +366,6 @@ __host_miss_test() local desc=$1; shift local dip=$1; shift local trap_name="unresolved_neigh" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -405,7 +400,6 @@ __invalid_nexthop_test() local subnet=$1; shift local via_add=$1; shift local trap_name="unresolved_neigh" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -494,7 +488,6 @@ vrf_without_routes_destroy() ipv4_lpm_miss_test() { local trap_name="ipv4_lpm_miss" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -511,7 +504,7 @@ ipv4_lpm_miss_test() -B 203.0.113.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name log_test "LPM miss: IPv4" @@ -522,7 +515,6 @@ ipv4_lpm_miss_test() ipv6_lpm_miss_test() { local trap_name="ipv6_lpm_miss" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -539,7 +531,7 @@ ipv6_lpm_miss_test() -B 2001:db8::1 -q & mz_pid=$! 
- devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name log_test "LPM miss: IPv6" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh index 039629bb92a3..8817851da7a9 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh @@ -140,7 +140,6 @@ ecn_payload_get() ecn_decap_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local ecn_desc=$1; shift local outer_tos=$1; shift @@ -161,7 +160,7 @@ ecn_decap_test() mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" @@ -200,7 +199,6 @@ ipip_payload_get() no_matching_tunnel_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local sip=$1; shift local mz_pid @@ -218,7 +216,7 @@ no_matching_tunnel_test() -A $sip -B 192.0.2.65 -t ip len=48,proto=47,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? 
"Packets were not dropped" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh index e11a416323cf..10e0f3dbc930 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh @@ -159,7 +159,6 @@ ecn_payload_get() ecn_decap_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local ecn_desc=$1; shift local outer_tos=$1; shift @@ -177,7 +176,7 @@ ecn_decap_test() -t udp sp=12345,dp=$VXPORT,tos=$outer_tos,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" @@ -228,7 +227,6 @@ short_payload_get() corrupted_packet_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local payload_get=$1; shift local mz_pid @@ -246,7 +244,7 @@ corrupted_packet_test() -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" @@ -297,7 +295,6 @@ mc_smac_payload_get() overlay_smac_is_mc_test() { local trap_name="overlay_smac_is_mc" - local group_name="tunnel_drops" local mz_pid RET=0 @@ -314,7 +311,7 @@ overlay_smac_is_mc_test() -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $swp1 101 + devlink_trap_drop_test $trap_name $swp1 101 log_test "Overlay source MAC is multicast" diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index 7b6390aea50b..e27236109235 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -365,7 +365,9 @@ devlink_trap_group_stats_idle_test() devlink_trap_exception_test() { local trap_name=$1; shift - local group_name=$1; shift + local group_name + + group_name=$(devlink_trap_group_get $trap_name) devlink_trap_stats_idle_test $trap_name check_fail $? "Trap stats idle when packets should have been trapped" @@ -377,9 +379,11 @@ devlink_trap_exception_test() devlink_trap_drop_test() { local trap_name=$1; shift - local group_name=$1; shift local dev=$1; shift local handle=$1; shift + local group_name + + group_name=$(devlink_trap_group_get $trap_name) # This is the common part of all the tests. It checks that stats are # initially idle, then non-idle after changing the trap action and -- cgit v1.2.3 From eb682677f59e809d8e06c218b565aeb9723a4ad3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 17 May 2020 12:00:33 -0600 Subject: selftests: Drop 'pref medium' in route checks The 'pref medium' attribute was moved in iproute2 to be near the prefix which is where it applies versus after the last nexthop. The nexthop tests were updated to drop the string from route checking, but it crept in again with the compat tests. Fixes: 4dddb5be136a ("selftests: net: add new testcases for nexthop API compat mode sysctl") Signed-off-by: David Ahern Cc: Roopa Prabhu Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/fib_nexthops.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index dd0e5fec6367..50d822face36 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -965,7 +965,7 @@ ipv6_compat_mode() log_test $? 0 "IPv6 compat mode on - route add notification" # route dump should contain expanded nexthops - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" log_test $? 0 "IPv6 compat mode on - route dump" # change in nexthop group should generate route notification @@ -992,7 +992,7 @@ ipv6_compat_mode() log_test $? 0 "IPv6 compat mode off - route add notification" # route dump should not contain expanded nexthops - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024" log_test $? 0 "IPv6 compat mode off - route dump" # change in nexthop group should not generate route notification -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer, sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. 
The main advantage of this approach is that it operates as close as possible to the socket, and therefore allows us to avoid packet-based NAT, given that in the connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows exposing NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications do not care about getpeername(), but on a few occasions we've seen breakage when validating the peer's address since it unexpectedly returns the backend tuple instead of the service one. Therefore, this trivial patch allows customising this and adds a getpeername() as well as a getsockname() BPF cgroup hook for both IPv4 and IPv6 in order to address this situation. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before; curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] After; with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] 
Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context similar as in inet{,6}_getname() fashion, but API-wise this is suboptimal as it always enforces programs having to test for ctx->peer which can easily be missed, hence BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 8 ++++++-- net/ipv6/af_inet6.c | 9 ++++++--- tools/include/uapi/linux/bpf.h | 4 ++++ 8 files changed, 42 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + 
BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == 
BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..8f5c8c9409d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -755,12 +755,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -781,6 +780,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? 
BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..3b6fcc0c321a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -504,9 +504,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -531,9 +530,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..1cddc398404a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From f15ed0185de7d471e907783739dffbe397a93142 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:46 +0200 Subject: bpf, libbpf: Enable get{peer, sock}name attach types Trivial patch to add the new get{peer,sock}name attach types to the section definitions in order to hook them up to sock_addr cgroup program type. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/7fcd4b1e41a8ebb364754a5975c75a7795051bd2.1589841594.git.daniel@iogearbox.net --- tools/lib/bpf/libbpf.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 292257995487..fa04cbe547ed 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6705,6 +6705,14 @@ static const struct bpf_sec_def section_defs[] = { BPF_CGROUP_UDP4_RECVMSG), BPF_EAPROG_SEC("cgroup/recvmsg6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG), + BPF_EAPROG_SEC("cgroup/getpeername4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_GETPEERNAME), + BPF_EAPROG_SEC("cgroup/getpeername6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_GETPEERNAME), + BPF_EAPROG_SEC("cgroup/getsockname4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_GETSOCKNAME), + BPF_EAPROG_SEC("cgroup/getsockname6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_GETSOCKNAME), BPF_EAPROG_SEC("cgroup/sysctl", BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_CGROUP_SYSCTL), BPF_EAPROG_SEC("cgroup/getsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT, -- cgit v1.2.3 From 05ee19c18c2bb3dea69e29219017367c4a77e65a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:47 +0200 Subject: bpf, bpftool: Enable get{peer, sock}name attach types Make bpftool aware and add the new get{peer,sock}name attach types to its cli, documentation and bash completion to allow attachment/detachment of sock_addr programs there. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/9765b3d03e4c29210c4df56a9cc7e52f5f7bb5ef.1589841594.git.daniel@iogearbox.net --- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 10 +++++++--- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 3 ++- tools/bpf/bpftool/bash-completion/bpftool | 15 +++++++++------ tools/bpf/bpftool/cgroup.c | 7 ++++--- tools/bpf/bpftool/main.h | 4 ++++ tools/bpf/bpftool/prog.c | 6 ++++-- 6 files changed, 30 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index e4d9da654e84..a226aee3574f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -29,8 +29,8 @@ CGROUP COMMANDS | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } | *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** | | **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** | -| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | -| **getsockopt** | **setsockopt** } +| **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** | +| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** } | *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION @@ -101,7 +101,11 @@ DESCRIPTION an unconnected udp6 socket (since 5.2); **sysctl** sysctl access (since 5.2); **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3). 
+ **setsockopt** call to setsockopt (since 5.3); + **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); + **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); + **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); + **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 5948e9d89c8d..2b254959d488 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -41,7 +41,8 @@ PROG COMMANDS | **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | | **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | | **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | | **cgroup/getsockopt** | **cgroup/setsockopt** | | **struct_ops** | **fentry** | **fexit** | **freplace** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 9f0f20e73b87..25b25aca1112 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -472,6 +472,8 @@ _bpftool() lwt_seg6local sockops sk_skb sk_msg \ lirc_mode2 cgroup/bind4 cgroup/bind6 \ cgroup/connect4 cgroup/connect6 \ + cgroup/getpeername4 cgroup/getpeername6 \ + cgroup/getsockname4 cgroup/getsockname6 \ cgroup/sendmsg4 cgroup/sendmsg6 \ cgroup/recvmsg4 cgroup/recvmsg6 \ 
cgroup/post_bind4 cgroup/post_bind6 \ @@ -966,9 +968,10 @@ _bpftool() ;; attach|detach) local ATTACH_TYPES='ingress egress sock_create sock_ops \ - device bind4 bind6 post_bind4 post_bind6 connect4 \ - connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \ - getsockopt setsockopt' + device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ + getpeername4 getpeername6 getsockname4 getsockname6 \ + sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \ + setsockopt' local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' case $prev in @@ -977,9 +980,9 @@ _bpftool() return 0 ;; ingress|egress|sock_create|sock_ops|device|bind4|bind6|\ - post_bind4|post_bind6|connect4|connect6|sendmsg4|\ - sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\ - setsockopt) + post_bind4|post_bind6|connect4|connect6|getpeername4|\ + getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\ + recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt) COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ "$cur" ) ) return 0 diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 1693c802bb20..27931db421d8 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -25,9 +25,10 @@ " ATTACH_TYPE := { ingress | egress | sock_create |\n" \ " sock_ops | device | bind4 | bind6 |\n" \ " post_bind4 | post_bind6 | connect4 |\n" \ - " connect6 | sendmsg4 | sendmsg6 |\n" \ - " recvmsg4 | recvmsg6 | sysctl |\n" \ - " getsockopt | setsockopt }" + " connect6 | getpeername4 | getpeername6 |\n" \ + " getsockname4 | getsockname6 | sendmsg4 |\n" \ + " sendmsg6 | recvmsg4 | recvmsg6 |\n" \ + " sysctl | getsockopt | setsockopt }" static unsigned int query_flags; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index f89ac70ef973..5cdf0bc049bd 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -100,6 +100,10 @@ static const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_CGROUP_INET6_CONNECT] = "connect6", 
[BPF_CGROUP_INET4_POST_BIND] = "post_bind4", [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", + [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4", + [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6", + [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4", + [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6", [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", [BPF_CGROUP_SYSCTL] = "sysctl", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b6e5ba568f98..245f941fdbcf 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2012,8 +2012,10 @@ static int do_help(int argc, char **argv) " sk_reuseport | flow_dissector | cgroup/sysctl |\n" " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" - " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n" - " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n" + " cgroup/getpeername4 | cgroup/getpeername6 |\n" + " cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n" + " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" + " cgroup/getsockopt | cgroup/setsockopt |\n" " struct_ops | fentry | fexit | freplace }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" -- cgit v1.2.3 From 566fc3f5d1c641b510ec487cf274a047f8a1e849 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:48 +0200 Subject: bpf, testing: Add get{peer, sock}name selftests to test_progs Extend the existing connect_force_port test to assert get{peer,sock}name programs as well. The workflow for e.g. 
IPv4 is as follows: i) server binds to concrete port, ii) client calls getsockname() on server fd which exposes 1.2.3.4:60000 to client, iii) client connects to service address 1.2.3.4:60000 binds to concrete local address (127.0.0.1:22222) and remaps service address to a concrete backend address (127.0.0.1:60123), iv) client then calls getsockname() on its own fd to verify local address (127.0.0.1:22222) and getpeername() on its own fd which then publishes service address (1.2.3.4:60000) instead of actual backend. Same workflow is done for IPv6 just with different address/port tuples. # ./test_progs -t connect_force_port #14 connect_force_port:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/3343da6ad08df81af715a95d61a84fb4a960f2bf.1589841594.git.daniel@iogearbox.net --- tools/testing/selftests/bpf/network_helpers.c | 11 ++- tools/testing/selftests/bpf/network_helpers.h | 1 + .../selftests/bpf/prog_tests/connect_force_port.c | 107 +++++++++++++++------ .../selftests/bpf/progs/connect_force_port4.c | 59 +++++++++++- .../selftests/bpf/progs/connect_force_port6.c | 70 +++++++++++++- 5 files changed, 215 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 999a775484c1..e36dd1a1780d 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -5,6 +5,8 @@ #include #include +#include + #include #include @@ -35,7 +37,7 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; -int start_server(int family, int type) +int start_server_with_port(int family, int type, __u16 port) { struct sockaddr_storage addr = {}; socklen_t len; @@ -45,11 +47,13 @@ int start_server(int family, int type) struct sockaddr_in *sin = (void *)&addr; sin->sin_family = AF_INET; + sin->sin_port = 
htons(port); len = sizeof(*sin); } else { struct sockaddr_in6 *sin6 = (void *)&addr; sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); len = sizeof(*sin6); } @@ -76,6 +80,11 @@ int start_server(int family, int type) return fd; } +int start_server(int family, int type) +{ + return start_server_with_port(family, type, 0); +} + static const struct timeval timeo_sec = { .tv_sec = 3 }; static const size_t timeo_optlen = sizeof(timeo_sec); diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 86914e6e7b53..6a8009605670 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -34,6 +34,7 @@ struct ipv6_packet { extern struct ipv6_packet pkt_v6; int start_server(int family, int type); +int start_server_with_port(int family, int type, __u16 port); int connect_to_fd(int family, int type, int server_fd); int connect_fd_to_fd(int client_fd, int server_fd); int connect_wait(int client_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c index 47fbb20cb6a6..17bbf76812ca 100644 --- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -4,7 +4,8 @@ #include "cgroup_helpers.h" #include "network_helpers.h" -static int verify_port(int family, int fd, int expected) +static int verify_ports(int family, int fd, + __u16 expected_local, __u16 expected_peer) { struct sockaddr_storage addr; socklen_t len = sizeof(addr); @@ -20,9 +21,25 @@ static int verify_port(int family, int fd, int expected) else port = ((struct sockaddr_in6 *)&addr)->sin6_port; - if (ntohs(port) != expected) { - log_err("Unexpected port %d, expected %d", ntohs(port), - expected); + if (ntohs(port) != expected_local) { + log_err("Unexpected local port %d, expected %d", ntohs(port), + expected_local); + return -1; + } + + if 
(getpeername(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get peer addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected_peer) { + log_err("Unexpected peer port %d, expected %d", ntohs(port), + expected_peer); return -1; } @@ -31,33 +48,67 @@ static int verify_port(int family, int fd, int expected) static int run_test(int cgroup_fd, int server_fd, int family, int type) { + bool v4 = family == AF_INET; + __u16 expected_local_port = v4 ? 22222 : 22223; + __u16 expected_peer_port = 60000; struct bpf_prog_load_attr attr = { - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .file = v4 ? "./connect_force_port4.o" : + "./connect_force_port6.o", }; + struct bpf_program *prog; struct bpf_object *obj; - int expected_port; - int prog_fd; - int err; - int fd; - - if (family == AF_INET) { - attr.file = "./connect_force_port4.o"; - attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; - expected_port = 22222; - } else { - attr.file = "./connect_force_port6.o"; - attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; - expected_port = 22223; - } + int xlate_fd, fd, err; + __u32 duration = 0; - err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd); if (err) { log_err("Failed to load BPF object"); return -1; } - err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, - 0); + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/connect4" : + "cgroup/connect6"); + if (CHECK(!prog, "find_prog", "connect prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_CONNECT : + BPF_CGROUP_INET6_CONNECT, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? 
+ "cgroup/getpeername4" : + "cgroup/getpeername6"); + if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET6_GETPEERNAME, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getsockname4" : + "cgroup/getsockname6"); + if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_GETSOCKNAME : + BPF_CGROUP_INET6_GETSOCKNAME, 0); if (err) { log_err("Failed to attach BPF program"); goto close_bpf_object; @@ -69,8 +120,8 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type) goto close_bpf_object; } - err = verify_port(family, fd, expected_port); - + err = verify_ports(family, fd, expected_local_port, + expected_peer_port); close(fd); close_bpf_object: @@ -86,25 +137,25 @@ void test_connect_force_port(void) if (CHECK_FAIL(cgroup_fd < 0)) return; - server_fd = start_server(AF_INET, SOCK_STREAM); + server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); close(server_fd); - server_fd = start_server(AF_INET6, SOCK_STREAM); + server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); close(server_fd); - server_fd = start_server(AF_INET, SOCK_DGRAM); + server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); close(server_fd); - server_fd = start_server(AF_INET6, SOCK_DGRAM); + server_fd = 
start_server_with_port(AF_INET6, SOCK_DGRAM, 60124); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c index 1b8eb34b2db0..7396308677a3 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port4.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -12,17 +13,71 @@ char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; +struct svc_addr { + __be32 addr; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + SEC("cgroup/connect4") -int _connect4(struct bpf_sock_addr *ctx) +int connect4(struct bpf_sock_addr *ctx) { struct sockaddr_in sa = {}; + struct svc_addr *orig; + /* Force local address to 127.0.0.1:22222. */ sa.sin_family = AF_INET; sa.sin_port = bpf_htons(22222); - sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) return 0; + /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr = ctx->user_ip4; + orig->port = ctx->user_port; + + ctx->user_ip4 = bpf_htonl(0x7f000001); + ctx->user_port = bpf_htons(60123); + } + return 1; +} + +SEC("cgroup/getsockname4") +int getsockname4(struct bpf_sock_addr *ctx) +{ + /* Expose local server as 1.2.3.4:60000 to client. 
*/ + if (ctx->user_port == bpf_htons(60123)) { + ctx->user_ip4 = bpf_htonl(0x01020304); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername4") +int getpeername4(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service 1.2.3.4:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60123)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip4 = orig->addr; + ctx->user_port = orig->port; + } + } return 1; } diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c index ae6f7d750b4c..c1a2b555e9ad 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port6.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -12,17 +12,83 @@ char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; +struct svc_addr { + __be32 addr[4]; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + SEC("cgroup/connect6") -int _connect6(struct bpf_sock_addr *ctx) +int connect6(struct bpf_sock_addr *ctx) { struct sockaddr_in6 sa = {}; + struct svc_addr *orig; + /* Force local address to [::1]:22223. */ sa.sin6_family = AF_INET6; sa.sin6_port = bpf_htons(22223); - sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) return 0; + /* Rewire service [fc00::1]:60000 to backend [::1]:60124. 
*/ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr[0] = ctx->user_ip6[0]; + orig->addr[1] = ctx->user_ip6[1]; + orig->addr[2] = ctx->user_ip6[2]; + orig->addr[3] = ctx->user_ip6[3]; + orig->port = ctx->user_port; + + ctx->user_ip6[0] = 0; + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60124); + } + return 1; +} + +SEC("cgroup/getsockname6") +int getsockname6(struct bpf_sock_addr *ctx) +{ + /* Expose local server as [fc00::1]:60000 to client. */ + if (ctx->user_port == bpf_htons(60124)) { + ctx->user_ip6[0] = bpf_htonl(0xfc000000); + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername6") +int getpeername6(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service [fc00::1]:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60124)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip6[0] = orig->addr[0]; + ctx->user_ip6[1] = orig->addr[1]; + ctx->user_ip6[2] = orig->addr[2]; + ctx->user_ip6[3] = orig->addr[3]; + ctx->user_port = orig->port; + } + } return 1; } -- cgit v1.2.3 From fb53d3b63743585ce918094d6109a3865fa66e5f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2020 11:39:53 -0700 Subject: tools/bpf: sync bpf.h Sync tools/include/uapi/linux/bpf.h from include/uapi. 
Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1cddc398404a..97e1fd19ff58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { @@ -2019,8 +2019,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers -- cgit v1.2.3 From b9f4c01f3e0b06579a8074dcc8638fae89a1ca67 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 May 2020 16:45:16 -0700 Subject: selftest/bpf: Make bpf_iter selftest compilable against old vmlinux.h It's good to be able to compile bpf_iter selftest even on systems that don't have the very latest vmlinux.h, e.g., for libbpf tests against older kernels in Travis CI. To that end, re-define bpf_iter_meta and corresponding bpf_iter context structs in each selftest. To avoid type clashes with vmlinux.h, rename vmlinux.h's definitions to get them out of the way. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20200518234516.3915052-1-andriin@fb.com --- tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c | 16 ++++++++++++++++ .../testing/selftests/bpf/progs/bpf_iter_ipv6_route.c | 16 ++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_iter_netlink.c | 16 ++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_iter_task.c | 16 ++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_iter_task_file.c | 18 ++++++++++++++++++ .../selftests/bpf/progs/bpf_iter_test_kern_common.h | 16 ++++++++++++++++ 6 files changed, 98 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index 4867cd3445c8..b57bd6fef208 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -1,11 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map #include #include char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + SEC("iter/bpf_map") int dump_bpf_map(struct bpf_iter__bpf_map *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c index ab9e2650e021..c8e9ca74c87b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c @@ 
-1,9 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__ipv6_route #include #include +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__ipv6_route { + struct bpf_iter_meta *meta; + struct fib6_info *rt; +} __attribute__((preserve_access_index)); + char _license[] SEC("license") = "GPL"; extern bool CONFIG_IPV6_SUBTREES __kconfig __weak; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c index 6b40a233d4e0..e7b8753eac0b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -1,6 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__netlink bpf_iter__netlink___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__netlink #include #include @@ -9,6 +14,17 @@ char _license[] SEC("license") = "GPL"; #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__netlink { + struct bpf_iter_meta *meta; + struct netlink_sock *sk; +} __attribute__((preserve_access_index)); + static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c index 
90f9011c57ca..ee754021f98e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -1,11 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include #include char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c index c6ced38f0880..0f0ec3db20ba 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -1,11 +1,29 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task_file bpf_iter__task_file___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task_file #include #include char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task_file { + struct bpf_iter_meta *meta; + struct task_struct *task; + __u32 fd; + struct file *file; +} __attribute__((preserve_access_index)); + SEC("iter/task_file") int dump_task_file(struct bpf_iter__task_file *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h 
b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h index bdd51cf14b54..dee1339e6905 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h @@ -1,11 +1,27 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include char _license[] SEC("license") = "GPL"; int count = 0; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { -- cgit v1.2.3 From dda18a5c0b75461d1ed228f80b59c67434b8d601 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 May 2020 12:23:41 -0700 Subject: selftests/bpf: Convert bpf_iter_test_kern{3, 4}.c to define own bpf_iter_meta b9f4c01f3e0b ("selftest/bpf: Make bpf_iter selftest compilable against old vmlinux.h") missed the fact that bpf_iter_test_kern{3,4}.c are not just including bpf_iter_test_kern_common.h and need similar bpf_iter_meta re-definition explicitly. 
Fixes: b9f4c01f3e0b ("selftest/bpf: Make bpf_iter selftest compilable against old vmlinux.h") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200519192341.134360-1-andriin@fb.com --- tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c | 15 +++++++++++++++ tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c index 636a00fa074d..13c2c90c835f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c @@ -1,10 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c index b18dc0471d07..0aa71b333cf3 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -1,10 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map #include char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file 
*seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + __u32 map1_id = 0, map2_id = 0; __u32 map1_accessed = 0, map2_accessed = 0; __u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; -- cgit v1.2.3 From c72b5cbb09bd76634b8d19695db2219964e24128 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 May 2020 13:07:46 -0700 Subject: bpf: Selftests, verifier case for non null pointer check branch taken When we have a pointer type that is known to be non-null and compare it against zero, we only follow the non-null branch. This adds tests to cover this case for reference tracking. Also add the other case, a comparison against a non-zero value, and ensure we still fail with an unreleased reference. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159009166599.6313.1593680633787453767.stgit@john-Precision-5820-Tower --- .../testing/selftests/bpf/verifier/ref_tracking.c | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 604b46151736..056e0273bf12 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -821,3 +821,36 @@ .result = REJECT, .errstr = "invalid mem access", }, +{ + "reference tracking: branch tracking valid pointer null comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, +{ + 
"reference tracking: branch tracking valid pointer value comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "Unreleased reference", + .result = REJECT, +}, -- cgit v1.2.3 From f9b16ec0eeb75337aef38954a4066e6eecd7cfe5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 May 2020 13:08:06 -0700 Subject: bpf: Selftests, verifier case for non null pointer map value branch When we have pointer type that is known to be non-null we only follow the non-null branch. This adds tests to cover the map_value pointer returned from a map lookup. To force an error if both branches are followed we do an ALU op on R10. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159009168650.6313.7434084136067263554.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/verifier/value_or_null.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c index 860d4a71cd83..3ecb70a3d939 100644 --- a/tools/testing/selftests/bpf/verifier/value_or_null.c +++ b/tools/testing/selftests/bpf/verifier/value_or_null.c @@ -150,3 +150,22 @@ .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, +{ + "map lookup and null branch prediction", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + 
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 4 }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, -- cgit v1.2.3 From d844a71bff0fd899146e5981ec44b618afd17d83 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 May 2020 13:08:26 -0700 Subject: bpf: Selftests, add printk to test_sk_lookup_kern to encode null ptr check Adding a printk to test_sk_lookup_kern created the reported failure where a pointer type is checked twice for NULL. Let's add it to the progs test test_sk_lookup_kern.c so we test the case from C all the way into the verifier. We already have printk's in selftests, so it seems OK to add another one. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159009170603.6313.1715279795045285176.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index d2b38fa6a5b0..e83d0b48d80c 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -73,6 +73,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb) tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + bpf_printk("sk=%d\n", sk ? 1 : 0); if (sk) bpf_sk_release(sk); return sk ? TC_ACT_OK : TC_ACT_UNSPEC; -- cgit v1.2.3 From 0534c5489c11cbda0bd2d9719a121a0f90433905 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:17 -0700 Subject: selftests: net: add fdb nexthop tests This commit adds ipv4 and ipv6 fdb nexthop api tests to fib_nexthops.sh. 
Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 160 +++++++++++++++++++++++++++- 1 file changed, 158 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 50d822face36..51f8e9afe6ae 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode ipv4_fdb_grp_fcnal" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode ipv6_fdb_grp_fcnal" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -146,6 +146,7 @@ setup() create_ns remote IP="ip -netns me" + BRIDGE="bridge -netns me" set -e $IP li add veth1 type veth peer name veth2 $IP li set veth1 up @@ -280,6 +281,161 @@ stop_ip_monitor() return $rc } +check_nexthop_fdb_support() +{ + $IP nexthop help 2>&1 | grep -q fdb + if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing fdb nexthop support" + return $ksft_skip + fi +} + +ipv6_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv6 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb" + run_cmd "$IP nexthop add id 102 group 61/62 fdb" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 
0 "Fdb Nexthop group with multiple nexthops" + + ## get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 103 group 63/64 fdb" + log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb" + run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb" + run_cmd "$IP nexthop add id 104 group 65/66" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 67 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + ## fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 103" + log_test $? 
2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + +ipv4_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv4 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 102 group 12/13 fdb" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Fdb Nexthop group with multiple nexthops" + + # get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 14 via 172.16.1.2" + run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 103 group 14/15 fdb" + log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 104 group 14/15" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 18 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" + log_test $? 
2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + # fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 103" + log_test $? 2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # -- cgit v1.2.3 From 6736aa793c2b5fb6c64884d2623c66aa1b9bfa92 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 22 May 2020 12:24:34 +0100 Subject: selftests/bpf: Add general instructions for test execution Getting a clean BPF selftests run involves ensuring latest trunk LLVM/clang are used, pahole is recent (>=1.16) and config matches the specified config file as closely as possible. Add to bpf_devel_QA.rst and point tools/testing/selftests/bpf/README.rst to it. 
Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1590146674-25485-1-git-send-email-alan.maguire@oracle.com --- Documentation/bpf/bpf_devel_QA.rst | 15 +++++++++++++++ tools/testing/selftests/bpf/README.rst | 2 ++ 2 files changed, 17 insertions(+) (limited to 'tools') diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 38c15c6fcb14..0b3db91dc100 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -437,6 +437,21 @@ needed:: See the kernels selftest `Documentation/dev-tools/kselftest.rst`_ document for further documentation. +To maximize the number of tests passing, the .config of the kernel +under test should match the config file fragment in +tools/testing/selftests/bpf as closely as possible. + +Finally to ensure support for latest BPF Type Format features - +discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16 +is required for kernels built with CONFIG_DEBUG_INFO_BTF=y. +pahole is delivered in the dwarves package or can be built +from source at + +https://github.com/acmel/dwarves + +Some distros have pahole version 1.16 packaged already, e.g. +Fedora, Gentoo. + Q: Which BPF kernel selftests version should I run my kernel against? --------------------------------------------------------------------- A: If you run a kernel ``xyz``, then always run the BPF kernel selftests diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 0f67f1b470b0..e885d351595f 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -1,6 +1,8 @@ ================== BPF Selftest Notes ================== +General instructions on running selftests can be found in +`Documentation/bpf/bpf_devel_QA.rst`_. Additional information about selftest failures are documented here. 
-- cgit v1.2.3 From 3c8e8cf4b18b3a7034fab4c4504fc4b54e4b6195 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 22 May 2020 12:36:28 +0100 Subject: selftests/bpf: CONFIG_IPV6_SEG6_BPF required for test_seg6_loop.o test_seg6_loop.o uses the helper bpf_lwt_seg6_adjust_srh(); it will not be present if CONFIG_IPV6_SEG6_BPF is not specified. Fixes: b061017f8b4d ("selftests/bpf: add realistic loop tests") Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1590147389-26482-2-git-send-email-alan.maguire@oracle.com --- tools/testing/selftests/bpf/config | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 60e3ae5d4e48..48e058552eb7 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y CONFIG_FTRACE_SYSCALLS=y CONFIG_IPV6_TUNNEL=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_SEG6_BPF=y CONFIG_NET_FOU=m CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_IPV6_FOU=m -- cgit v1.2.3 From a5dfaa2ab94057dd75c7911143482a0a85593c14 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 22 May 2020 12:36:29 +0100 Subject: selftests/bpf: CONFIG_LIRC required for test_lirc_mode2.sh test_lirc_mode2.sh assumes presence of /sys/class/rc/rc0/lirc*/uevent which will not be present unless CONFIG_LIRC=y Fixes: 6bdd533cee9a ("bpf: add selftest for lirc_mode2 type program") Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1590147389-26482-3-git-send-email-alan.maguire@oracle.com --- tools/testing/selftests/bpf/config | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 48e058552eb7..2118e23ac07a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -38,3 +38,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y 
+CONFIG_LIRC=y -- cgit v1.2.3 From 025b7de7f4e9b26c31c511e84a7cef14605e70ef Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 26 May 2020 02:05:49 +0300 Subject: mlxsw: spectrum: Reduce priority of locally delivered packets To align with recent recommended values. Will be configurable by future patches. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +- tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 5fe51ee8a206..b10e5aeaedef 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4235,11 +4235,11 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) tc = 4; break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_MC_SNOOPING: - case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME: priority = 3; tc = 3; break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_NEIGH_DISCOVERY: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME: case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PTP1: case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP: diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh index 58f3a05f08af..7d9e73a43a49 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh @@ -15,7 +15,7 @@ source mlxsw_lib.sh SB_POOL_ING=0 SB_POOL_EGR_CPU=10 -SB_ITC_CPU_IP=3 +SB_ITC_CPU_IP=2 SB_ITC_CPU_ARP=2 SB_ITC=0 -- cgit v1.2.3 From 5a1b72cebc774ec68854dab59c6838d795ee9370 Mon Sep 17 00:00:00 2001 From: Stephen Worley Date: Wed, 27 May 2020 12:41:42 -0400 Subject: net: add large ecmp group nexthop tests Add a couple large ecmp group nexthop selftests to cover the remnant fixed by 
d69100b8eee27c2d60ee52df76e0b80a8d492d34. The tests create 100 x32 ecmp groups of ipv4 and ipv6 and then dump them. On kernels without the fix, they will fail due to data remnant during the dump. Signed-off-by: Stephen Worley Reviewed-by: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 84 ++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 51f8e9afe6ae..1e2f61262e4e 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode ipv4_fdb_grp_fcnal" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode ipv6_fdb_grp_fcnal" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -254,6 +254,60 @@ check_route6() check_output "${out}" "${expected}" } +check_large_grp() +{ + local ipv=$1 + local ecmp=$2 + local grpnum=100 + local nhidstart=100 + local grpidstart=1000 + local iter=0 + local nhidstr="" + local grpidstr="" + local grpstr="" + local ipstr="" + + if [ $ipv -eq 4 ]; then + ipstr="172.16.1." 
+ else + ipstr="2001:db8:91::" + fi + + # + # Create $grpnum groups with specified $ecmp and dump them + # + + # create nexthops with different gateways + iter=2 + while [ $iter -le $(($ecmp + 1)) ] + do + nhidstr="$(($nhidstart + $iter))" + run_cmd "$IP nexthop add id $nhidstr via $ipstr$iter dev veth1" + check_nexthop "id $nhidstr" "id $nhidstr via $ipstr$iter dev veth1 scope link" + + if [ $iter -le $ecmp ]; then + grpstr+="$nhidstr/" + else + grpstr+="$nhidstr" + fi + ((iter++)) + done + + # create duplicate large ecmp groups + iter=0 + while [ $iter -le $grpnum ] + do + grpidstr="$(($grpidstart + $iter))" + run_cmd "$IP nexthop add id $grpidstr group $grpstr" + check_nexthop "id $grpidstr" "id $grpidstr group $grpstr" + ((iter++)) + done + + # dump large groups + run_cmd "$IP nexthop list" + log_test $? 0 "Dump large (x$ecmp) ecmp groups" +} + start_ip_monitor() { local mtype=$1 @@ -700,6 +754,19 @@ ipv6_fcnal_runtime() # route with src address and using nexthop - not allowed } +ipv6_large_grp() +{ + local ecmp=32 + + echo + echo "IPv6 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 6 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + ipv4_fcnal() { local rc @@ -1066,6 +1133,19 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +ipv4_large_grp() +{ + local ecmp=32 + + echo + echo "IPv4 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 4 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + sysctl_nexthop_compat_mode_check() { local sysctlname="net.ipv4.nexthop_compat_mode" -- cgit v1.2.3 From 7c741868ceab825bb99cf6c72859e9364d54a07c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 May 2020 18:03:44 -0600 Subject: selftests: Add torture tests to nexthop tests Add Nik's torture tests as a new set to stress the replace and cleanup paths. Torture test created by Nikolay Aleksandrov and then I adapted to selftest and added IPv6 version. 
Signed-off-by: David Ahern Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 115 +++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 1e2f61262e4e..dee567f7576a 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -767,6 +767,62 @@ ipv6_large_grp() $IP nexthop flush >/dev/null 2>&1 } +ipv6_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 2001:db8:91::2 dev veth1 + done >/dev/null 2>&1 +} + +ipv6_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv6_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv6 runtime torture" + echo "--------------------" + if [ ! 
-x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 2001:db8:101::1 nhid 102" + run_cmd "$IP route add 2001:db8:101::2 nhid 102" + + ipv6_del_add_loop1 & + pid1=$! + ipv6_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv6 torture test" +} + + ipv4_fcnal() { local rc @@ -1313,6 +1369,61 @@ ipv4_compat_mode() sysctl_nexthop_compat_mode_set 1 "IPv4" } +ipv4_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 172.16.1.2 dev veth1 + done >/dev/null 2>&1 +} + +ipv4_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv4_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv4 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 172.16.101.1 nhid 102" + run_cmd "$IP route add 172.16.101.2 nhid 102" + + ipv4_del_add_loop1 & + pid1=$! + ipv4_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 & + pid4=$! 
+ ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv4 torture test" +} + basic() { echo -- cgit v1.2.3 From 1c0522b4a2e143fa6e55e4bd2308415c81184ec7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 29 May 2020 14:16:53 +0300 Subject: selftests: forwarding: mirror_lib: Use mausezahn Using ping in tests is error-prone, because ping is too smart. On a flaky system (notably in a simulator), when packets don't come quickly enough, more pings are sent, and that throws off counters. Instead use mausezahn to generate ICMP echo request packets. That allows us to send them in quicker succession as well, because the reason the ping was made slow in the first place was to make the tests work on simulated systems. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/mirror_lib.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh index 00797597fcf5..c33bfd7ba214 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -29,11 +29,9 @@ mirror_test() local pref=$1; shift local expect=$1; shift - local ping_timeout=$((PING_TIMEOUT * 5)) local t0=$(tc_rule_stats_get $dev $pref) - ip vrf exec $vrf_name \ - ${PING} ${sip:+-I $sip} $dip -c 10 -i 0.5 -w $ping_timeout \ - &> /dev/null + $MZ $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ + -c 10 -d 100ms -t icmp type=8 sleep 0.5 local t1=$(tc_rule_stats_get $dev $pref) local delta=$((t1 - t0)) -- cgit v1.2.3 From 3ed97037f063b9130b56991f55f346597d27440d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 29 May 2020 14:16:54 +0300 Subject: selftests: 
forwarding: pedit_dsfield: Check counter value A missing stats_update callback was recently added to act_pedit. Now that iproute2 supports JSON dumping for pedit, extend the pedit_dsfield selftest with a check that would have caught the fact that the callback was missing. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/pedit_dsfield.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index 1181d647f6a7..55eeacf59241 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -132,7 +132,12 @@ do_test_pedit_dsfield_common() local pkts pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \ tc_rule_handle_stats_get "dev $h2 ingress" 101) - check_err $? "Expected to get 10 packets, but got $pkts." + check_err $? "Expected to get 10 packets on test probe, but got $pkts." + + pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101) + ((pkts >= 10)) + check_err $? "Expected to get 10 packets on pedit rule, but got $pkts." + log_test "$pedit_locus pedit $pedit_action" } -- cgit v1.2.3 From 9959b389779a9e688d1a9272eed6377d999d8739 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:49 +0300 Subject: selftests: mlxsw: Add test for control packets Generate packets matching the various control traps and check that the traps' stats increase accordingly. Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- .../drivers/net/mlxsw/devlink_trap_control.sh | 688 +++++++++++++++++++++ .../selftests/net/forwarding/devlink_lib.sh | 23 + 2 files changed, 711 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh new file mode 100755 index 000000000000..a37273473c1b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh @@ -0,0 +1,688 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap control trap functionality over mlxsw. Each registered +# control packet trap is tested to make sure it is triggered under the right +# conditions. +# +# +---------------------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/24 | +# | | 2001:db8:1::1/64 | +# | | | +# | | default via 192.0.2.2 | +# | | default via 2001:db8:1::2 | +# +----|----------------------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | + $rp1 | +# | 192.0.2.2/24 | +# | 2001:db8:1::2/64 | +# | | +# | 2001:db8:2::2/64 | +# | 198.51.100.2/24 | +# | + $rp2 | +# | | | +# +----|----------------------------------------------------------------------+ +# | +# +----|----------------------------+ +# | | default via 198.51.100.2 | +# | | default via 2001:db8:2::2 | +# | | | +# | | 2001:db8:2::1/64 | +# | | 198.51.100.1/24 | +# | + $h2 | +# | H2 (vrf) | +# +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + stp_test + lacp_test + lldp_test + igmp_query_test + igmp_v1_report_test + igmp_v2_report_test + igmp_v3_report_test + igmp_v2_leave_test + mld_query_test + mld_v1_report_test + mld_v2_report_test + mld_v1_done_test + ipv4_dhcp_test + ipv6_dhcp_test + arp_request_test + arp_response_test + ipv6_neigh_solicit_test + ipv6_neigh_advert_test + 
ipv4_bfd_test + ipv6_bfd_test + ipv4_ospf_test + ipv6_ospf_test + ipv4_bgp_test + ipv6_bgp_test + ipv4_vrrp_test + ipv6_vrrp_test + ipv4_pim_test + ipv6_pim_test + uc_loopback_test + local_route_test + external_route_test + ipv6_uc_dip_link_local_scope_test + ipv4_router_alert_test + ipv6_router_alert_test + ipv6_dip_all_nodes_test + ipv6_dip_all_routers_test + ipv6_router_solicit_test + ipv6_router_advert_test + ipv6_redirect_test + ptp_event_test + ptp_general_test + flow_action_sample_test + flow_action_trap_test +" +NUM_NETIFS=4 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 + ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2 +} + +h1_destroy() +{ + ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2 + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 + ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2 +} + +h2_destroy() +{ + ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2 + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64 +} + +router_create() +{ + ip link set dev $rp1 up + ip link set dev $rp2 up + + __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 + __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64 +} + +router_destroy() +{ + __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64 + __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64 + + ip link set dev $rp2 down + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + router_create +} + +cleanup() +{ + pre_cleanup + + 
router_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +stp_test() +{ + devlink_trap_stats_test "STP" "stp" $MZ $h1 -c 1 -t bpdu -q +} + +lacp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:02:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:09:"$( : ETH type + ) + echo $p +} + +lacp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LACP" "lacp" $MZ $h1 -c 1 \ + $(lacp_payload_get $h1mac) -p 100 -q +} + +lldp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:0E:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:CC:"$( : ETH type + ) + echo $p +} + +lldp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LLDP" "lldp" $MZ $h1 -c 1 \ + $(lldp_payload_get $h1mac) -p 100 -q +} + +igmp_query_test() +{ + # IGMP (IP Protocol 2) Membership Query (Type 0x11) + devlink_trap_stats_test "IGMP Membership Query" "igmp_query" \ + $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=11 -p 100 -q +} + +igmp_v1_report_test() +{ + # IGMP (IP Protocol 2) Version 1 Membership Report (Type 0x12) + devlink_trap_stats_test "IGMP Version 1 Membership Report" \ + "igmp_v1_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=12 -p 100 -q +} + +igmp_v2_report_test() +{ + # IGMP (IP Protocol 2) Version 2 Membership Report (Type 0x16) + devlink_trap_stats_test "IGMP Version 2 Membership Report" \ + "igmp_v2_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=16 -p 100 -q +} + +igmp_v3_report_test() +{ + # IGMP (IP Protocol 2) Version 3 Membership Report (Type 0x22) + devlink_trap_stats_test "IGMP Version 3 Membership Report" \ + "igmp_v3_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=22 -p 100 -q +} + +igmp_v2_leave_test() +{ + # IGMP (IP Protocol 2) Version 2 Leave Group (Type 0x17) + 
devlink_trap_stats_test "IGMP Version 2 Leave Group" \ + "igmp_v2_leave" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:02 \ + -A 192.0.2.1 -B 224.0.0.2 -t ip proto=2,p=17 -p 100 -q +} + +mld_payload_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"3A:"$( : Next Header - ICMPv6 + )"00:"$( : Hdr Ext Len + )"00:00:00:00:00:00:"$( : Options and Padding + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +mld_query_test() +{ + # MLD Multicast Listener Query (Type 130) + devlink_trap_stats_test "MLD Multicast Listener Query" "mld_query" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 130) -p 100 -q +} + +mld_v1_report_test() +{ + # MLD Version 1 Multicast Listener Report (Type 131) + devlink_trap_stats_test "MLD Version 1 Multicast Listener Report" \ + "mld_v1_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 131) -p 100 -q +} + +mld_v2_report_test() +{ + # MLD Version 2 Multicast Listener Report (Type 143) + devlink_trap_stats_test "MLD Version 2 Multicast Listener Report" \ + "mld_v2_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 143) -p 100 -q +} + +mld_v1_done_test() +{ + # MLD Version 1 Multicast Listener Done (Type 132) + devlink_trap_stats_test "MLD Version 1 Multicast Listener Done" \ + "mld_v1_done" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 132) -p 100 -q +} + +ipv4_dhcp_test() +{ + devlink_trap_stats_test "IPv4 DHCP Port 67" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b bcast -A 0.0.0.0 -B 255.255.255.255 \ + -t udp sp=68,dp=67 -p 100 -q + + devlink_trap_stats_test "IPv4 DHCP Port 68" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -A 192.0.2.1 \ + -B 255.255.255.255 -t udp sp=67,dp=68 -p 100 -q +} + +ipv6_dhcp_test() +{ + devlink_trap_stats_test "IPv6 DHCP Port 547" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A 
fe80::1 -B ff02::1:2 -t udp sp=546,dp=547 \ + -p 100 -q + + devlink_trap_stats_test "IPv6 DHCP Port 546" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=547,dp=546 \ + -p 100 -q +} + +arp_request_test() +{ + devlink_trap_stats_test "ARP Request" "arp_request" \ + $MZ $h1 -c 1 -a own -b bcast -t arp request -p 100 -q +} + +arp_response_test() +{ + devlink_trap_stats_test "ARP Response" "arp_response" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -t arp reply -p 100 -q +} + +icmpv6_header_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +ipv6_neigh_solicit_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Solicitation" \ + "ipv6_neigh_solicit" $MZ $h1 -6 -c 1 \ + -A fe80::1 -B ff02::1:ff00:02 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 135) -p 100 -q +} + +ipv6_neigh_advert_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Advertisement" \ + "ipv6_neigh_advert" $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 136) -p 100 -q +} + +ipv4_bfd_test() +{ + devlink_trap_stats_test "IPv4 BFD Control - Port 3784" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv4 BFD Echo - Port 3785" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv6_bfd_test() +{ + devlink_trap_stats_test "IPv6 BFD Control - Port 3784" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv6 BFD Echo - Port 3785" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv4_ospf_test() +{ + 
devlink_trap_stats_test "IPv4 OSPF - Multicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:05 \ + -A 192.0.2.1 -B 224.0.0.5 -t ip proto=89 -p 100 -q + + devlink_trap_stats_test "IPv4 OSPF - Unicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=89 -p 100 -q +} + +ipv6_ospf_test() +{ + devlink_trap_stats_test "IPv6 OSPF - Multicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:05 \ + -A fe80::1 -B ff02::5 -t ip next=89 -p 100 -q + + devlink_trap_stats_test "IPv6 OSPF - Unicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 -t ip next=89 -p 100 -q +} + +ipv4_bgp_test() +{ + devlink_trap_stats_test "IPv4 BGP" "ipv4_bgp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t tcp sp=54321,dp=179,flags=rst \ + -p 100 -q +} + +ipv6_bgp_test() +{ + devlink_trap_stats_test "IPv6 BGP" "ipv6_bgp" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t tcp sp=54321,dp=179,flags=rst -p 100 -q +} + +ipv4_vrrp_test() +{ + devlink_trap_stats_test "IPv4 VRRP" "ipv4_vrrp" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:12 \ + -A 192.0.2.1 -B 224.0.0.18 -t ip proto=112 -p 100 -q +} + +ipv6_vrrp_test() +{ + devlink_trap_stats_test "IPv6 VRRP" "ipv6_vrrp" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:12 \ + -A fe80::1 -B ff02::12 -t ip next=112 -p 100 -q +} + +ipv4_pim_test() +{ + devlink_trap_stats_test "IPv4 PIM - Multicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:0d \ + -A 192.0.2.1 -B 224.0.0.13 -t ip proto=103 -p 100 -q + + devlink_trap_stats_test "IPv4 PIM - Unicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=103 -p 100 -q +} + +ipv6_pim_test() +{ + devlink_trap_stats_test "IPv6 PIM - Multicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:0d \ + -A fe80::1 -B ff02::d -t ip next=103 -p 100 -q + + devlink_trap_stats_test "IPv6 PIM - 
Unicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 -t ip next=103 -p 100 -q +} + +uc_loopback_test() +{ + # Add neighbours to the fake destination IPs, so that the packets are + # routed in the device and not trapped due to an unresolved neighbour + # exception. + ip -4 neigh add 192.0.2.3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + ip -6 neigh add 2001:db8:1::3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + + devlink_trap_stats_test "IPv4 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.3 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 -t udp sp=54321,dp=12345 \ + -p 100 -q + + ip -6 neigh del 2001:db8:1::3 dev $rp1 + ip -4 neigh del 192.0.2.3 dev $rp1 +} + +local_route_test() +{ + # Use a fake source IP to prevent the trap from being triggered twice + # when the router sends back a port unreachable message. + devlink_trap_stats_test "IPv4 Local Route" "local_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.3 -B 192.0.2.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Local Route" "local_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::3 -B 2001:db8:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q +} + +external_route_test() +{ + # Add a dummy device through which the incoming packets should be + # routed. 
+ ip link add name dummy10 up type dummy + ip address add 203.0.113.1/24 dev dummy10 + ip -6 address add 2001:db8:10::1/64 dev dummy10 + + devlink_trap_stats_test "IPv4 External Route" "external_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 203.0.113.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 External Route" "external_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:10::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 address del 2001:db8:10::1/64 dev dummy10 + ip address del 203.0.113.1/24 dev dummy10 + ip link del dev dummy10 +} + +ipv6_uc_dip_link_local_scope_test() +{ + # Add a dummy link-local prefix route to allow the packet to be routed. + ip -6 route add fe80:1::/64 dev $rp2 + + devlink_trap_stats_test \ + "IPv6 Unicast Destination IP With Link-Local Scope" \ + "ipv6_uc_dip_link_local_scope" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B fe80:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 route del fe80:1::/64 dev $rp2 +} + +ipv4_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv4#Options + p=$(: + )"94:"$( : Option Number + )"04:"$( : Option Length + )"00:00:"$( : Option Data + ) + echo $p +} + +ipv4_router_alert_test() +{ + devlink_trap_stats_test "IPv4 Router Alert" "ipv4_router_alert" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.3 \ + -t ip option=$(ipv4_router_alert_get) -p 100 -q +} + +ipv6_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv6_packet#Hop-by-hop_options_and_destination_options + # https://tools.ietf.org/html/rfc2711#section-2.1 + p=$(: + )"11:"$( : Next Header - UDP + )"00:"$( : Hdr Ext Len + )"05:02:00:00:00:00:"$( : Option Data + ) + echo $p +} + +ipv6_router_alert_test() +{ + devlink_trap_stats_test "IPv6 Router Alert" "ipv6_router_alert" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 \ + -t ip 
next=0,payload=$(ipv6_router_alert_get) -p 100 -q +} + +ipv6_dip_all_nodes_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Nodes Address\"" \ + "ipv6_dip_all_nodes" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A 2001:db8:1::1 -B ff02::1 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_dip_all_routers_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Routers Address\"" \ + "ipv6_dip_all_routers" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A 2001:db8:1::1 -B ff02::2 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_router_solicit_test() +{ + devlink_trap_stats_test "IPv6 Router Solicitation" \ + "ipv6_router_solicit" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A fe80::1 -B ff02::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 133) -p 100 -q +} + +ipv6_router_advert_test() +{ + devlink_trap_stats_test "IPv6 Router Advertisement" \ + "ipv6_router_advert" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 134) -p 100 -q +} + +ipv6_redirect_test() +{ + devlink_trap_stats_test "IPv6 Redirect Message" \ + "ipv6_redirect" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 137) -p 100 -q +} + +ptp_event_test() +{ + # PTP is only supported on Spectrum-1, for now. + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Sync (0) + devlink_trap_stats_test "PTP Time-Critical Event Message" "ptp_event" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=319,payload=10 -p 100 -q +} + +ptp_general_test() +{ + # PTP is only supported on Spectrum-1, for now. 
+ [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Announce (b) + devlink_trap_stats_test "PTP General Message" "ptp_general" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=320,payload=1b -p 100 -q +} + +flow_action_sample_test() +{ + # Install a filter that samples every incoming packet. + tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto all pref 1 handle 101 matchall \ + skip_sw action sample rate 1 group 1 + + devlink_trap_stats_test "Flow Sampling" "flow_action_sample" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto all pref 1 handle 101 matchall + tc qdisc del dev $rp1 clsact +} + +flow_action_trap_test() +{ + # Install a filter that traps a specific flow. + tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto ip pref 1 handle 101 flower \ + skip_sw ip_proto udp src_port 12345 dst_port 54321 action trap + + devlink_trap_stats_test "Flow Trapping (Logging)" "flow_action_trap" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto ip pref 1 handle 101 flower + tc qdisc del dev $rp1 clsact +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index e27236109235..f0e6be4c09e9 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -423,6 +423,29 @@ devlink_trap_drop_cleanup() tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower } +devlink_trap_stats_test() +{ + local test_name=$1; shift + local trap_name=$1; shift + local send_one="$@" + local t0_packets + local t1_packets + + RET=0 + + 
t0_packets=$(devlink_trap_rx_packets_get $trap_name) + + $send_one && sleep 1 + + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + + if [[ $t1_packets -eq $t0_packets ]]; then + check_err 1 "Trap stats did not increase" + fi + + log_test "$test_name" +} + devlink_trap_policers_num_get() { devlink -j -p trap policer show | jq '.[]["'$DEVLINK_DEV'"] | length' -- cgit v1.2.3 From 90040351a832acf862c8f1855c29411303d23755 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 23 May 2020 02:07:51 +0100 Subject: tools, bpftool: Clean subcommand help messages This is a clean-up for the formatting of the do_help functions for bpftool's subcommands. The following fixes are included: - Do not use argv[-2] for "iter" help message, as the help is shown by default if no "iter" action is selected, resulting in messages looking like "./bpftool bpftool pin...". - Do not print unused HELP_SPEC_PROGRAM in help message for "bpftool link". - Andrii used argument indexing to avoid having multiple occurrences of bin_name and argv[-2] in the fprintf() for the help message, for "bpftool gen" and "bpftool link". Let's reuse this for all other help functions. We can remove up to thirty arguments for the "bpftool map" help message. - Harmonise all functions, e.g. use ending quotes-comma on a separate line. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200523010751.23465-1-quentin@isovalent.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf.c | 8 ++++---- tools/bpf/bpftool/cgroup.c | 14 ++++++-------- tools/bpf/bpftool/feature.c | 6 +++--- tools/bpf/bpftool/gen.c | 6 +++--- tools/bpf/bpftool/iter.c | 8 ++++---- tools/bpf/bpftool/link.c | 1 - tools/bpf/bpftool/map.c | 41 ++++++++++++++++++----------------------- tools/bpf/bpftool/net.c | 12 ++++++------ tools/bpf/bpftool/perf.c | 2 +- tools/bpf/bpftool/prog.c | 27 ++++++++++++--------------- tools/bpf/bpftool/struct_ops.c | 15 +++++++-------- 11 files changed, 64 insertions(+), 76 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 41a1346934a1..c134666591a6 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -951,9 +951,9 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s btf { show | list } [id BTF_ID]\n" - " %s btf dump BTF_SRC [format FORMAT]\n" - " %s btf help\n" + "Usage: %1$s %2$s { show | list } [id BTF_ID]\n" + " %1$s %2$s dump BTF_SRC [format FORMAT]\n" + " %1$s %2$s help\n" "\n" " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" " FORMAT := { raw | c }\n" @@ -961,7 +961,7 @@ static int do_help(int argc, char **argv) " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, bin_name, bin_name); + bin_name, "btf"); return 0; } diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 27931db421d8..d901cc1b904a 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -491,20 +491,18 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } CGROUP [**effective**]\n" - " %s %s tree [CGROUP_ROOT] [**effective**]\n" - " %s %s attach CGROUP ATTACH_TYPE PROG [ATTACH_FLAGS]\n" - " %s %s detach CGROUP ATTACH_TYPE PROG\n" - " %s %s help\n" + 
"Usage: %1$s %2$s { show | list } CGROUP [**effective**]\n" + " %1$s %2$s tree [CGROUP_ROOT] [**effective**]\n" + " %1$s %2$s attach CGROUP ATTACH_TYPE PROG [ATTACH_FLAGS]\n" + " %1$s %2$s detach CGROUP ATTACH_TYPE PROG\n" + " %1$s %2$s help\n" "\n" HELP_SPEC_ATTACH_TYPES "\n" " " HELP_SPEC_ATTACH_FLAGS "\n" " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2]); return 0; } diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 1b73e63274b5..f05e9e57b593 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -937,12 +937,12 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" - " %s %s help\n" + "Usage: %1$s %2$s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" + " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" "", - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2]); return 0; } diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 0e5f0236cc76..a3c4bb86c05a 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -586,12 +586,12 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %1$s gen skeleton FILE\n" - " %1$s gen help\n" + "Usage: %1$s %2$s skeleton FILE\n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name); + bin_name, "gen"); return 0; } diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index eb5987a0c3b6..33240fcc6319 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -68,10 +68,10 @@ close_obj: static int do_help(int argc, char **argv) { fprintf(stderr, - "Usage: %s %s pin OBJ PATH\n" - " %s %s help\n" - "\n", - bin_name, argv[-2], bin_name, argv[-2]); + "Usage: %1$s %2$s pin OBJ PATH\n" + " %1$s %2$s help\n" + "", + bin_name, "iter"); 
return 0; } diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index b6a0b35c78ae..670a561dc31b 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -312,7 +312,6 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_LINK "\n" - " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 85cbe9a19170..c5fac8068ba1 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1561,24 +1561,24 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [MAP]\n" - " %s %s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" - " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" - " [dev NAME]\n" - " %s %s dump MAP\n" - " %s %s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" - " %s %s lookup MAP [key DATA]\n" - " %s %s getnext MAP [key DATA]\n" - " %s %s delete MAP key DATA\n" - " %s %s pin MAP FILE\n" - " %s %s event_pipe MAP [cpu N index M]\n" - " %s %s peek MAP\n" - " %s %s push MAP value VALUE\n" - " %s %s pop MAP\n" - " %s %s enqueue MAP value VALUE\n" - " %s %s dequeue MAP\n" - " %s %s freeze MAP\n" - " %s %s help\n" + "Usage: %1$s %2$s { show | list } [MAP]\n" + " %1$s %2$s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" + " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" + " [dev NAME]\n" + " %1$s %2$s dump MAP\n" + " %1$s %2$s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" + " %1$s %2$s lookup MAP [key DATA]\n" + " %1$s %2$s getnext MAP [key DATA]\n" + " %1$s %2$s delete MAP key DATA\n" + " %1$s %2$s pin MAP FILE\n" + " %1$s %2$s event_pipe MAP [cpu N index M]\n" + " %1$s %2$s peek MAP\n" + " %1$s %2$s push MAP value VALUE\n" + " %1$s %2$s pop MAP\n" + " %1$s %2$s enqueue MAP value VALUE\n" + " %1$s %2$s dequeue MAP\n" + " %1$s %2$s freeze MAP\n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_MAP "\n" " DATA := { [hex] BYTES }\n" @@ -1593,11 
+1593,6 @@ static int do_help(int argc, char **argv) " queue | stack | sk_storage | struct_ops }\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); return 0; diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index c5e3895b7c8b..56c3a2bae3ef 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -458,10 +458,10 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [dev <devname>]\n" - " %s %s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n" - " %s %s detach ATTACH_TYPE dev <devname>\n" - " %s %s help\n" + "Usage: %1$s %2$s { show | list } [dev <devname>]\n" + " %1$s %2$s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n" + " %1$s %2$s detach ATTACH_TYPE dev <devname>\n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_PROGRAM "\n" " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" @@ -470,8 +470,8 @@ static int do_help(int argc, char **argv) " For progs attached to cgroups, use \"bpftool cgroup\"\n" " to dump program attachments.
For program types\n" " sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n" - " consult iproute2.\n", - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + " consult iproute2.\n" + "", bin_name, argv[-2]); return 0; diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c index 3341aa14acda..ad23934819c7 100644 --- a/tools/bpf/bpftool/perf.c +++ b/tools/bpf/bpftool/perf.c @@ -231,7 +231,7 @@ static int do_show(int argc, char **argv) static int do_help(int argc, char **argv) { fprintf(stderr, - "Usage: %s %s { show | list | help }\n" + "Usage: %1$s %2$s { show | list | help }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 245f941fdbcf..a5eff83496f2 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1984,24 +1984,24 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [PROG]\n" - " %s %s dump xlated PROG [{ file FILE | opcodes | visual | linum }]\n" - " %s %s dump jited PROG [{ file FILE | opcodes | linum }]\n" - " %s %s pin PROG FILE\n" - " %s %s { load | loadall } OBJ PATH \\\n" + "Usage: %1$s %2$s { show | list } [PROG]\n" + " %1$s %2$s dump xlated PROG [{ file FILE | opcodes | visual | linum }]\n" + " %1$s %2$s dump jited PROG [{ file FILE | opcodes | linum }]\n" + " %1$s %2$s pin PROG FILE\n" + " %1$s %2$s { load | loadall } OBJ PATH \\\n" " [type TYPE] [dev NAME] \\\n" " [map { idx IDX | name NAME } MAP]\\\n" " [pinmaps MAP_DIR]\n" - " %s %s attach PROG ATTACH_TYPE [MAP]\n" - " %s %s detach PROG ATTACH_TYPE [MAP]\n" - " %s %s run PROG \\\n" + " %1$s %2$s attach PROG ATTACH_TYPE [MAP]\n" + " %1$s %2$s detach PROG ATTACH_TYPE [MAP]\n" + " %1$s %2$s run PROG \\\n" " data_in FILE \\\n" " [data_out FILE [data_size_out L]] \\\n" " [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n" " [repeat N]\n" - " %s %s profile PROG [duration DURATION] METRICs\n" - " %s %s tracelog\n" - " %s %s help\n" + " %1$s %2$s profile PROG [duration 
DURATION] METRICs\n" + " %1$s %2$s tracelog\n" + " %1$s %2$s help\n" "\n" " " HELP_SPEC_MAP "\n" " " HELP_SPEC_PROGRAM "\n" @@ -2022,10 +2022,7 @@ static int do_help(int argc, char **argv) " METRIC := { cycles | instructions | l1d_loads | llc_misses }\n" " " HELP_SPEC_OPTIONS "\n" "", - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2]); return 0; } diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index e17738479edc..b58b91f62ffb 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -566,16 +566,15 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s { show | list } [STRUCT_OPS_MAP]\n" - " %s %s dump [STRUCT_OPS_MAP]\n" - " %s %s register OBJ\n" - " %s %s unregister STRUCT_OPS_MAP\n" - " %s %s help\n" + "Usage: %1$s %2$s { show | list } [STRUCT_OPS_MAP]\n" + " %1$s %2$s dump [STRUCT_OPS_MAP]\n" + " %1$s %2$s register OBJ\n" + " %1$s %2$s unregister STRUCT_OPS_MAP\n" + " %1$s %2$s help\n" "\n" " OPTIONS := { {-j|--json} [{-p|--pretty}] }\n" - " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n", - bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2], + " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n" + "", bin_name, argv[-2]); return 0; -- cgit v1.2.3 From 73a4f0407e67cdfdf55dd94f573ed4ee2d0d62fe Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 23 May 2020 02:02:47 +0100 Subject: tools, bpftool: Make capability check account for new BPF caps Following the introduction of CAP_BPF, and the switch from CAP_SYS_ADMIN to other capabilities for various BPF features, update the capability checks (and potentially, drops) in bpftool for feature probes. 
Because bpftool and/or the system might not know of CAP_BPF yet, some caution is necessary: - If compiled and run on a system with CAP_BPF, check CAP_BPF, CAP_SYS_ADMIN, CAP_PERFMON, CAP_NET_ADMIN. - Guard against CAP_BPF being undefined, to allow compiling bpftool from latest sources on older systems. If the system where feature probes are run does not know of CAP_BPF, stop checking after CAP_SYS_ADMIN, as this should be the only capability required for all the BPF probing. - If compiled from latest sources on a system without CAP_BPF, but later executed on a newer system with CAP_BPF knowledge, then we only test CAP_SYS_ADMIN. Some probes may fail if the bpftool process has CAP_SYS_ADMIN but misses the other capabilities. The alternative would be to redefine the value for CAP_BPF in bpftool, but this does not look clean, and the case sounds relatively rare anyway. Note that libcap offers a cap_to_name() function to retrieve the name of a given capability (e.g. "cap_sys_admin"). We do not use it because deriving the names from the macros looks simpler than using cap_to_name() (doing a strdup() on the string) + cap_free() + handling the case of failed allocations, when we just want to use the name of the capability in an error message. The checks when compiling without libcap (i.e. root versus non-root) are unchanged. v2: - Do not allocate cap_list dynamically. - Drop BPF-related capabilities when running with "unprivileged", even if we didn't have the full set in the first place (in v1, we would skip dropping them in that case). - Keep track of what capabilities we have, print the names of the missing ones for privileged probing. - Attempt to drop only the capabilities we actually have. - Rename a couple variables. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200523010247.20654-1-quentin@isovalent.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/feature.c | 85 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index f05e9e57b593..768bf77df886 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -758,11 +758,29 @@ static void section_misc(const char *define_prefix, __u32 ifindex) print_end_section(); } +#ifdef USE_LIBCAP +#define capability(c) { c, false, #c } +#define capability_msg(a, i) a[i].set ? "" : a[i].name, a[i].set ? "" : ", " +#endif + static int handle_perms(void) { #ifdef USE_LIBCAP - cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; - bool has_sys_admin_cap = false; + struct { + cap_value_t cap; + bool set; + char name[14]; /* strlen("CAP_SYS_ADMIN") */ + } bpf_caps[] = { + capability(CAP_SYS_ADMIN), +#ifdef CAP_BPF + capability(CAP_BPF), + capability(CAP_NET_ADMIN), + capability(CAP_PERFMON), +#endif + }; + cap_value_t cap_list[ARRAY_SIZE(bpf_caps)]; + unsigned int i, nb_bpf_caps = 0; + bool cap_sys_admin_only = true; cap_flag_value_t val; int res = -1; cap_t caps; @@ -774,35 +792,64 @@ static int handle_perms(void) return -1; } - if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val)) { - p_err("bug: failed to retrieve CAP_SYS_ADMIN status"); - goto exit_free; - } - if (val == CAP_SET) - has_sys_admin_cap = true; +#ifdef CAP_BPF + if (CAP_IS_SUPPORTED(CAP_BPF)) + cap_sys_admin_only = false; +#endif - if (!run_as_unprivileged && !has_sys_admin_cap) { - p_err("full feature probing requires CAP_SYS_ADMIN, run as root or use 'unprivileged'"); - goto exit_free; + for (i = 0; i < ARRAY_SIZE(bpf_caps); i++) { + const char *cap_name = bpf_caps[i].name; + cap_value_t cap = bpf_caps[i].cap; + + if (cap_get_flag(caps, cap, CAP_EFFECTIVE, &val)) { + 
p_err("bug: failed to retrieve %s status: %s", cap_name, + strerror(errno)); + goto exit_free; + } + + if (val == CAP_SET) { + bpf_caps[i].set = true; + cap_list[nb_bpf_caps++] = cap; + } + + if (cap_sys_admin_only) + /* System does not know about CAP_BPF, meaning that + * CAP_SYS_ADMIN is the only capability required. We + * just checked it, break. + */ + break; } - if ((run_as_unprivileged && !has_sys_admin_cap) || - (!run_as_unprivileged && has_sys_admin_cap)) { + if ((run_as_unprivileged && !nb_bpf_caps) || + (!run_as_unprivileged && nb_bpf_caps == ARRAY_SIZE(bpf_caps)) || + (!run_as_unprivileged && cap_sys_admin_only && nb_bpf_caps)) { /* We are all good, exit now */ res = 0; goto exit_free; } - /* if (run_as_unprivileged && has_sys_admin_cap), drop CAP_SYS_ADMIN */ + if (!run_as_unprivileged) { + if (cap_sys_admin_only) + p_err("missing %s, required for full feature probing; run as root or use 'unprivileged'", + bpf_caps[0].name); + else + p_err("missing %s%s%s%s%s%s%s%srequired for full feature probing; run as root or use 'unprivileged'", + capability_msg(bpf_caps, 0), + capability_msg(bpf_caps, 1), + capability_msg(bpf_caps, 2), + capability_msg(bpf_caps, 3)); + goto exit_free; + } - if (cap_set_flag(caps, CAP_EFFECTIVE, ARRAY_SIZE(cap_list), cap_list, + /* if (run_as_unprivileged && nb_bpf_caps > 0), drop capabilities. */ + if (cap_set_flag(caps, CAP_EFFECTIVE, nb_bpf_caps, cap_list, CAP_CLEAR)) { - p_err("bug: failed to clear CAP_SYS_ADMIN from capabilities"); + p_err("bug: failed to clear capabilities: %s", strerror(errno)); goto exit_free; } if (cap_set_proc(caps)) { - p_err("failed to drop CAP_SYS_ADMIN: %s", strerror(errno)); + p_err("failed to drop capabilities: %s", strerror(errno)); goto exit_free; } @@ -817,7 +864,7 @@ exit_free: return res; #else - /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). + /* Detection assumes user has specific privileges. 
* We do not use libpcap so let's approximate, and restrict usage to * root user only. */ @@ -901,7 +948,7 @@ static int do_probe(int argc, char **argv) } } - /* Full feature detection requires CAP_SYS_ADMIN privilege. + /* Full feature detection requires specific privileges. * Let's approximate, and warn if user is not root. */ if (handle_perms()) -- cgit v1.2.3 From dc3ca5cf3e0be9fb73f4691247367d76a22bf30b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 25 May 2020 15:54:21 +0200 Subject: tools, bpftool: Print correct error message when failing to load BTF btf__parse_raw and btf__parse_elf return negative error numbers wrapped in an ERR_PTR, so the extracted value needs to be negated before passing them to strerror which expects a positive error number. Before: Error: failed to load BTF from .../vmlinux: Unknown error -2 After: Error: failed to load BTF from .../vmlinux: No such file or directory Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200525135421.4154-1-tklauser@distanz.ch Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index c134666591a6..faac8189b285 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -553,7 +553,7 @@ static int do_dump(int argc, char **argv) btf = btf__parse_elf(*argv, NULL); if (IS_ERR(btf)) { - err = PTR_ERR(btf); + err = -PTR_ERR(btf); btf = NULL; p_err("failed to load BTF from %s: %s", *argv, strerror(err)); -- cgit v1.2.3 From 272d51af32890632134845ddf35318c11da20c7b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 26 May 2020 11:21:42 +0200 Subject: libbpf: Add API to consume the perf ring buffer content This new API, perf_buffer__consume, can be used as follows: - When you have a perf ring where wakeup_events is higher than 1, and you have remaining data in the rings you would like to pull out on 
exit (or maybe based on a timeout). - For low latency cases where you burn a CPU that constantly polls the queues. Signed-off-by: Eelco Chaudron Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159048487929.89441.7465713173442594608.stgit@ebuild Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 19 +++++++++++++++++++ tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fa04cbe547ed..5d60de6fd818 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8456,6 +8456,25 @@ int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) return cnt < 0 ? -errno : cnt; } +int perf_buffer__consume(struct perf_buffer *pb) +{ + int i, err; + + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("error while processing records: %d\n", err); + return err; + } + } + return 0; +} + struct bpf_prog_info_array_desc { int array_offset; /* e.g. offset of jited_prog_insns */ int count_offset; /* e.g. 
offset of jited_prog_len */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 8ea69558f0a8..1e2e399a5f2c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -533,6 +533,7 @@ perf_buffer__new_raw(int map_fd, size_t page_cnt, LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); +LIBBPF_API int perf_buffer__consume(struct perf_buffer *pb); typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 0133d469d30b..381a7342ecfc 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -262,4 +262,5 @@ LIBBPF_0.0.9 { bpf_link_get_fd_by_id; bpf_link_get_next_id; bpf_program__attach_iter; + perf_buffer__consume; } LIBBPF_0.0.8; -- cgit v1.2.3 From 93581359e7aeb11358018f2e3a737776d1e899ae Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 26 May 2020 20:46:12 +0300 Subject: libbpf: Install headers as part of make install Current 'make install' results in only pkg-config and library binaries being installed. 
For consistency also install headers as part of "make install" Signed-off-by: Nikolay Borisov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200526174612.5447-1-nborisov@suse.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index aee7f1a83c77..d02c4d910aad 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -264,7 +264,7 @@ install_pkgconfig: $(PC_FILE) $(call QUIET_INSTALL, $(PC_FILE)) \ $(call do_install,$(PC_FILE),$(libdir_SQ)/pkgconfig,644) -install: install_lib install_pkgconfig +install: install_lib install_pkgconfig install_headers ### Cleaning rules -- cgit v1.2.3 From 55983299b7ea94d714c19cdfd8d969ba86e0d7e9 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Mon, 25 May 2020 09:18:46 +0300 Subject: libbpf: Use .so dynamic symbols for abi check Since dynamic symbols are used for dynamic linking it makes sense to use them (readelf --dyn-syms) for abi check. Found with some configuration on powerpc where linker puts local *.plt_call.* symbols into .so. 
Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200525061846.16524-1-yauheni.kaliuta@redhat.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index d02c4d910aad..bf8ed134cb8a 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -151,7 +151,7 @@ GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN_SHARED) | \ sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \ sort -u | wc -l) -VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ +VERSIONED_SYM_COUNT = $(shell readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l) CMD_TARGETS = $(LIB_TARGET) $(PC_FILE) @@ -218,7 +218,7 @@ check_abi: $(OUTPUT)libbpf.so sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'| \ sort -u > $(OUTPUT)libbpf_global_syms.tmp; \ - readelf -s --wide $(OUTPUT)libbpf.so | \ + readelf --dyn-syms --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | \ sort -u > $(OUTPUT)libbpf_versioned_syms.tmp; \ diff -u $(OUTPUT)libbpf_global_syms.tmp \ -- cgit v1.2.3 From 13d70f5a5ecff367db2fb18ed4ebe433eab8a74c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:15 -0700 Subject: bpf, sk_msg: Add get socket storage helpers Add helpers to use local socket storage. 
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033907577.12355.14740125020572756560.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 19 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index c3b496a19748..a6fc23447f12 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6449,6 +6449,10 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -7273,6 +7277,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): @@ -8609,6 +8618,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case 
offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { -- cgit v1.2.3 From 1d9c037a898b3c0344cfe5064ba6c482bf9b46b0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:36 -0700 Subject: bpf, selftests: Add sk_msg helpers load and attach test The test itself is not particularly useful but it encodes a common pattern we have. Namely do a sk storage lookup then depending on data here decide if we need to do more work or alternatively allow packet to PASS. Then if we need to do more work consult task_struct for more information about the running task. Finally based on this additional information drop or pass the data. In this case the suspicious check is not so realisitic but it encodes the general pattern and uses the helpers so we test the workflow. This is a load test to ensure verifier correctly handles this case. 
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159033909665.12355.6166415847337547879.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 35 ++++++++++++++++ .../selftests/bpf/progs/test_skmsg_load_helpers.c | 47 ++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index aa43e0bd210c..96e7b7f84c65 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare +#include #include "test_progs.h" +#include "test_skmsg_load_helpers.skel.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -70,10 +72,43 @@ out: close(s); } +static void test_skmsg_helpers(enum bpf_map_type map_type) +{ + struct test_skmsg_load_helpers *skel; + int err, map, verdict; + + skel = test_skmsg_load_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_skmsg_load_helpers__open_and_load"); + return; + } + + verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach"); + goto out; + } + + err = bpf_prog_detach2(verdict, map, BPF_SK_MSG_VERDICT); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach2"); + goto out; + } +out: + test_skmsg_load_helpers__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); if (test__start_subtest("sockhash create_update_free")) 
test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH); + if (test__start_subtest("sockmap sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c new file mode 100644 index 000000000000..45e8fc75a739 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Isovalent, Inc. +#include "vmlinux.h" +#include + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, __u32); + __type(value, __u64); +} socket_storage SEC(".maps"); + +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int verdict = SK_PASS; + __u32 pid, tpid; + __u64 *sk_stg; + + pid = bpf_get_current_pid_tgid() >> 32; + sk_stg = bpf_sk_storage_get(&socket_storage, msg->sk, 0, BPF_SK_STORAGE_GET_F_CREATE); + if (!sk_stg) + return SK_DROP; + *sk_stg = pid; + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + if (pid != tpid) + verdict = SK_DROP; + bpf_sk_storage_delete(&socket_storage, (void *)msg->sk); + return verdict; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From ee103e9f1544e04ecd1db5eb5e9eb9a8b8698879 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:57 -0700 Subject: bpf, selftests: Test probe_* helpers from SCHED_CLS Lets test using probe* in SCHED_CLS network programs as 
well just to be sure these keep working. Its cheap to add the extra test and provides a second context to test outside of sk_msg after we generalized probe* helpers to all networking types. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159033911685.12355.15951980509828906214.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/skb_helpers.c | 30 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/test_skb_helpers.c | 28 ++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/skb_helpers.c create mode 100644 tools/testing/selftests/bpf/progs/test_skb_helpers.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c new file mode 100644 index 000000000000..f302ad84a298 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +void test_skb_helpers(void) +{ + struct __sk_buff skb = { + .wire_len = 100, + .gso_segs = 8, + .gso_size = 10, + }; + struct bpf_prog_test_run_attr tattr = { + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + .ctx_out = &skb, + .ctx_size_out = sizeof(skb), + }; + struct bpf_object *obj; + int err; + + err = bpf_prog_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + err = bpf_prog_test_run_xattr(&tattr); + CHECK_ATTR(err, "len", "err %d errno %d\n", err, errno); + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/progs/test_skb_helpers.c b/tools/testing/selftests/bpf/progs/test_skb_helpers.c new file mode 100644 index 000000000000..bb3fbf1a29e3 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/test_skb_helpers.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "vmlinux.h" +#include +#include + +#define TEST_COMM_LEN 16 + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u32); +} cgroup_map SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("classifier/test_skb_helpers") +int test_skb_helpers(struct __sk_buff *skb) +{ + struct task_struct *task; + char comm[TEST_COMM_LEN]; + __u32 tpid; + + task = (struct task_struct *)bpf_get_current_task(); + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + bpf_probe_read_kernel_str(&comm, sizeof(comm), &task->comm); + return 0; +} -- cgit v1.2.3 From 601b05ca6edb0422bf6ce313fbfd55ec7bbbc0fd Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 27 May 2020 10:42:00 +0200 Subject: libbpf: Fix perf_buffer__free() API for sparse allocs In case the cpu_bufs are sparsely allocated they are not all free'ed. These changes will fix this. 
Fixes: fb84b8224655 ("libbpf: add perf buffer API") Signed-off-by: Eelco Chaudron Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159056888305.330763.9684536967379110349.stgit@ebuild Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5d60de6fd818..74d967619dcf 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8137,9 +8137,12 @@ void perf_buffer__free(struct perf_buffer *pb) if (!pb) return; if (pb->cpu_bufs) { - for (i = 0; i < pb->cpu_cnt && pb->cpu_bufs[i]; i++) { + for (i = 0; i < pb->cpu_cnt; i++) { struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + if (!cpu_buf) + continue; + bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key); perf_buffer__free_cpu_buf(pb, cpu_buf); } -- cgit v1.2.3 From 204fb0413a92342d31f3e2557db0bb5babed586c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:56 +0000 Subject: selftests/bpf: Fix a typo in test_maps Trivial fix to a typo in the test_map_wronly test: "read" -> "write" Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index c6766b2cff85..f717acc0c68d 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1410,7 +1410,7 @@ static void test_map_wronly(void) fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), MAP_SIZE, map_flags | BPF_F_WRONLY); if (fd < 0) { - printf("Failed to create map for read only test '%s'!\n", + printf("Failed to create map for write only test '%s'!\n", strerror(errno)); 
exit(1); } -- cgit v1.2.3 From 36ef9a2d3f764a37cf3d8e619bfebf5c99c070a0 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:57 +0000 Subject: selftests/bpf: Cleanup some file descriptors in test_maps The test_map_rdonly and test_map_wronly tests should close file descriptors which they open. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index f717acc0c68d..46cf2c232964 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1401,6 +1401,8 @@ static void test_map_rdonly(void) /* Check that key=2 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT); + + close(fd); } static void test_map_wronly(void) @@ -1423,6 +1425,8 @@ static void test_map_wronly(void) /* Check that key=2 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); + + close(fd); } static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, -- cgit v1.2.3 From efbc3b8fe1e6259777670aadf931500545073c6c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:58 +0000 Subject: selftests/bpf: Cleanup comments in test_maps Make comments inside the test_map_rdonly and test_map_wronly tests consistent with logic. 
Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-4-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 46cf2c232964..08d63948514a 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1394,11 +1394,11 @@ static void test_map_rdonly(void) key = 1; value = 1234; - /* Insert key=1 element. */ + /* Try to insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 && errno == EPERM); - /* Check that key=2 is not found. */ + /* Check that key=1 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT); @@ -1422,7 +1422,7 @@ static void test_map_wronly(void) /* Insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); - /* Check that key=2 is not found. */ + /* Check that reading elements and keys from the map is not allowed. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); -- cgit v1.2.3 From 43dd115b1fffdd8d2c4cc15659c00b2a1addbc43 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:57:00 +0000 Subject: selftests/bpf: Add tests for write-only stacks/queues For write-only stacks and queues bpf_map_update_elem should be allowed, but bpf_map_lookup_elem and bpf_map_lookup_and_delete_elem should fail with EPERM. 
Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-6-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 40 ++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 08d63948514a..6a12a0e01e07 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1405,7 +1405,7 @@ static void test_map_rdonly(void) close(fd); } -static void test_map_wronly(void) +static void test_map_wronly_hash(void) { int fd, key = 0, value = 0; @@ -1429,6 +1429,44 @@ static void test_map_wronly(void) close(fd); } +static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type) +{ + int fd, value = 0; + + assert(map_type == BPF_MAP_TYPE_QUEUE || + map_type == BPF_MAP_TYPE_STACK); + fd = bpf_create_map(map_type, 0, sizeof(value), MAP_SIZE, + map_flags | BPF_F_WRONLY); + /* Stack/Queue maps do not support BPF_F_NO_PREALLOC */ + if (map_flags & BPF_F_NO_PREALLOC) { + assert(fd < 0 && errno == EINVAL); + return; + } + if (fd < 0) { + printf("Failed to create map '%s'!\n", strerror(errno)); + exit(1); + } + + value = 1234; + assert(bpf_map_update_elem(fd, NULL, &value, BPF_ANY) == 0); + + /* Peek element should fail */ + assert(bpf_map_lookup_elem(fd, NULL, &value) == -1 && errno == EPERM); + + /* Pop element should fail */ + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) == -1 && + errno == EPERM); + + close(fd); +} + +static void test_map_wronly(void) +{ + test_map_wronly_hash(); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_STACK); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_QUEUE); +} + static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, __s64 *fds64, __u64 *sk_cookies, unsigned int n) -- cgit v1.2.3 From 457f44363a8894135c85b7a9afd2bd8196db24ab 
Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:20 -0700 Subject: bpf: Implement BPF ring buffer and verifier support for it This commit adds a new MPSC ring buffer implementation into BPF ecosystem, which allows multiple CPUs to submit data to a single shared ring buffer. On the consumption side, only single consumer is assumed. Motivation ---------- There are two distinctive motivators for this work, which are not satisfied by existing perf buffer, which prompted creation of a new ring buffer implementation. - more efficient memory utilization by sharing ring buffer across CPUs; - preserving ordering of events that happen sequentially in time, even across multiple CPUs (e.g., fork/exec/exit events for a task). These two problems are independent, but perf buffer fails to satisfy both. Both are a result of a choice to have per-CPU perf ring buffer. Both can be also solved by having an MPSC implementation of ring buffer. The ordering problem could technically be solved for perf buffer with some in-kernel counting, but given the first one requires an MPSC buffer, the same solution would solve the second problem automatically. Semantics and APIs ------------------ Single ring buffer is presented to BPF programs as an instance of BPF map of type BPF_MAP_TYPE_RINGBUF. Two other alternatives were considered, but ultimately rejected. One way would be to, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, make BPF_MAP_TYPE_RINGBUF represent an array of ring buffers, but not enforce "same CPU only" rule. This would be more familiar interface compatible with existing perf buffer use in BPF, but would fail if application needed more advanced logic to lookup ring buffer by arbitrary key. HASH_OF_MAPS addresses this with current approach. Additionally, given the performance of BPF ringbuf, many use cases would just opt into a simple single ring buffer shared among all CPUs, for which current approach would be an overkill.
Another approach could introduce a new concept, alongside BPF map, to represent generic "container" object, which doesn't necessarily have key/value interface with lookup/update/delete operations. This approach would add a lot of extra infrastructure that has to be built for observability and verifier support. It would also add another concept that BPF developers would have to familiarize themselves with, new syntax in libbpf, etc. But then would really provide no additional benefits over the approach of using a map. BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but neither do a few other map types (e.g., queue and stack; array doesn't support delete, etc). The approach chosen has an advantage of re-using existing BPF map infrastructure (introspection APIs in kernel, libbpf support, etc), being familiar concept (no need to teach users a new type of object in BPF program), and utilizing existing tooling (bpftool). For common scenario of using a single ring buffer for all CPUs, it's as simple and straightforward, as would be with a dedicated "container" object. On the other hand, by being a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to implement a wide variety of topologies, from one ring buffer for each CPU (e.g., as a replacement for perf buffer use cases), to a complicated application hashing/sharding of ring buffers (e.g., having a small pool of ring buffers with hashed task's tgid being a look up key to preserve order, but reduce contention). Key and value sizes are enforced to be zero. max_entries is used to specify the size of ring buffer and has to be a power of 2 value.
There are a bunch of similarities between perf buffer (BPF_MAP_TYPE_PERF_EVENT_ARRAY) and new BPF ring buffer semantics: - variable-length records; - if there is no more space left in ring buffer, reservation fails, no blocking; - memory-mappable data area for user-space applications for ease of consumption and high performance; - epoll notifications for new incoming data; - but still the ability to do busy polling for new data to achieve the lowest latency, if necessary. BPF ringbuf provides two sets of APIs to BPF programs: - bpf_ringbuf_output() allows to *copy* data from one place to a ring buffer, similarly to bpf_perf_event_output(); - bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs split the whole process into two steps. First, a fixed amount of space is reserved. If successful, a pointer to a data inside ring buffer data area is returned, which BPF programs can use similarly to a data inside array/hash maps. Once ready, this piece of memory is either committed or discarded. Discard is similar to commit, but makes consumer ignore the record. bpf_ringbuf_output() has disadvantage of incurring extra memory copy, because record has to be prepared in some other place first. But it allows to submit records of the length that's not known to verifier beforehand. It also closely matches bpf_perf_event_output(), so will simplify migration significantly. bpf_ringbuf_reserve() avoids the extra copy of memory by providing a memory pointer directly to ring buffer memory. In a lot of cases records are larger than BPF stack space allows, so many programs have to use an extra per-CPU array as a temporary heap for preparing a sample. bpf_ringbuf_reserve() avoids this need completely. But in exchange, it only allows a known constant size of memory to be reserved, such that verifier can verify that BPF program can't access memory outside its reserved record space.
bpf_ringbuf_output(), while slightly slower due to extra memory copy, covers some use cases that are not suitable for bpf_ringbuf_reserve(). The difference between commit and discard is very small. Discard just marks a record as discarded, and such records are supposed to be ignored by consumer code. Discard is useful for some advanced use-cases, such as ensuring all-or-nothing multi-record submission, or emulating temporary malloc()/free() within single BPF program invocation. Each reserved record is tracked by verifier through existing reference-tracking logic, similar to socket ref-tracking. It is thus impossible to reserve a record, but forget to submit (or discard) it. bpf_ringbuf_query() helper allows to query various properties of ring buffer. Currently 4 are supported: - BPF_RB_AVAIL_DATA returns amount of unconsumed data in ring buffer; - BPF_RB_RING_SIZE returns the size of ring buffer; - BPF_RB_CONS_POS/BPF_RB_PROD_POS returns current logical position of consumer/producer, respectively. Returned values are momentary snapshots of ring buffer state and could be off by the time helper returns, so this should be used only for debugging/reporting reasons or for implementing various heuristics, that take into account highly-changeable nature of some of those characteristics. One such heuristic might involve more fine-grained control over poll/epoll notifications about new data availability in ring buffer. Together with BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for output/commit/discard helpers, it allows BPF program a high degree of control and, e.g., more efficient batched notifications. Default self-balancing strategy, though, should be adequate for most applications and will work reliably and efficiently already.
Design and implementation ------------------------- This reserve/commit schema allows a natural way for multiple producers, either on different CPUs or even on the same CPU/in the same BPF program, to reserve independent records and work with them without blocking other producers. This means that if BPF program was interrupted by another BPF program sharing the same ring buffer, they will both get a record reserved (provided there is enough space left) and can work with it and submit it independently. This applies to NMI context as well, except that due to using a spinlock during reservation, in NMI context, bpf_ringbuf_reserve() might fail to get a lock, in which case reservation will fail even if ring buffer is not full. The ring buffer itself internally is implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters (which might wrap around on 32-bit architectures, that's not a problem): - consumer counter shows up to which logical position consumer consumed the data; - producer counter denotes amount of data reserved by all producers. Each time a record is reserved, producer that "owns" the record will successfully advance producer counter. At that point, data is still not yet ready to be consumed, though. Each record has 8 byte header, which contains the length of reserved record, as well as two extra bits: busy bit to denote that record is still being worked on, and discard bit, which might be set at commit time if record is discarded. In the latter case, consumer is supposed to skip the record and move on to the next one. Record header also encodes record's relative offset from the beginning of ring buffer data area (in pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only the pointer to the record itself, without requiring also the pointer to ring buffer itself. Ring buffer memory location will be restored from record metadata header. 
This significantly simplifies verifier, as well as improving API usability. Producer counter increments are serialized under spinlock, so there is a strict ordering between reservations. Commits, on the other hand, are completely lockless and independent. All records become available to consumer in the order of reservations, but only after all previous records were already committed. It is thus possible for slow producers to temporarily hold off submitted records, that were reserved later. Reservation/commit/consumer protocol is verified by litmus tests in Documentation/litmus-test/bpf-rb. One interesting implementation bit, that significantly simplifies (and thus speeds up as well) implementation of both producers and consumers is how data area is mapped twice contiguously back-to-back in the virtual memory. This allows to not take any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be first data page again, and thus the sample will still appear completely contiguous in virtual memory. See comment and a simple ASCII diagram showing this visually in bpf_ringbuf_area_alloc(). Another feature that distinguishes BPF ringbuf from perf ring buffer is self-pacing notifications of new data availability. bpf_ringbuf_commit() implementation will send a notification of new record being available after commit only if consumer has already caught up right up to the record being committed. If not, consumer still has to catch up and thus will see new data anyways without needing an extra poll notification. Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that this allows to achieve a very high throughput without having to resort to tricks like "notify only every Nth sample", which are necessary with perf buffer. 
For extreme cases, when BPF program wants more manual control of notifications, commit/discard/output helpers accept BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data availability, but require extra caution and diligence in using this API. Comparison to alternatives -------------------------- Before considering implementing BPF ring buffer from scratch, existing alternatives in kernel were evaluated, but didn't seem to meet the needs. They largely fell into a few categories: - per-CPU buffers (perf, ftrace, etc), which don't satisfy two motivations outlined above (ordering and memory consumption); - linked list-based implementations; while some were multi-producer designs, consuming these from user-space would be very complicated and most probably not performant; memory-mapping contiguous piece of memory is simpler and more performant for user-space consumers; - io_uring is SPSC, but also requires fixed-sized elements. Naively turning SPSC queue into MPSC w/ lock would have subpar performance compared to locked reserve + lockless commit, as with BPF ring buffer. Fixed sized elements would be too limiting for BPF programs, given existing BPF programs heavily rely on variable-sized perf buffer already; - specialized implementations (like a new printk ring buffer, [0]) with lots of printk-specific limitations and implications, that didn't seem to fit well for intended use with BPF programs. 
[0] https://lwn.net/Articles/779550/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 + include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 4 + include/uapi/linux/bpf.h | 84 +++- kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 10 + kernel/bpf/ringbuf.c | 501 +++++++++++++++++++++ kernel/bpf/syscall.c | 12 + kernel/bpf/verifier.c | 195 ++++++-- kernel/trace/bpf_trace.c | 10 + tools/include/uapi/linux/bpf.h | 84 +++- tools/testing/selftests/bpf/verifier/and.c | 4 +- .../testing/selftests/bpf/verifier/array_access.c | 4 +- tools/testing/selftests/bpf/verifier/bounds.c | 6 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../selftests/bpf/verifier/direct_value_access.c | 4 +- .../selftests/bpf/verifier/helper_access_var_len.c | 2 +- .../selftests/bpf/verifier/helper_value_access.c | 6 +- .../selftests/bpf/verifier/value_ptr_arith.c | 8 +- 19 files changed, 882 insertions(+), 70 deletions(-) create mode 100644 kernel/bpf/ringbuf.c (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efe8836b5c48..e5884f7f801c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -90,6 +90,8 @@ struct bpf_map_ops { int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); + __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts); }; struct bpf_map_memory { @@ -244,6 +246,9 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ 
}; /* type of values returned from helper functions */ @@ -255,6 +260,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ + RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -322,6 +328,8 @@ enum bpf_reg_type { PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ + PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; +extern const struct bpf_func_proto bpf_ringbuf_output_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_proto; +extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 29d22752fc87..fa8e1b552acd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) 
BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ea833087e853..ca08db4ffb5f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -54,6 +54,8 @@ struct bpf_reg_state { u32 btf_id; /* for PTR_TO_BTF_ID */ + u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* Max size from any of the above. */ unsigned long raw; }; @@ -63,6 +65,8 @@ struct bpf_reg_state { * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. + * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation + * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. 
+ * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 375b933010dd..8fca02f64811 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4fb634275e..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define 
RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 
1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + 
return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + 
.map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr 
= (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, 
+ .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c969a9b90d3..9de3540fa90c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == 
BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -662,6 +663,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d725a26f66e..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL; + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -436,7 +440,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == 
BPF_FUNC_ringbuf_discard; } static bool may_be_acquire_function(enum bpf_func_id func_id) @@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem; + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; } static bool is_acquire_function(enum bpf_func_id func_id, @@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp) + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -494,6 +502,8 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case 
PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. 
*/ @@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; 
/* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. 
*/ env->prog->aux->max_pkt_offset = @@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3791,13 +3850,29 @@ static int 
check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. */ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != 
BPF_FUNC_get_stackid) goto error; @@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_TCP_SOCK; } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187cd6995bbb..3767d34114c0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, 
BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. 
What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index e0fad1548737..d781bc86e100 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -15,7 +15,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index f3c33e128709..1c4b1939f5a8 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -117,7 +117,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -137,7 +137,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access, make sure to bounds check any such access", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 58f4aa593b1b..4d6645f2874c 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -20,7 +20,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is 
outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, }, { @@ -146,7 +146,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT }, { @@ -354,7 +354,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT }, { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 7629a0cebb9b..94258c6b5235 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -105,7 +105,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_hash_8b = { 16 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "calls: overlapping caller/callee", diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index b9fb28e8e224..988f46a1a4c7 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -68,7 +68,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 7", @@ -220,7 +220,7 @@ }, .fixup_map_array_small = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 19", diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 
67ab12410050..5a605ae131a9 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -318,7 +318,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 4 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 7572e403ddb9..961f28139b96 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -280,7 +280,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -415,7 +415,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -926,7 +926,7 @@ }, .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, - .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R2 unbounded memory access, make sure to bounds check any such access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a53d99cebd9f..97ee658e1242 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -50,7 +50,7 @@ .fixup_map_array_48b = { 8 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R0 min value is outside of the array range", + .errstr_unpriv = "R0 
min value is outside of the allowed memory range", .retval = 1, }, { @@ -325,7 +325,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result_unpriv = REJECT, .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", }, @@ -601,7 +601,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R1 max value is outside of the array range", + .errstr = "R1 max value is outside of the allowed memory range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -726,7 +726,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "map access: value_ptr -= known scalar, 2", -- cgit v1.2.3 From bf99c936f9478a05d51e9f101f90de70bee9a89c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:21 -0700 Subject: libbpf: Add BPF ring buffer support Declaring and instantiating BPF ring buffer doesn't require any changes to libbpf, as it's just another type of maps. So using existing BTF-defined maps syntax with __uint(type, BPF_MAP_TYPE_RINGBUF) and __uint(max_elements, ) is all that's necessary to create and use BPF ring buffer. This patch adds BPF ring buffer consumer to libbpf. It is very similar to perf_buffer implementation in terms of API, but also attempts to fix some minor problems and inconveniences with existing perf_buffer API. ring_buffer support both single ring buffer use case (with just using ring_buffer__new()), as well as allows to add more ring buffers, each with its own callback and context. This allows to efficiently poll and consume multiple, potentially completely independent, ring buffers, using single epoll instance. 
The latter is actually a problem in practice for applications that are using multiple sets of perf buffers. They have to create multiple instances for struct perf_buffer and poll them independently or in a loop, each approach having its own problems (e.g., inability to use a common poll timeout). struct ring_buffer eliminates this problem by aggregating many independent ring buffer instances under the single "ring buffer manager". Second, perf_buffer's callback can't return error, so applications that need to stop polling due to error in data or data signalling the end, have to use extra mechanisms to signal that polling has to stop. ring_buffer's callback can return error, which will be passed through back to user code and can be acted upon appropriately. Two APIs allow consuming ring buffer data: - ring_buffer__poll(), which will wait for data availability notification and will consume data only from reported ring buffer(s); this API allows efficient use of resources by reading data only when it becomes available; - ring_buffer__consume(), will attempt to read new records regardless of data availability notification sub-system. This API is useful for cases when lowest latency is required, at the expense of burning CPU resources. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-3-andriin@fb.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/libbpf.h | 21 ++++ tools/lib/bpf/libbpf.map | 5 + tools/lib/bpf/libbpf_probes.c | 5 + tools/lib/bpf/ringbuf.c | 285 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 tools/lib/bpf/ringbuf.c (limited to 'tools') diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index e3962cfbc9a6..190366d05588 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o + btf_dump.o ringbuf.o diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1e2e399a5f2c..8528a02d5af8 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -478,6 +478,27 @@ LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, size_t info_size, __u32 flags); +/* Ring buffer APIs */ +struct ring_buffer; + +typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); + +struct ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatiblity */ +}; + +#define ring_buffer_opts__last_field sz + +LIBBPF_API struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts); +LIBBPF_API void ring_buffer__free(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx); +LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); +LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); + +/* Perf buffer APIs */ struct perf_buffer; typedef void 
(*perf_buffer_sample_fn)(void *ctx, int cpu, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 381a7342ecfc..c18860200abb 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -263,4 +263,9 @@ LIBBPF_0.0.9 { bpf_link_get_next_id; bpf_program__attach_iter; perf_buffer__consume; + ring_buffer__add; + ring_buffer__consume; + ring_buffer__free; + ring_buffer__new; + ring_buffer__poll; } LIBBPF_0.0.8; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 2c92059c0c90..10cd8d1891f5 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -238,6 +238,11 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) if (btf_fd < 0) return false; break; + case BPF_MAP_TYPE_RINGBUF: + key_size = 0; + value_size = 0; + max_entries = 4096; + break; case BPF_MAP_TYPE_UNSPEC: case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_ARRAY: diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c new file mode 100644 index 000000000000..bc10fa1d43c7 --- /dev/null +++ b/tools/lib/bpf/ringbuf.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Ring buffer operations. + * + * Copyright (C) 2020 Facebook, Inc. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "libbpf_internal.h" +#include "bpf.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +struct ring { + ring_buffer_sample_fn sample_cb; + void *ctx; + void *data; + unsigned long *consumer_pos; + unsigned long *producer_pos; + unsigned long mask; + int map_fd; +}; + +struct ring_buffer { + struct epoll_event *events; + struct ring *rings; + size_t page_size; + int epoll_fd; + int ring_cnt; +}; + +static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) +{ + if (r->consumer_pos) { + munmap(r->consumer_pos, rb->page_size); + r->consumer_pos = NULL; + } + if (r->producer_pos) { + munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); + r->producer_pos = NULL; + } +} + +/* Add extra RINGBUF maps to this ring buffer manager */ +int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + struct epoll_event *e; + struct ring *r; + void *tmp; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", + map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_RINGBUF) { + pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n", + map_fd); + return -EINVAL; + } + + tmp = reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings)); + if (!tmp) + return -ENOMEM; + rb->rings = tmp; + + tmp = reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events)); + if (!tmp) + return -ENOMEM; + rb->events = tmp; + + r = &rb->rings[rb->ring_cnt]; + memset(r, 0, sizeof(*r)); + + r->map_fd = map_fd; + r->sample_cb = sample_cb; + r->ctx = ctx; + r->mask = info.max_entries - 1; + + /* Map writable consumer page */ + tmp 
= mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. We map twice as big + * data size to allow simple reading of samples that wrap around the + * end of a ring buffer. See kernel implementation for details. + * */ + tmp = mmap(NULL, rb->page_size + 2 * info.max_entries, PROT_READ, + MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + r->producer_pos = tmp; + r->data = tmp + rb->page_size; + + e = &rb->events[rb->ring_cnt]; + memset(e, 0, sizeof(*e)); + + e->events = EPOLLIN; + e->data.fd = rb->ring_cnt; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->ring_cnt++; + return 0; +} + +void ring_buffer__free(struct ring_buffer *rb) +{ + int i; + + if (!rb) + return; + + for (i = 0; i < rb->ring_cnt; ++i) + ringbuf_unmap_ring(rb, &rb->rings[i]); + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb->events); + free(rb->rings); + free(rb); +} + +struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts) +{ + struct ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, ring_buffer_opts)) + return NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = ring_buffer__add(rb, map_fd, sample_cb, ctx); + if (err) + 
goto err_out; + + return rb; + +err_out: + ring_buffer__free(rb); + return NULL; +} + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits (discard and busy, if set) */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += BPF_RINGBUF_HDR_SZ; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static int ringbuf_process_ring(struct ring* r) +{ + int *len_ptr, len, err, cnt = 0; + unsigned long cons_pos, prod_pos; + bool got_new_data; + void *sample; + + cons_pos = smp_load_acquire(r->consumer_pos); + do { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & BPF_RINGBUF_BUSY_BIT) + goto done; + + got_new_data = true; + cons_pos += roundup_len(len); + + if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { + sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + err = r->sample_cb(r->ctx, sample, len); + if (err) { + /* update consumer pos and bail out */ + smp_store_release(r->consumer_pos, + cons_pos); + return err; + } + cnt++; + } + + smp_store_release(r->consumer_pos, cons_pos); + } + } while (got_new_data); +done: + return cnt; +} + +/* Consume available ring buffer(s) data without event polling. + * Returns number of records consumed across all registered ring buffers, or + * negative number if any of the callbacks return error. + */ +int ring_buffer__consume(struct ring_buffer *rb) +{ + int i, err, res = 0; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = &rb->rings[i]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return err; + res += err; + } + return res; +} + +/* Poll for available data and consume records, if any are available. + * Returns number of records consumed, or negative number, if any of the + * registered callbacks returned error. 
+ */ +int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) +{ + int i, cnt, err, res = 0; + + cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); + for (i = 0; i < cnt; i++) { + __u32 ring_id = rb->events[i].data.fd; + struct ring *ring = &rb->rings[ring_id]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return err; + res += cnt; + } + return cnt < 0 ? -errno : res; +} -- cgit v1.2.3 From cb1c9ddd552520abd49031d47397c6e95bad882e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:22 -0700 Subject: selftests/bpf: Add BPF ringbuf selftests Both singleton BPF ringbuf and BPF ringbuf with map-in-map use cases are tested. Also reserve+submit/discards and output variants of API are validated. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-4-andriin@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 211 +++++++++++++++++++++ .../selftests/bpf/prog_tests/ringbuf_multi.c | 102 ++++++++++ tools/testing/selftests/bpf/progs/test_ringbuf.c | 78 ++++++++ .../selftests/bpf/progs/test_ringbuf_multi.c | 77 ++++++++ 4 files changed, 468 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/ringbuf.c create mode 100644 tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf.c create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_multi.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c new file mode 100644 index 000000000000..bb8541f240e2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "test_ringbuf.skel.h" + 
+#define EDONE 7777 + +static int duration = 0; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static int sample_cnt; + +static int process_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + sample_cnt++; + + switch (s->seq) { + case 0: + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + return 0; + case 1: + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + return -EDONE; + default: + /* we don't care about the rest */ + return 0; + } +} + +static struct test_ringbuf *skel; +static struct ring_buffer *ringbuf; + +static void trigger_samples() +{ + skel->bss->dropped = 0; + skel->bss->total = 0; + skel->bss->discarded = 0; + + /* trigger exactly two samples */ + skel->bss->value = 333; + syscall(__NR_getpgid); + skel->bss->value = 777; + syscall(__NR_getpgid); +} + +static void *poll_thread(void *input) +{ + long timeout = (long)input; + + return (void *)(long)ring_buffer__poll(ringbuf, timeout); +} + +void test_ringbuf(void) +{ + const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); + pthread_t thread; + long bg_ret = -1; + int err; + + skel = test_ringbuf__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf), + process_sample, NULL, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = test_ringbuf__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + trigger_samples(); + + /* 2 submitted + 1 discarded records */ + CHECK(skel->bss->avail_data != 3 * rec_sz, + "err_avail_size", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->avail_data); + CHECK(skel->bss->ring_size != 4096, + "err_ring_size", "exp %ld, got %ld\n", + 4096L, 
skel->bss->ring_size); + CHECK(skel->bss->cons_pos != 0, + "err_cons_pos", "exp %ld, got %ld\n", + 0L, skel->bss->cons_pos); + CHECK(skel->bss->prod_pos != 3 * rec_sz, + "err_prod_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->prod_pos); + + /* poll for samples */ + err = ring_buffer__poll(ringbuf, -1); + + /* -EDONE is used as an indicator that we are done */ + if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err)) + goto cleanup; + + /* we expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* now validate consumer position is updated and returned */ + trigger_samples(); + CHECK(skel->bss->cons_pos != 3 * rec_sz, + "err_cons_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->cons_pos); + err = ring_buffer__poll(ringbuf, -1); + CHECK(err <= 0, "poll_err", "err %d\n", err); + + /* start poll in background w/ long timeout */ + err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000); + if (CHECK(err, "bg_poll", "pthread_create failed: %d\n", err)) + goto cleanup; + + /* turn off notifications now */ + skel->bss->flags = BPF_RB_NO_WAKEUP; + + /* give background thread a bit of a time */ + usleep(50000); + trigger_samples(); + /* sleeping arbitrarily is bad, but no better way to know that + * epoll_wait() **DID NOT** unblock in background thread + */ + usleep(50000); + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got 
%ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* clear flags to return to "adaptive" notification mode */ + skel->bss->flags = 0; + + /* produce new samples, no notification should be triggered, because + * consumer is now behind + */ + trigger_samples(); + + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* now force notifications */ + skel->bss->flags = BPF_RB_FORCE_WAKEUP; + sample_cnt = 0; + trigger_samples(); + + /* now we should get a pending notification */ + usleep(50000); + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err, "join_bg", "err %d\n", err)) + goto cleanup; + + if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld", bg_ret)) + goto cleanup; + + /* 3 rounds, 2 samples each */ + CHECK(sample_cnt != 6, "wrong_sample_cnt", + "expected to see %d samples, got %d\n", 6, sample_cnt); + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + test_ringbuf__detach(skel); +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c new file mode 100644 index 000000000000..78e450609803 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include "test_ringbuf_multi.skel.h" + +static int duration = 0; + 
+struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static int process_sample(void *ctx, void *data, size_t len) +{ + int ring = (unsigned long)ctx; + struct sample *s = data; + + switch (s->seq) { + case 0: + CHECK(ring != 1, "sample1_ring", "exp %d, got %d\n", 1, ring); + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + break; + case 1: + CHECK(ring != 2, "sample2_ring", "exp %d, got %d\n", 2, ring); + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + break; + default: + CHECK(true, "extra_sample", "unexpected sample seq %d, val %ld\n", + s->seq, s->value); + return -1; + } + + return 0; +} + +void test_ringbuf_multi(void) +{ + struct test_ringbuf_multi *skel; + struct ring_buffer *ringbuf; + int err; + + skel = test_ringbuf_multi__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf1), + process_sample, (void *)(long)1, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = ring_buffer__add(ringbuf, bpf_map__fd(skel->maps.ringbuf2), + process_sample, (void *)(long)2); + if (CHECK(err, "ringbuf_add", "failed to add another ring\n")) + goto cleanup; + + err = test_ringbuf_multi__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + /* trigger few samples, some will be skipped */ + skel->bss->target_ring = 0; + skel->bss->value = 333; + syscall(__NR_getpgid); + + /* skipped, no ringbuf in slot 1 */ + skel->bss->target_ring = 1; + skel->bss->value = 555; + syscall(__NR_getpgid); + + skel->bss->target_ring = 2; + skel->bss->value = 777; + syscall(__NR_getpgid); + + /* poll for samples, should get 2 ringbufs back */ + err = ring_buffer__poll(ringbuf, -1); + if (CHECK(err != 4, "poll_res", 
"expected 4 records, got %d\n", err)) + goto cleanup; + + /* expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err < 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->skipped != 1, "err_skipped", "exp %ld, got %ld\n", + 1L, skel->bss->skipped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf_multi__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c new file mode 100644 index 000000000000..8ba9959b036b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf SEC(".maps"); + +/* inputs */ +int pid = 0; +long value = 0; +long flags = 0; + +/* outputs */ +long total = 0; +long discarded = 0; +long dropped = 0; + +long avail_data = 0; +long ring_size = 0; +long cons_pos = 0; +long prod_pos = 0; + +/* inner state */ +long seq = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + int zero = 0; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) { + __sync_fetch_and_add(&dropped, 1); + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + sample->value = value; + + sample->seq = seq++; + __sync_fetch_and_add(&total, 1); + + if (sample->seq & 1) { + /* copy from reserved sample to a new one... 
*/ + bpf_ringbuf_output(&ringbuf, sample, sizeof(*sample), flags); + /* ...and then discard reserved sample */ + bpf_ringbuf_discard(sample, flags); + __sync_fetch_and_add(&discarded, 1); + } else { + bpf_ringbuf_submit(sample, flags); + } + + avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); + cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS); + prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c new file mode 100644 index 000000000000..edf3b6953533 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct ringbuf_map { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf1 SEC(".maps"), + ringbuf2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 4); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_arr SEC(".maps") = { + .values = { + [0] = &ringbuf1, + [2] = &ringbuf2, + }, +}; + +/* inputs */ +int pid = 0; +int target_ring = 0; +long value = 0; + +/* outputs */ +long total = 0; +long dropped = 0; +long skipped = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + void *rb; + int zero = 0; + + if (cur_pid != pid) + return 0; + + rb = bpf_map_lookup_elem(&ringbuf_arr, &target_ring); + if (!rb) { + skipped += 1; + return 1; + } + + sample = bpf_ringbuf_reserve(rb, sizeof(*sample), 0); + if (!sample) { + dropped += 1; + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, 
sizeof(sample->comm)); + sample->value = value; + + sample->seq = total; + total += 1; + + bpf_ringbuf_submit(sample, 0); + + return 0; +} -- cgit v1.2.3 From c97099b0f22722be7d0f290278a26d297cc4b7ca Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:23 -0700 Subject: bpf: Add BPF ringbuf and perf buffer benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend bench framework with ability to have benchmark-provided child argument parser for custom benchmark-specific parameters. This makes bench generic code modular and independent from any specific benchmark. Also implement a set of benchmarks for new BPF ring buffer and existing perf buffer. 4 benchmarks were implemented: 2 variations for each of BPF ringbuf and perfbuf:, - rb-libbpf utilizes stock libbpf ring_buffer manager for reading data; - rb-custom implements custom ring buffer setup and reading code, to eliminate overheads inherent in generic libbpf code due to callback functions and the need to update consumer position after each consumed record, instead of batching updates (due to pessimistic assumption that user callback might take long time and thus could unnecessarily hold ring buffer space for too long); - pb-libbpf uses stock libbpf perf_buffer code with all the default settings, though uses higher-performance raw event callback to minimize unnecessary overhead; - pb-custom implements its own custom consumer code to minimize any possible overhead of generic libbpf implementation and indirect function calls. All of the test support default, no data notification skipped, mode, as well as sampled mode (with --rb-sampled flag), which allows to trigger epoll notification less frequently and reduce overhead. As will be shown, this mode is especially critical for perf buffer, which suffers from high overhead of wakeups in kernel. 
Otherwise, all benchmarks implement a similar way to generate a batch of records by using fentry/sys_getpgid BPF program, which pushes a bunch of records in a tight loop and records number of successful and dropped samples. Each record is a small 8-byte integer, to minimize the effect of memory copying with bpf_perf_event_output() and bpf_ringbuf_output(). Benchmarks that have only one producer implement optional back-to-back mode, in which record production and consumption is alternating on the same CPU. This is the highest-throughput happy case, showing ultimate performance achievable with either BPF ringbuf or perfbuf. All the below scenarios are implemented in a script in benchs/run_bench_ringbufs.sh. Tests were performed on 28-core/56-thread Intel Xeon CPU E5-2680 v4 @ 2.40GHz CPU. Single-producer, parallel producer ================================== rb-libbpf 12.054 ± 0.320M/s (drops 0.000 ± 0.000M/s) rb-custom 8.158 ± 0.118M/s (drops 0.001 ± 0.003M/s) pb-libbpf 0.931 ± 0.007M/s (drops 0.000 ± 0.000M/s) pb-custom 0.965 ± 0.003M/s (drops 0.000 ± 0.000M/s) Single-producer, parallel producer, sampled notification ======================================================== rb-libbpf 11.563 ± 0.067M/s (drops 0.000 ± 0.000M/s) rb-custom 15.895 ± 0.076M/s (drops 0.000 ± 0.000M/s) pb-libbpf 9.889 ± 0.032M/s (drops 0.000 ± 0.000M/s) pb-custom 9.866 ± 0.028M/s (drops 0.000 ± 0.000M/s) Single producer on one CPU, consumer on another one, both running at full speed. Curiously, rb-libbpf has higher throughput than objectively faster (due to more lightweight consumer code path) rb-custom. It appears that faster consumer causes kernel to send notifications more frequently, because consumer appears to be caught up more frequently. Performance of perfbuf suffers from default "no sampling" policy and huge overhead that causes. In sampled mode, rb-custom is winning very significantly eliminating too frequent in-kernel wakeups, the gain appears to be more than 2x.
Perf buffer achieves even more impressive wins, compared to stock perfbuf settings, with 10x improvements in throughput with 1:500 sampling rate. The trade-off is that with sampling, application might not get next X events until X+1st arrives, which is not always acceptable. With steady influx of events, though, this shouldn't be a problem. Overall, single-producer performance of ring buffers seems to be better no matter the sampled/non-sampled modes, but it especially beats perf buffer without sampling due to its adaptive notification approach. Single-producer, back-to-back mode ================================== rb-libbpf 15.507 ± 0.247M/s (drops 0.000 ± 0.000M/s) rb-libbpf-sampled 14.692 ± 0.195M/s (drops 0.000 ± 0.000M/s) rb-custom 21.449 ± 0.157M/s (drops 0.000 ± 0.000M/s) rb-custom-sampled 20.024 ± 0.386M/s (drops 0.000 ± 0.000M/s) pb-libbpf 1.601 ± 0.015M/s (drops 0.000 ± 0.000M/s) pb-libbpf-sampled 8.545 ± 0.064M/s (drops 0.000 ± 0.000M/s) pb-custom 1.607 ± 0.022M/s (drops 0.000 ± 0.000M/s) pb-custom-sampled 8.988 ± 0.144M/s (drops 0.000 ± 0.000M/s) Here we test a back-to-back mode, which is arguably best-case scenario both for BPF ringbuf and perfbuf, because there is no contention and for ringbuf also no excessive notification, because consumer appears to be behind after the first record. For ringbuf, custom consumer code clearly wins with 21.5 vs 16 million records per second exchanged between producer and consumer. Sampled mode actually hurts a bit due to slightly slower producer logic (it needs to fetch amount of data available to decide whether to skip or force notification). Perfbuf with wakeup sampling gets 5.5x throughput increase, compared to no-sampling version. There also doesn't seem to be noticeable overhead from generic libbpf handling code.
Perfbuf back-to-back, effect of sample rate =========================================== pb-sampled-1 1.035 ± 0.012M/s (drops 0.000 ± 0.000M/s) pb-sampled-5 3.476 ± 0.087M/s (drops 0.000 ± 0.000M/s) pb-sampled-10 5.094 ± 0.136M/s (drops 0.000 ± 0.000M/s) pb-sampled-25 7.118 ± 0.153M/s (drops 0.000 ± 0.000M/s) pb-sampled-50 8.169 ± 0.156M/s (drops 0.000 ± 0.000M/s) pb-sampled-100 8.887 ± 0.136M/s (drops 0.000 ± 0.000M/s) pb-sampled-250 9.180 ± 0.209M/s (drops 0.000 ± 0.000M/s) pb-sampled-500 9.353 ± 0.281M/s (drops 0.000 ± 0.000M/s) pb-sampled-1000 9.411 ± 0.217M/s (drops 0.000 ± 0.000M/s) pb-sampled-2000 9.464 ± 0.167M/s (drops 0.000 ± 0.000M/s) pb-sampled-3000 9.575 ± 0.273M/s (drops 0.000 ± 0.000M/s) This benchmark shows the effect of event sampling for perfbuf. Back-to-back mode for highest throughput. Just doing every 5th record notification gives 3.5x speed up. 250-500 appears to be the point of diminishing return, with almost 9x speed up. Most benchmarks use 500 as the default sampling for pb-raw and pb-custom. Ringbuf back-to-back, effect of sample rate =========================================== rb-sampled-1 1.106 ± 0.010M/s (drops 0.000 ± 0.000M/s) rb-sampled-5 4.746 ± 0.149M/s (drops 0.000 ± 0.000M/s) rb-sampled-10 7.706 ± 0.164M/s (drops 0.000 ± 0.000M/s) rb-sampled-25 12.893 ± 0.273M/s (drops 0.000 ± 0.000M/s) rb-sampled-50 15.961 ± 0.361M/s (drops 0.000 ± 0.000M/s) rb-sampled-100 18.203 ± 0.445M/s (drops 0.000 ± 0.000M/s) rb-sampled-250 19.962 ± 0.786M/s (drops 0.000 ± 0.000M/s) rb-sampled-500 20.881 ± 0.551M/s (drops 0.000 ± 0.000M/s) rb-sampled-1000 21.317 ± 0.532M/s (drops 0.000 ± 0.000M/s) rb-sampled-2000 21.331 ± 0.535M/s (drops 0.000 ± 0.000M/s) rb-sampled-3000 21.688 ± 0.392M/s (drops 0.000 ± 0.000M/s) Similar benchmark for ring buffer also shows a great advantage (in terms of throughput) of skipping notifications. Skipping every 5th one gives 4x boost. 
Also similar to perfbuf case, 250-500 seems to be the point of diminishing returns, giving roughly 20x better results. Keep in mind, for this test, notifications are controlled manually with BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP. As can be seen from previous benchmarks, adaptive notifications based on consumer's positions provides same (or even slightly better due to simpler load generator on BPF side) benefits in favorable back-to-back scenario. Overzealous and fast consumer, which is almost always caught up, will make throughput numbers smaller. That's the case when manual notification control might prove to be extremely beneficial. Ringbuf back-to-back, reserve+commit vs output ============================================== reserve 22.819 ± 0.503M/s (drops 0.000 ± 0.000M/s) output 18.906 ± 0.433M/s (drops 0.000 ± 0.000M/s) Ringbuf sampled, reserve+commit vs output ========================================= reserve-sampled 15.350 ± 0.132M/s (drops 0.000 ± 0.000M/s) output-sampled 14.195 ± 0.144M/s (drops 0.000 ± 0.000M/s) BPF ringbuf supports two sets of APIs with various usability and performance tradeoffs: bpf_ringbuf_reserve()+bpf_ringbuf_submit() vs bpf_ringbuf_output(). This benchmark clearly shows superiority of reserve+commit approach, despite using a small 8-byte record size. Single-producer, consumer/producer competing on the same CPU, low batch count ============================================================================= rb-libbpf 3.045 ± 0.020M/s (drops 3.536 ± 0.148M/s) rb-custom 3.055 ± 0.022M/s (drops 3.893 ± 0.066M/s) pb-libbpf 1.393 ± 0.024M/s (drops 0.000 ± 0.000M/s) pb-custom 1.407 ± 0.016M/s (drops 0.000 ± 0.000M/s) This benchmark shows one of the worst-case scenarios, in which producer and consumer do not coordinate *and* fight for the same CPU. No batch count and sampling settings were able to eliminate drops for ringbuffer, producer is just too fast for consumer to keep up.
But ringbuf and perfbuf are still able to pass through quite a lot of messages, which is more than enough for a lot of applications. Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 10.916 ± 0.399M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 4.931 ± 0.030M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 4.880 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 3.926 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 4.011 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 3.967 ± 0.016M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 2.604 ± 0.030M/s (drops 0.001 ± 0.002M/s) rb-libbpf nr_prod 20 2.233 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 2.085 ± 0.015M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 2.055 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 1.962 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 2.089 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 2.118 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 2.105 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 2.120 ± 0.058M/s (drops 0.000 ± 0.001M/s) rb-libbpf nr_prod 52 2.074 ± 0.024M/s (drops 0.007 ± 0.014M/s) Ringbuf uses a very short-duration spinlock during reservation phase, to check few invariants, increment producer count and set record header. This is the biggest point of contention for ringbuf implementation. This benchmark evaluates the effect of multiple competing writers on overall throughput of a single shared ringbuffer. Overall throughput drops almost 2x when going from single to two highly-contended producers, gradually dropping with additional competing producers. Performance drop stabilizes at around 20 producers and hovers around 2mln even with 50+ fighting producers, which is a 5x drop compared to non-contended case. Good implementation in kernel helps maintain decent performance here.
Note, that in the intended real-world scenarios, it's not expected to get even close to such a high levels of contention. But if contention will become a problem, there is always an option of sharding few ring buffers across a set of CPUs. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-5-andriin@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 5 +- tools/testing/selftests/bpf/bench.c | 16 + .../testing/selftests/bpf/benchs/bench_ringbufs.c | 566 +++++++++++++++++++++ .../selftests/bpf/benchs/run_bench_ringbufs.sh | 75 +++ tools/testing/selftests/bpf/progs/perfbuf_bench.c | 33 ++ tools/testing/selftests/bpf/progs/ringbuf_bench.c | 60 +++ 6 files changed, 754 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_ringbufs.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh create mode 100644 tools/testing/selftests/bpf/progs/perfbuf_bench.c create mode 100644 tools/testing/selftests/bpf/progs/ringbuf_bench.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e716e931d0c9..3ce548eff8a8 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -413,12 +413,15 @@ $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h +$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ + $(OUTPUT)/perfbuf_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ $(OUTPUT)/bench_count.o \ $(OUTPUT)/bench_rename.o \ - $(OUTPUT)/bench_trigger.o + $(OUTPUT)/bench_trigger.o \ + $(OUTPUT)/bench_ringbufs.o $(call msg,BINARY,,$@) $(CC) $(LDFLAGS) -o $@ $(filter %.a 
%.o,$^) $(LDLIBS) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 14390689ef90..944ad4721c83 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -130,6 +130,13 @@ static const struct argp_option opts[] = { {}, }; +extern struct argp bench_ringbufs_argp; + +static const struct argp_child bench_parsers[] = { + { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, + {}, +}; + static error_t parse_arg(int key, char *arg, struct argp_state *state) { static int pos_args; @@ -208,6 +215,7 @@ static void parse_cmdline_args(int argc, char **argv) .options = opts, .parser = parse_arg, .doc = argp_program_doc, + .children = bench_parsers, }; if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) exit(1); @@ -310,6 +318,10 @@ extern const struct bench bench_trig_rawtp; extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fmodret; +extern const struct bench bench_rb_libbpf; +extern const struct bench bench_rb_custom; +extern const struct bench bench_pb_libbpf; +extern const struct bench bench_pb_custom; static const struct bench *benchs[] = { &bench_count_global, @@ -327,6 +339,10 @@ static const struct bench *benchs[] = { &bench_trig_kprobe, &bench_trig_fentry, &bench_trig_fmodret, + &bench_rb_libbpf, + &bench_rb_custom, + &bench_pb_libbpf, + &bench_pb_custom, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c new file mode 100644 index 000000000000..da87c7f31891 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include "bench.h" +#include "ringbuf_bench.skel.h" +#include "perfbuf_bench.skel.h" + +static struct { + bool back2back; + int 
batch_cnt; + bool sampled; + int sample_rate; + int ringbuf_sz; /* per-ringbuf, in bytes */ + bool ringbuf_use_output; /* use slower output API */ + int perfbuf_sz; /* per-CPU size, in pages */ +} args = { + .back2back = false, + .batch_cnt = 500, + .sampled = false, + .sample_rate = 500, + .ringbuf_sz = 512 * 1024, + .ringbuf_use_output = false, + .perfbuf_sz = 128, +}; + +enum { + ARG_RB_BACK2BACK = 2000, + ARG_RB_USE_OUTPUT = 2001, + ARG_RB_BATCH_CNT = 2002, + ARG_RB_SAMPLED = 2003, + ARG_RB_SAMPLE_RATE = 2004, +}; + +static const struct argp_option opts[] = { + { "rb-b2b", ARG_RB_BACK2BACK, NULL, 0, "Back-to-back mode"}, + { "rb-use-output", ARG_RB_USE_OUTPUT, NULL, 0, "Use bpf_ringbuf_output() instead of bpf_ringbuf_reserve()"}, + { "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"}, + { "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"}, + { "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_RB_BACK2BACK: + args.back2back = true; + break; + case ARG_RB_USE_OUTPUT: + args.ringbuf_use_output = true; + break; + case ARG_RB_BATCH_CNT: + args.batch_cnt = strtol(arg, NULL, 10); + if (args.batch_cnt < 0) { + fprintf(stderr, "Invalid batch count."); + argp_usage(state); + } + break; + case ARG_RB_SAMPLED: + args.sampled = true; + break; + case ARG_RB_SAMPLE_RATE: + args.sample_rate = strtol(arg, NULL, 10); + if (args.sample_rate < 0) { + fprintf(stderr, "Invalid perfbuf sample rate."); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +/* exported into benchmark runner */ +const struct argp bench_ringbufs_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* RINGBUF-LIBBPF benchmark */ + +static struct counter buf_hits; + +static inline void bufs_trigger_batch() +{ + (void)syscall(__NR_getpgid); +} + +static void bufs_validate() +{ + if 
(env.consumer_cnt != 1) { + fprintf(stderr, "rb-libbpf benchmark doesn't support multi-consumer!\n"); + exit(1); + } + + if (args.back2back && env.producer_cnt > 1) { + fprintf(stderr, "back-to-back mode makes sense only for single-producer case!\n"); + exit(1); + } +} + +static void *bufs_sample_producer(void *input) +{ + if (args.back2back) { + /* initial batch to get everything started */ + bufs_trigger_batch(); + return NULL; + } + + while (true) + bufs_trigger_batch(); + return NULL; +} + +static struct ringbuf_libbpf_ctx { + struct ringbuf_bench *skel; + struct ring_buffer *ringbuf; +} ringbuf_libbpf_ctx; + +static void ringbuf_libbpf_measure(struct bench_res *res) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct ringbuf_bench *ringbuf_setup_skeleton() +{ + struct ringbuf_bench *skel; + + setup_libbpf(); + + skel = ringbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + skel->rodata->use_output = args.ringbuf_use_output ? 
1 : 0; + + if (args.sampled) + /* record data + header take 16 bytes */ + skel->rodata->wakeup_data_size = args.sample_rate * 16; + + bpf_map__resize(skel->maps.ringbuf, args.ringbuf_sz); + + if (ringbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static int buf_process_sample(void *ctx, void *data, size_t len) +{ + atomic_inc(&buf_hits.value); + return 0; +} + +static void ringbuf_libbpf_setup() +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + struct bpf_link *link; + + ctx->skel = ringbuf_setup_skeleton(); + ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf), + buf_process_sample, NULL, NULL); + if (!ctx->ringbuf) { + fprintf(stderr, "failed to create ringbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void *ringbuf_libbpf_consumer(void *input) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + while (ring_buffer__poll(ctx->ringbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "ringbuf polling failed!\n"); + return NULL; +} + +/* RINGBUF-CUSTOM benchmark */ +struct ringbuf_custom { + __u64 *consumer_pos; + __u64 *producer_pos; + __u64 mask; + void *data; + int map_fd; +}; + +static struct ringbuf_custom_ctx { + struct ringbuf_bench *skel; + struct ringbuf_custom ringbuf; + int epoll_fd; + struct epoll_event event; +} ringbuf_custom_ctx; + +static void ringbuf_custom_measure(struct bench_res *res) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static void ringbuf_custom_setup() +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + const size_t page_size = getpagesize(); + struct bpf_link *link; + struct ringbuf_custom *r; + void *tmp; + int err; + + ctx->skel = 
ringbuf_setup_skeleton(); + + ctx->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (ctx->epoll_fd < 0) { + fprintf(stderr, "failed to create epoll fd: %d\n", -errno); + exit(1); + } + + r = &ctx->ringbuf; + r->map_fd = bpf_map__fd(ctx->skel->maps.ringbuf); + r->mask = args.ringbuf_sz - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + r->map_fd, 0); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap consumer page: %d\n", -errno); + exit(1); + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. */ + tmp = mmap(NULL, page_size + 2 * args.ringbuf_sz, PROT_READ, MAP_SHARED, + r->map_fd, page_size); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap data pages: %d\n", -errno); + exit(1); + } + r->producer_pos = tmp; + r->data = tmp + page_size; + + ctx->event.events = EPOLLIN; + err = epoll_ctl(ctx->epoll_fd, EPOLL_CTL_ADD, r->map_fd, &ctx->event); + if (err < 0) { + fprintf(stderr, "failed to epoll add ringbuf: %d\n", -errno); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +#define RINGBUF_BUSY_BIT (1 << 31) +#define RINGBUF_DISCARD_BIT (1 << 30) +#define RINGBUF_META_LEN 8 + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += RINGBUF_META_LEN; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static void ringbuf_custom_process_ring(struct ringbuf_custom *r) +{ + unsigned long cons_pos, prod_pos; + int *len_ptr, len; + bool got_new_data; + + cons_pos = smp_load_acquire(r->consumer_pos); + while (true) { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & 
RINGBUF_BUSY_BIT) + return; + + got_new_data = true; + cons_pos += roundup_len(len); + + atomic_inc(&buf_hits.value); + } + if (got_new_data) + smp_store_release(r->consumer_pos, cons_pos); + else + break; + }; +} + +static void *ringbuf_custom_consumer(void *input) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + int cnt; + + do { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(ctx->epoll_fd, &ctx->event, 1, -1); + if (cnt > 0) + ringbuf_custom_process_ring(&ctx->ringbuf); + } while (cnt >= 0); + fprintf(stderr, "ringbuf polling failed!\n"); + return 0; +} + +/* PERFBUF-LIBBPF benchmark */ +static struct perfbuf_libbpf_ctx { + struct perfbuf_bench *skel; + struct perf_buffer *perfbuf; +} perfbuf_libbpf_ctx; + +static void perfbuf_measure(struct bench_res *res) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct perfbuf_bench *perfbuf_setup_skeleton() +{ + struct perfbuf_bench *skel; + + setup_libbpf(); + + skel = perfbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + + if (perfbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static enum bpf_perf_event_ret +perfbuf_process_sample_raw(void *input_ctx, int cpu, + struct perf_event_header *e) +{ + switch (e->type) { + case PERF_RECORD_SAMPLE: + atomic_inc(&buf_hits.value); + break; + case PERF_RECORD_LOST: + break; + default: + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static void perfbuf_libbpf_setup() +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + struct perf_event_attr attr; + struct perf_buffer_raw_opts pb_opts = { + .event_cb = perfbuf_process_sample_raw, + .ctx = (void *)(long)0, + .attr = &attr, + }; + struct bpf_link *link; + + ctx->skel = 
perfbuf_setup_skeleton(); + + memset(&attr, 0, sizeof(attr)); + attr.config = PERF_COUNT_SW_BPF_OUTPUT, + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + /* notify only every Nth sample */ + if (args.sampled) { + attr.sample_period = args.sample_rate; + attr.wakeup_events = args.sample_rate; + } else { + attr.sample_period = 1; + attr.wakeup_events = 1; + } + + if (args.sample_rate > args.batch_cnt) { + fprintf(stderr, "sample rate %d is too high for given batch count %d\n", + args.sample_rate, args.batch_cnt); + exit(1); + } + + ctx->perfbuf = perf_buffer__new_raw(bpf_map__fd(ctx->skel->maps.perfbuf), + args.perfbuf_sz, &pb_opts); + if (!ctx->perfbuf) { + fprintf(stderr, "failed to create perfbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_perfbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +static void *perfbuf_libbpf_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + while (perf_buffer__poll(ctx->perfbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "perfbuf polling failed!\n"); + return NULL; +} + +/* PERFBUF-CUSTOM benchmark */ + +/* copies of internal libbpf definitions */ +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void *perfbuf_custom_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = 
&perfbuf_libbpf_ctx; + struct perf_buffer *pb = ctx->perfbuf; + struct perf_cpu_buf *cpu_buf; + struct perf_event_mmap_page *header; + size_t mmap_mask = pb->mmap_size - 1; + struct perf_event_header *ehdr; + __u64 data_head, data_tail; + size_t ehdr_size; + void *base; + int i, cnt; + + while (true) { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, -1); + if (cnt <= 0) { + fprintf(stderr, "perf epoll failed: %d\n", -errno); + exit(1); + } + + for (i = 0; i < cnt; ++i) { + cpu_buf = pb->events[i].data.ptr; + header = cpu_buf->base; + base = ((void *)header) + pb->page_size; + + data_head = ring_buffer_read_head(header); + data_tail = header->data_tail; + while (data_head != data_tail) { + ehdr = base + (data_tail & mmap_mask); + ehdr_size = ehdr->size; + + if (ehdr->type == PERF_RECORD_SAMPLE) + atomic_inc(&buf_hits.value); + + data_tail += ehdr_size; + } + ring_buffer_write_tail(header, data_tail); + } + } + return NULL; +} + +const struct bench bench_rb_libbpf = { + .name = "rb-libbpf", + .validate = bufs_validate, + .setup = ringbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_libbpf_consumer, + .measure = ringbuf_libbpf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rb_custom = { + .name = "rb-custom", + .validate = bufs_validate, + .setup = ringbuf_custom_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_custom_consumer, + .measure = ringbuf_custom_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_libbpf = { + .name = "pb-libbpf", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_libbpf_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + 
.report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_custom = { + .name = "pb-custom", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_custom_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh new file mode 100755 index 000000000000..af4aa04caba6 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +set -eufo pipefail + +RUN_BENCH="sudo ./bench -w3 -d10 -a" + +function hits() +{ + echo "$*" | sed -E "s/.*hits\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" +} + +function drops() +{ + echo "$*" | sed -E "s/.*drops\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" +} + +function header() +{ + local len=${#1} + + printf "\n%s\n" "$1" + for i in $(seq 1 $len); do printf '='; done + printf '\n' +} + +function summarize() +{ + bench="$1" + summary=$(echo $2 | tail -n1) + printf "%-20s %s (drops %s)\n" "$bench" "$(hits $summary)" "$(drops $summary)" +} + +header "Single-producer, parallel producer" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH $b)" +done + +header "Single-producer, parallel producer, sampled notification" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-sampled $b)" +done + +header "Single-producer, back-to-back mode" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-b2b $b)" + summarize $b-sampled "$($RUN_BENCH --rb-sampled --rb-b2b $b)" +done + +header "Ringbuf back-to-back, effect of sample rate" +for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do + summarize "rb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b rb-custom)" +done 
+header "Perfbuf back-to-back, effect of sample rate" +for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do + summarize "pb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b pb-custom)" +done + +header "Ringbuf back-to-back, reserve+commit vs output" +summarize "reserve" "$($RUN_BENCH --rb-b2b rb-custom)" +summarize "output" "$($RUN_BENCH --rb-b2b --rb-use-output rb-custom)" + +header "Ringbuf sampled, reserve+commit vs output" +summarize "reserve-sampled" "$($RUN_BENCH --rb-sampled rb-custom)" +summarize "output-sampled" "$($RUN_BENCH --rb-sampled --rb-use-output rb-custom)" + +header "Single-producer, consumer/producer competing on the same CPU, low batch count" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-batch-cnt 1 --rb-sample-rate 1 --prod-affinity 0 --cons-affinity 0 $b)" +done + +header "Ringbuf, multi-producer contention" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "rb-libbpf nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" +done + diff --git a/tools/testing/selftests/bpf/progs/perfbuf_bench.c b/tools/testing/selftests/bpf/progs/perfbuf_bench.c new file mode 100644 index 000000000000..e5ab4836a641 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/perfbuf_bench.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(value_size, sizeof(int)); + __uint(key_size, sizeof(int)); +} perfbuf SEC(".maps"); + +const volatile int batch_cnt = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +SEC("fentry/__x64_sys_getpgid") +int bench_perfbuf(void *ctx) +{ + __u64 *sample; + int i; + + for (i = 0; i < batch_cnt; i++) { + if (bpf_perf_event_output(ctx, &perfbuf, BPF_F_CURRENT_CPU, + &sample_val, sizeof(sample_val))) + __sync_add_and_fetch(&dropped, 
1); + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c b/tools/testing/selftests/bpf/progs/ringbuf_bench.c new file mode 100644 index 000000000000..123607d314d6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +const volatile int batch_cnt = 0; +const volatile long use_output = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +const volatile long wakeup_data_size = 0; + +static __always_inline long get_flags() +{ + long sz; + + if (!wakeup_data_size) + return 0; + + sz = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP; +} + +SEC("fentry/__x64_sys_getpgid") +int bench_ringbuf(void *ctx) +{ + long *sample, flags; + int i; + + if (!use_output) { + for (i = 0; i < batch_cnt; i++) { + sample = bpf_ringbuf_reserve(&ringbuf, + sizeof(sample_val), 0); + if (!sample) { + __sync_add_and_fetch(&dropped, 1); + } else { + *sample = sample_val; + flags = get_flags(); + bpf_ringbuf_submit(sample, flags); + } + } + } else { + for (i = 0; i < batch_cnt; i++) { + flags = get_flags(); + if (bpf_ringbuf_output(&ringbuf, &sample_val, + sizeof(sample_val), flags)) + __sync_add_and_fetch(&dropped, 1); + } + } + return 0; +} -- cgit v1.2.3 From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. 
Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the program to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH with > 4-byte values. Block attach of BPF_XDP_DEVMAP programs to devices. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 18 +++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 109 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5884f7f801c..e042311f991f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1250,6 +1250,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +bool dev_map_can_have_prog(struct bpf_map *map); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); @@ -1363,6 +1364,10 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map { return
NULL; } +static inline bool dev_map_can_have_prog(struct bpf_map *map) +{ + return false; +} static inline void __dev_flush(void) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 630432c5c292..f1e364d69007 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 
2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) 
{ @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if 
ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; diff --git a/net/core/dev.c b/net/core/dev.c index ae37586f6ee8..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5420,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -8835,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974ca6e948e3..65d7717bce2f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 
64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/devmap.c | 3 +++ net/core/filter.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 5 files changed, 29 insertions(+) (limited to 'tools') diff --git a/include/net/xdp.h b/include/net/xdp.h index 90f11760bd12..d54022959491 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -61,12 +61,17 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_txq_info { + struct net_device *dev; +}; + struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; + struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1e364d69007..f862a58fb567 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3707,6 +3707,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */
}; enum sk_action { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: diff --git a/net/core/filter.c b/net/core/filter.c index 0008b029d644..85ff827aab73 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7015,6 +7015,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -7985,6 +7992,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 65d7717bce2f..f74bc4a2385e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3706,6 +3706,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { -- cgit v1.2.3 From 
2778797037a658be71a6c55b54700bf58ba21eb7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:15 -0600 Subject: libbpf: Add SEC name for xdp programs attached to device map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support SEC("xdp_devmap*") as a shortcut for loading the program with type BPF_PROG_TYPE_XDP and expected attach type BPF_XDP_DEVMAP. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-5-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 74d967619dcf..85d4f1c5fc52 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6657,6 +6657,8 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_ITER, .is_attach_btf = true, .attach_fn = attach_iter), + BPF_EAPROG_SEC("xdp_devmap", BPF_PROG_TYPE_XDP, + BPF_XDP_DEVMAP), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), -- cgit v1.2.3 From d39aec79e5923bf984df991ffe51d4a2b7a9e746 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:16 -0600 Subject: selftest: Add tests for XDP programs in devmap entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tests to verify ability to add an XDP program to an entry in a DEVMAP. Add negative tests to show DEVMAP programs can not be attached to devices as a normal XDP program, and accesses to egress_ifindex require BPF_XDP_DEVMAP attach type.
Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-6-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_devmap_attach.c | 97 ++++++++++++++++++++++ .../selftests/bpf/progs/test_xdp_devmap_helpers.c | 22 +++++ .../bpf/progs/test_xdp_with_devmap_helpers.c | 44 ++++++++++ 3 files changed, 163 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c new file mode 100644 index 000000000000..d19dbd668f6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "test_xdp_devmap_helpers.skel.h" +#include "test_xdp_with_devmap_helpers.skel.h" + +#define IFINDEX_LO 1 + +struct bpf_devmap_val { + u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; +}; + +void test_xdp_with_devmap_helpers(void) +{ + struct test_xdp_with_devmap_helpers *skel; + struct bpf_prog_info info = {}; + struct bpf_devmap_val val = { + .ifindex = IFINDEX_LO, + }; + __u32 len = sizeof(info); + __u32 duration = 0, idx = 0; + int err, dm_fd, map_fd; + + + skel = test_xdp_with_devmap_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_xdp_with_devmap_helpers__open_and_load"); + return; + } + + /* can not attach program with DEVMAPs that allow programs + * as xdp generic + */ + dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog); + err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Generic 
attach of program with 8-byte devmap", + "should have failed\n"); + + dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm); + map_fd = bpf_map__fd(skel->maps.dm_ports); + err = bpf_obj_get_info_by_fd(dm_fd, &info, &len); + if (CHECK_FAIL(err)) + goto out_close; + + val.bpf_prog.fd = dm_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err, "Add program to devmap entry", + "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(map_fd, &idx, &val); + CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno); + CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry", + "expected %u read %u\n", info.id, val.bpf_prog.id); + + /* can not attach BPF_XDP_DEVMAP program to a device */ + err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program", + "should have failed\n"); + + val.ifindex = 1; + val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry", + "should have failed\n"); + +out_close: + test_xdp_with_devmap_helpers__destroy(skel); +} + +void test_neg_xdp_devmap_helpers(void) +{ + struct test_xdp_devmap_helpers *skel; + __u32 duration = 0; + + skel = test_xdp_devmap_helpers__open_and_load(); + if (CHECK(skel, + "Load of XDP program accessing egress ifindex without attach type", + "should have failed\n")) { + test_xdp_devmap_helpers__destroy(skel); + } +} + + +void test_xdp_devmap_attach(void) +{ + if (test__start_subtest("DEVMAP with programs in entries")) + test_xdp_with_devmap_helpers(); + + if (test__start_subtest("Verifier check of DEVMAP programs")) + test_neg_xdp_devmap_helpers(); +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c new file mode 100644 index 000000000000..e5c0f131c8a7 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* fails to load without expected_attach_type = BPF_XDP_DEVMAP + * because of access to egress_ifindex + */ +#include "vmlinux.h" +#include + +SEC("xdp_dm_log") +int xdpdm_devlog(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c new file mode 100644 index 000000000000..deef0e050863 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 4); +} dm_ports SEC(".maps"); + +SEC("xdp_redir") +int xdp_redir_prog(struct xdp_md *ctx) +{ + return bpf_redirect_map(&dm_ports, 1, 0); +} + +/* invalid program on DEVMAP entry; + * SEC name means expected attach type not set + */ +SEC("xdp_dummy") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +/* valid program on DEVMAP entry via SEC name; + * has access to egress and ingress ifindex + */ +SEC("xdp_devmap") +int xdp_dummy_dm(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} +char _license[] 
SEC("license") = "GPL"; -- cgit v1.2.3 From df8fe57c071c58f355d0a4985ecd2fcaf99b050f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Jun 2020 14:13:52 -0700 Subject: tools/bpf: sync bpf.h Sync bpf.h into tools/include/uapi/ Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f74bc4a2385e..f862a58fb567 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3613,6 +3613,7 @@ struct bpf_sock { __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; + __s32 rx_queue_mapping; }; struct bpf_tcp_sock { -- cgit v1.2.3 From 463bac5f1ca79fcd964bf50426eab024fb4dd8a4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:07:19 -0700 Subject: bpf, selftests: Add test for ktls with skb bpf ingress policy This adds a test for bpf ingress policy. To ensure data writes happen as expected with extra TLS headers we run these tests with data verification enabled by default. This will test that received packets have "PASS" stamped into the front of the payload. 
Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079363965.5745.3390806911628980210.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/test_sockmap_kern.h | 46 +++++- tools/testing/selftests/bpf/test_sockmap.c | 163 ++++++++++++++++++--- 2 files changed, 187 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index a443d3637db3..057036ca1111 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -79,11 +79,18 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, 2); __type(key, int); __type(value, int); } sock_skb_opts SEC(".maps"); +struct { + __uint(type, TEST_MAP_TYPE); + __uint(max_entries, 20); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} tls_sock_map SEC(".maps"); + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -118,6 +125,43 @@ int bpf_prog2(struct __sk_buff *skb) } +SEC("sk_skb3") +int bpf_prog3(struct __sk_buff *skb) +{ + const int one = 1; + int err, *f, ret = SK_PASS; + void *data_end; + char *c; + + err = bpf_skb_pull_data(skb, 19); + if (err) + goto tls_out; + + c = (char *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + + if (c + 18 < data_end) + memcpy(&c[13], "PASS", 4); + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) { + __u64 flags = 0; + + ret = 0; + flags = *f; +#ifdef SOCKMAP + return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); +#else + return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); +#endif + } + + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) + ret = SK_DROP; +tls_out: + return ret; +} + SEC("sockops") int bpf_sockmap(struct bpf_sock_ops *skops) { diff --git a/tools/testing/selftests/bpf/test_sockmap.c 
b/tools/testing/selftests/bpf/test_sockmap.c index c80643828b82..37695fc8096a 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -63,8 +63,8 @@ int s1, s2, c1, c2, p1, p2; int test_cnt; int passed; int failed; -int map_fd[8]; -struct bpf_map *maps[8]; +int map_fd[9]; +struct bpf_map *maps[9]; int prog_fd[11]; int txmsg_pass; @@ -79,7 +79,10 @@ int txmsg_end_push; int txmsg_start_pop; int txmsg_pop; int txmsg_ingress; -int txmsg_skb; +int txmsg_redir_skb; +int txmsg_ktls_skb; +int txmsg_ktls_skb_drop; +int txmsg_ktls_skb_redir; int ktls; int peek_flag; @@ -104,7 +107,7 @@ static const struct option long_options[] = { {"txmsg_start_pop", required_argument, NULL, 'w'}, {"txmsg_pop", required_argument, NULL, 'x'}, {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, - {"txmsg_skb", no_argument, &txmsg_skb, 1 }, + {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, {"whitelist", required_argument, NULL, 'n' }, @@ -169,7 +172,8 @@ static void test_reset(void) txmsg_start_push = txmsg_end_push = 0; txmsg_pass = txmsg_drop = txmsg_redir = 0; txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_skb = 0; + txmsg_ingress = txmsg_redir_skb = 0; + txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; } static int test_start_subtest(const struct _test *t, struct sockmap_options *o) @@ -502,14 +506,41 @@ unwind_iov: static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz) { - int i, j, bytes_cnt = 0; + int i, j = 0, bytes_cnt = 0; unsigned char k = 0; for (i = 0; i < msg->msg_iovlen; i++) { unsigned char *d = msg->msg_iov[i].iov_base; - for (j = 0; - j < msg->msg_iov[i].iov_len && size; j++) { + /* Special case test for skb ingress + ktls */ + if (i == 0 && txmsg_ktls_skb) { + if (msg->msg_iov[i].iov_len < 4) + return -EIO; + if (txmsg_ktls_skb_redir) { + if (memcmp(&d[13], "PASS", 4) != 0) { + fprintf(stderr, + "detected 
redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]); + return -EIO; + } + d[13] = 0; + d[14] = 1; + d[15] = 2; + d[16] = 3; + j = 13; + } else if (txmsg_ktls_skb) { + if (memcmp(d, "PASS", 4) != 0) { + fprintf(stderr, + "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]); + return -EIO; + } + d[0] = 0; + d[1] = 1; + d[2] = 2; + d[3] = 3; + } + } + + for (; j < msg->msg_iov[i].iov_len && size; j++) { if (d[j] != k++) { fprintf(stderr, "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", @@ -724,7 +755,7 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { iov_buf -= (txmsg_pop - txmsg_start_pop + 1); - if (opt->drop_expected) + if (opt->drop_expected || txmsg_ktls_skb_drop) _exit(0); if (!iov_buf) /* zero bytes sent case */ @@ -911,8 +942,28 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) return err; } + /* Attach programs to TLS sockmap */ + if (txmsg_ktls_skb) { + err = bpf_prog_attach(prog_fd[0], map_fd[8], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[8], err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[2], map_fd[8], + BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + } + /* Attach to cgroups */ - err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0); + err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", err, strerror(errno)); @@ -928,15 +979,15 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) - tx_prog_fd = prog_fd[3]; - else if (txmsg_redir) tx_prog_fd = prog_fd[4]; - 
else if (txmsg_apply) + else if (txmsg_redir) tx_prog_fd = prog_fd[5]; - else if (txmsg_cork) + else if (txmsg_apply) tx_prog_fd = prog_fd[6]; - else if (txmsg_drop) + else if (txmsg_cork) tx_prog_fd = prog_fd[7]; + else if (txmsg_drop) + tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -1108,7 +1159,35 @@ run: } } - if (txmsg_skb) { + if (txmsg_ktls_skb) { + int ingress = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[8], &i, &p2, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", + err, strerror(errno)); + } + + if (txmsg_ktls_skb_redir) { + i = 1; + err = bpf_map_update_elem(map_fd[7], + &i, &ingress, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + } + + if (txmsg_ktls_skb_drop) { + i = 1; + err = bpf_map_update_elem(map_fd[7], &i, &i, BPF_ANY); + } + } + + if (txmsg_redir_skb) { int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1; int ingress = BPF_F_INGRESS; @@ -1123,8 +1202,7 @@ run: } i = 3; - err = bpf_map_update_elem(map_fd[0], - &i, &skb_fd, BPF_ANY); + err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY); if (err) { fprintf(stderr, "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", @@ -1158,9 +1236,12 @@ run: fprintf(stderr, "unknown test\n"); out: /* Detatch and zero all the maps */ - bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); + bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS); bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER); bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT); + bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER); + bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT); + if (tx_prog_fd >= 0) bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT); @@ -1229,8 +1310,10 @@ static void test_options(char *options) } if (txmsg_ingress) strncat(options, "ingress,", OPTSTRING); - if (txmsg_skb) - 
strncat(options, "skb,", OPTSTRING); + if (txmsg_redir_skb) + strncat(options, "redir_skb,", OPTSTRING); + if (txmsg_ktls_skb) + strncat(options, "ktls_skb,", OPTSTRING); if (ktls) strncat(options, "ktls,", OPTSTRING); if (peek_flag) @@ -1362,6 +1445,40 @@ static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) test_send(opt, cgrp); } +static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) +{ + bool data = opt->data_test; + int k = ktls; + + opt->data_test = true; + ktls = 1; + + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 0; + txmsg_ktls_skb = 1; + txmsg_pass = 1; + + /* Using data verification so ensure iov layout is + * expected from test receiver side. e.g. has enough + * bytes to write test code. + */ + opt->iov_length = 100; + opt->iov_count = 1; + opt->rate = 1; + test_exec(cgrp, opt); + + txmsg_ktls_skb_drop = 1; + test_exec(cgrp, opt); + + txmsg_ktls_skb_drop = 0; + txmsg_ktls_skb_redir = 1; + test_exec(cgrp, opt); + + opt->data_test = data; + ktls = k; +} + + /* Test cork with hung data. This tests poor usage patterns where * cork can leave data on the ring if user program is buggy and * doesn't flush them somehow. 
They do take some time however @@ -1542,11 +1659,13 @@ char *map_names[] = { "sock_bytes", "sock_redir_flags", "sock_skb_opts", + "tls_sock_map", }; int prog_attach_type[] = { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_SOCK_OPS, BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, @@ -1558,6 +1677,7 @@ int prog_attach_type[] = { }; int prog_type[] = { + BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SOCK_OPS, @@ -1620,6 +1740,7 @@ struct _test test[] = { {"txmsg test redirect", test_txmsg_redir}, {"txmsg test drop", test_txmsg_drop}, {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test skb", test_txmsg_skb}, {"txmsg test apply", test_txmsg_apply}, {"txmsg test cork", test_txmsg_cork}, {"txmsg test hanging corks", test_txmsg_cork_hangs}, -- cgit v1.2.3 From 9c441fe4c06a553ad770b6f21616327a3badf793 Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:02 +0200 Subject: selftests/bpf: Add test for SO_BINDTODEVICE opt of bpf_setsockopt This test is intended to verify that the SO_BINDTODEVICE option works in bpf_setsockopt. Because we are already at the SOL_SOCKET level in this connect bpf prog, it is safe to verify the sanity at the beginning of connect_v4_prog by calling the bind_to_device test helper. The testing environment is already created by the test_sock_addr.sh script, so this test assumes that two netdevices already exist in the system: a veth pair with the names test_sock_addr1 and test_sock_addr2. The test will try to bind the socket to those devices first. Then the test assumes there is no netdevice with the name "nonexistent_dev", so bpf_setsockopt will give us an ENODEV error. At the end the test removes the device binding from the socket by binding it to an empty name. 
Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/3f055b8e45c65639c5c73d0b4b6c589e60b86f15.1590871065.git.fejes@inf.elte.hu --- tools/testing/selftests/bpf/progs/connect4_prog.c | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index c2c85c31cffd..1ab2c5eba86c 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include @@ -21,6 +23,10 @@ #define TCP_CA_NAME_MAX 16 #endif +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -75,6 +81,29 @@ static __inline int set_cc(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -88,6 +117,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) tuple.ipv4.daddr = bpf_htonl(DST_REWRITE_IP4); tuple.ipv4.dport = bpf_htons(DST_REWRITE_PORT4); + /* Bind to device and unbind it. 
*/ + if (bind_to_device(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3 From febeb6dff7beafcaf89521f6c8ff7b0adac08d54 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Jun 2020 13:26:01 -0700 Subject: libbpf: Add _GNU_SOURCE for reallocarray to ringbuf.c On systems with recent enough glibc, reallocarray compat won't kick in, so reallocarray() itself has to come from stdlib.h include. But _GNU_SOURCE is necessary to enable it. So add it. Fixes: bf99c936f947 ("libbpf: Add BPF ring buffer support") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200601202601.2139477-1-andriin@fb.com --- tools/lib/bpf/ringbuf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index bc10fa1d43c7..4fc6c6cbb4eb 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -4,6 +4,9 @@ * * Copyright (C) 2020 Facebook, Inc. */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include #include #include -- cgit v1.2.3 From 7f045a49fee04b5662cbdeaf0838f9322ae8c63a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:38 +0200 Subject: bpf: Add link-based BPF program attachment to network namespace Extend bpf() syscall subcommands that operate on bpf_link, that is LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to network namespaces (only flow dissector at the moment). Link-based and prog-based attachment can be used interchangeably, but only one can exist at a time. Attempts to attach a link when a prog is already attached directly, and the other way around, will be met with -EEXIST. Attempts to detach a program when link exists result in -EINVAL. Attachment of multiple links of same attach type to one netns is not supported with the intention to lift the restriction when a use-case presents itself. 
Because of that, link create returns -E2BIG when trying to create another netns link when one already exists. Link-based attachments to netns don't keep a netns alive by holding a ref to it. Instead links get auto-detached from netns when the latter is being destroyed, using a pernet pre_exit callback. When auto-detached, the link lives in a defunct state as long as there are open FDs for it. -ENOLINK is returned if a user tries to update a defunct link. Because bpf_link to netns doesn't hold a ref to struct net, special care is taken when releasing, updating, or filling link info. The netns might be getting torn down when any of these link operations are in progress. That is why auto-detach and update/release/fill_info are synchronized by the same mutex. Also, link ops have to always check if auto-detach has not happened yet and if netns is still alive (refcnt > 0). Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com --- include/linux/bpf-netns.h | 8 ++ include/linux/bpf_types.h | 3 + include/net/netns/bpf.h | 1 + include/uapi/linux/bpf.h | 5 + kernel/bpf/net_namespace.c | 244 ++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 3 + tools/include/uapi/linux/bpf.h | 5 + 7 files changed, 267 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index f3aec3d79824..4052d649f36d 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -34,6 +34,8 @@ int netns_bpf_prog_query(const union bpf_attr *attr, int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog); #else static inline int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -51,6 +53,12 @@ static inline int netns_bpf_prog_detach(const union bpf_attr 
*attr) { return -EOPNOTSUPP; } + +static inline int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif #endif /* _BPF_NETNS_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa8e1b552acd..a18ae82a298a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,3 +126,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) +#ifdef CONFIG_NET +BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) +#endif diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a858d1c5b166..a8dce2a380c8 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -12,6 +12,7 @@ struct bpf_prog; struct netns_bpf { struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; + struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; }; #endif /* __NETNS_BPF_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b37d81450c3a..78cf061f8179 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -8,9 +8,140 @@ * Functions to manage BPF programs attached to netns */ +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. 
Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; +}; + /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +/* Must be called with netns_bpf_mutex held. */ +static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + net_link->net = NULL; +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + + /* Link auto-detached by dying netns. */ + if (!net_link->net) + return; + + mutex_lock(&netns_bpf_mutex); + + /* Recheck after potential sleep. We can race with cleanup_net + * here, but if we see a non-NULL struct net pointer pre_exit + * has not happened yet and will block on netns_bpf_mutex. + */ + net = net_link->net; + if (!net) + goto out_unlock; + + net->bpf.links[type] = NULL; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + int ret = 0; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + 
rcu_assign_pointer(net->bpf.progs[type], new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -67,6 +198,13 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (net->bpf.links[type]) { + ret = -EEXIST; + goto out_unlock; + } + switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach(net, prog); @@ -75,6 +213,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) ret = -EINVAL; break; } +out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; @@ -86,6 +225,10 @@ static int __netns_bpf_prog_detach(struct net *net, { struct bpf_prog *attached; + /* Progs attached via links cannot be detached */ + if 
(net->bpf.links[type]) + return -EINVAL; + attached = rcu_dereference_protected(net->bpf.progs[type], lockdep_is_held(&netns_bpf_mutex)); if (!attached) @@ -111,13 +254,110 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) return ret; } +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *prog; + int err; + + mutex_lock(&netns_bpf_mutex); + + /* Allow attaching only one prog or link for now */ + if (net->bpf.links[type]) { + err = -E2BIG; + goto out_unlock; + } + /* Links are not compatible with attaching prog directly */ + prog = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (prog) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach(net, link->prog); + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + net->bpf.links[type] = link; + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = 
netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; + struct bpf_link *link; mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + link = net->bpf.links[type]; + if (link) + bpf_netns_link_auto_detach(link); + else + __netns_bpf_prog_detach(net, type); + } mutex_unlock(&netns_bpf_mutex); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab9c76f7b..e14a842d7e0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3887,6 +3887,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: ret = tracing_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = netns_bpf_link_create(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From d60d81acc2c180e33244857e35ef60072573b000 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:40 +0200 Subject: libbpf: Add support for bpf_link-based netns attachment Add bpf_program__attach_netns(), which uses LINK_CREATE subcommand to create an FD-based kernel bpf_link, for attach 
types tied to network namespace, that is BPF_FLOW_DISSECTOR for the moment. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-7-jakub@cloudflare.com --- tools/lib/bpf/libbpf.c | 23 ++++++++++++++++++----- tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 85d4f1c5fc52..7f01be2b88b8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7896,8 +7896,9 @@ static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, return bpf_program__attach_iter(prog, NULL); } -struct bpf_link * -bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) +static struct bpf_link * +bpf_program__attach_fd(struct bpf_program *prog, int target_fd, + const char *target_name) { enum bpf_attach_type attach_type; char errmsg[STRERR_BUFSIZE]; @@ -7917,12 +7918,12 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) link->detach = &bpf_link__detach_fd; attach_type = bpf_program__get_expected_attach_type(prog); - link_fd = bpf_link_create(prog_fd, cgroup_fd, attach_type, NULL); + link_fd = bpf_link_create(prog_fd, target_fd, attach_type, NULL); if (link_fd < 0) { link_fd = -errno; free(link); - pr_warn("program '%s': failed to attach to cgroup: %s\n", - bpf_program__title(prog, false), + pr_warn("program '%s': failed to attach to %s: %s\n", + bpf_program__title(prog, false), target_name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); return ERR_PTR(link_fd); } @@ -7930,6 +7931,18 @@ bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) return link; } +struct bpf_link * +bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) +{ + return bpf_program__attach_fd(prog, cgroup_fd, "cgroup"); +} + +struct bpf_link * +bpf_program__attach_netns(struct bpf_program *prog, int netns_fd) +{ + return 
bpf_program__attach_fd(prog, netns_fd, "netns"); +} + struct bpf_link * bpf_program__attach_iter(struct bpf_program *prog, const struct bpf_iter_attach_opts *opts) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 8528a02d5af8..334437af3014 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -253,6 +253,8 @@ LIBBPF_API struct bpf_link * bpf_program__attach_lsm(struct bpf_program *prog); LIBBPF_API struct bpf_link * bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_netns(struct bpf_program *prog, int netns_fd); struct bpf_map; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index c18860200abb..f732c77b7ed0 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -262,6 +262,7 @@ LIBBPF_0.0.9 { bpf_link_get_fd_by_id; bpf_link_get_next_id; bpf_program__attach_iter; + bpf_program__attach_netns; perf_buffer__consume; ring_buffer__add; ring_buffer__consume; -- cgit v1.2.3 From be6e19818ba626eb1b354490aee40a2cfc1a219f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:41 +0200 Subject: bpftool: Extract helpers for showing link attach type Code for printing link attach_type is duplicated in a couple of places, and likely will be duplicated for future link types as well. Create helpers to prevent duplication. 
Suggested-by: Andrii Nakryiko Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-8-jakub@cloudflare.com --- tools/bpf/bpftool/link.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 670a561dc31b..1ff416eff3d7 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -62,6 +62,15 @@ show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr) jsonw_uint_field(json_wtr, "prog_id", info->prog_id); } +static void show_link_attach_type_json(__u32 attach_type, json_writer_t *wtr) +{ + if (attach_type < ARRAY_SIZE(attach_type_name)) + jsonw_string_field(wtr, "attach_type", + attach_type_name[attach_type]); + else + jsonw_uint_field(wtr, "attach_type", attach_type); +} + static int get_prog_info(int prog_id, struct bpf_prog_info *info) { __u32 len = sizeof(*info); @@ -105,22 +114,13 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) jsonw_uint_field(json_wtr, "prog_type", prog_info.type); - if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) - jsonw_string_field(json_wtr, "attach_type", - attach_type_name[info->tracing.attach_type]); - else - jsonw_uint_field(json_wtr, "attach_type", - info->tracing.attach_type); + show_link_attach_type_json(info->tracing.attach_type, + json_wtr); break; case BPF_LINK_TYPE_CGROUP: jsonw_lluint_field(json_wtr, "cgroup_id", info->cgroup.cgroup_id); - if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) - jsonw_string_field(json_wtr, "attach_type", - attach_type_name[info->cgroup.attach_type]); - else - jsonw_uint_field(json_wtr, "attach_type", - info->cgroup.attach_type); + show_link_attach_type_json(info->cgroup.attach_type, json_wtr); break; default: break; @@ -153,6 +153,14 @@ static void show_link_header_plain(struct bpf_link_info *info) printf("prog %u ", 
info->prog_id); } +static void show_link_attach_type_plain(__u32 attach_type) +{ + if (attach_type < ARRAY_SIZE(attach_type_name)) + printf("attach_type %s ", attach_type_name[attach_type]); + else + printf("attach_type %u ", attach_type); +} + static int show_link_close_plain(int fd, struct bpf_link_info *info) { struct bpf_prog_info prog_info; @@ -176,19 +184,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) else printf("\n\tprog_type %u ", prog_info.type); - if (info->tracing.attach_type < ARRAY_SIZE(attach_type_name)) - printf("attach_type %s ", - attach_type_name[info->tracing.attach_type]); - else - printf("attach_type %u ", info->tracing.attach_type); + show_link_attach_type_plain(info->tracing.attach_type); break; case BPF_LINK_TYPE_CGROUP: printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id); - if (info->cgroup.attach_type < ARRAY_SIZE(attach_type_name)) - printf("attach_type %s ", - attach_type_name[info->cgroup.attach_type]); - else - printf("attach_type %u ", info->cgroup.attach_type); + show_link_attach_type_plain(info->cgroup.attach_type); break; default: break; -- cgit v1.2.3 From e948947a6e111b3d4bbe538105ee2f3611e032ad Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:42 +0200 Subject: bpftool: Support link show for netns-attached links Make `bpftool link show` aware of new link type, that is links attached to netns. When listing netns-attached links, display netns inode number as its identifier and link attach type.
Sample session: # readlink /proc/self/ns/net net:[4026532251] # bpftool prog show 357: flow_dissector tag a04f5eef06a7f555 gpl loaded_at 2020-05-30T16:53:51+0200 uid 0 xlated 16B jited 37B memlock 4096B 358: flow_dissector tag a04f5eef06a7f555 gpl loaded_at 2020-05-30T16:53:51+0200 uid 0 xlated 16B jited 37B memlock 4096B # bpftool link show 108: netns prog 357 netns_ino 4026532251 attach_type flow_dissector # bpftool link -jp show [{ "id": 108, "type": "netns", "prog_id": 357, "netns_ino": 4026532251, "attach_type": "flow_dissector" } ] (... after netns is gone ...) # bpftool link show 108: netns prog 357 netns_ino 0 attach_type flow_dissector # bpftool link -jp show [{ "id": 108, "type": "netns", "prog_id": 357, "netns_ino": 0, "attach_type": "flow_dissector" } ] Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-9-jakub@cloudflare.com --- tools/bpf/bpftool/link.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 1ff416eff3d7..fca57ee8fafe 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -17,6 +17,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_TRACING] = "tracing", [BPF_LINK_TYPE_CGROUP] = "cgroup", [BPF_LINK_TYPE_ITER] = "iter", + [BPF_LINK_TYPE_NETNS] = "netns", }; static int link_parse_fd(int *argc, char ***argv) @@ -122,6 +123,11 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) info->cgroup.cgroup_id); show_link_attach_type_json(info->cgroup.attach_type, json_wtr); break; + case BPF_LINK_TYPE_NETNS: + jsonw_uint_field(json_wtr, "netns_ino", + info->netns.netns_ino); + show_link_attach_type_json(info->netns.attach_type, json_wtr); + break; default: break; } @@ -190,6 +196,10 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id); 
show_link_attach_type_plain(info->cgroup.attach_type); break; + case BPF_LINK_TYPE_NETNS: + printf("\n\tnetns_ino %u ", info->netns.netns_ino); + show_link_attach_type_plain(info->netns.attach_type); + break; default: break; } -- cgit v1.2.3 From 1f043f87bb595bbe6c7e6b291d115284840a6c33 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:43 +0200 Subject: selftests/bpf: Add tests for attaching bpf_link to netns Extend the existing test case for flow dissector attaching to cover: - link creation, - link updates, - link info querying, - mixing links with direct prog attachment. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-10-jakub@cloudflare.com --- .../bpf/prog_tests/flow_dissector_reattach.c | 588 +++++++++++++++++++-- 1 file changed, 551 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c index 1f51ba66b98b..15cb554a66d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -18,21 +19,30 @@ #include "test_progs.h" -static bool is_attached(int netns) +static int init_net = -1; + +static __u32 query_attached_prog_id(int netns) { - __u32 cnt; + __u32 prog_ids[1] = {}; + __u32 prog_cnt = ARRAY_SIZE(prog_ids); int err; - err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, NULL, &cnt); + err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, + prog_ids, &prog_cnt); if (CHECK_FAIL(err)) { perror("bpf_prog_query"); - return true; /* fail-safe */ + return 0; } - return cnt > 0; + return prog_cnt == 1 ? 
prog_ids[0] : 0; +} + +static bool prog_is_attached(int netns) +{ + return query_attached_prog_id(netns) > 0; } -static int load_prog(void) +static int load_prog(enum bpf_prog_type type) { struct bpf_insn prog[] = { BPF_MOV64_IMM(BPF_REG_0, BPF_OK), @@ -40,61 +50,566 @@ static int load_prog(void) }; int fd; - fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog, - ARRAY_SIZE(prog), "GPL", 0, NULL, 0); + fd = bpf_load_program(type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0); if (CHECK_FAIL(fd < 0)) perror("bpf_load_program"); return fd; } -static void do_flow_dissector_reattach(void) +static __u32 query_prog_id(int prog) { - int prog_fd[2] = { -1, -1 }; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); int err; - prog_fd[0] = load_prog(); - if (prog_fd[0] < 0) - return; + err = bpf_obj_get_info_by_fd(prog, &info, &info_len); + if (CHECK_FAIL(err || info_len != sizeof(info))) { + perror("bpf_obj_get_info_by_fd"); + return 0; + } - prog_fd[1] = load_prog(); - if (prog_fd[1] < 0) - goto out_close; + return info.id; +} + +static int unshare_net(int old_net) +{ + int err, new_net; - err = bpf_prog_attach(prog_fd[0], 0, BPF_FLOW_DISSECTOR, 0); + err = unshare(CLONE_NEWNET); if (CHECK_FAIL(err)) { - perror("bpf_prog_attach-0"); - goto out_close; + perror("unshare(CLONE_NEWNET)"); + return -1; + } + new_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(new_net < 0)) { + perror("open(/proc/self/ns/net)"); + setns(old_net, CLONE_NEWNET); + return -1; } + return new_net; +} + +static void test_prog_attach_prog_attach(int netns, int prog1, int prog2) +{ + int err; + + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); /* Expect success when attaching a different program */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if 
(CHECK_FAIL(err)) { - perror("bpf_prog_attach-1"); + perror("bpf_prog_attach(prog2) #1"); goto out_detach; } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); /* Expect failure when attaching the same program twice */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(!err || errno != EINVAL)) - perror("bpf_prog_attach-2"); + perror("bpf_prog_attach(prog2) #2"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); out_detach: err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(err)) perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link1, link2; + + link1 = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when another link exists */ + errno = 0; + link2 = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link2 != -1 || errno != E2BIG)) + perror("bpf_prog_attach(prog2) expected E2BIG"); + if (link2 != -1) + close(link2); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link1); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + err = bpf_prog_attach(prog1, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when prog attached */ + errno = 0; + link = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link != -1 || errno != 
EEXIST)) + perror("bpf_link_create(prog2) expected EEXIST"); + if (link != -1) + close(link); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) + perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_attach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure attaching prog when link exists */ + errno = 0; + err = bpf_prog_attach(prog2, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(!err || errno != EEXIST)) + perror("bpf_prog_attach(prog2) expected EEXIST"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_detach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure detaching prog when link exists */ + errno = 0; + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_prog_detach expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_detach_query(int netns, int prog1, int prog2) +{ + int err; + + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != 
query_prog_id(prog1)); + + err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach"); + return; + } + + /* Expect no prog attached after successful detach */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_close_query(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + /* Expect no prog attached after closing last link FD */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_no_old_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success replacing the prog when old prog not specified */ + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_replace_old_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success F_REPLACE and old 
prog specified to succeed */ + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_opts(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail w/ old prog FD but w/o F_REPLACE*/ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) { + perror("bpf_link_update expected EINVAL"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail on old prog FD mismatch */ + errno = 0; + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog2; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EPERM)) { + perror("bpf_link_update expected EPERM"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail for invalid old prog FD */ + errno = 0; + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = -1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EBADF"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail with invalid flags */ + errno = 0; + update_opts.flags = BPF_F_ALLOW_MULTI; + 
update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + +out_close: + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, prog3; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure when new prog FD is not valid */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, -1, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EINVAL"); + goto out_close_link; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + prog3 = load_prog(BPF_PROG_TYPE_SOCKET_FILTER); + if (prog3 < 0) + goto out_close_link; + + /* Expect failure when new prog FD type doesn't match */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog3, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(prog3); +out_close_link: + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_netns_gone(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, 
&create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(netns); + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(CLONE_NEWNET)"); + close(link); + return; + } + + /* Expect failure when netns destroyed */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != ENOLINK)) + perror("bpf_link_update"); + + close(link); +} + +static void test_link_get_info(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + struct bpf_link_info info = {}; + struct stat netns_stat = {}; + __u32 info_len, link_id; + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + err = fstat(netns, &netns_stat); + if (CHECK_FAIL(err)) { + perror("stat(netns)"); + goto out_resetns; + } + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + goto out_resetns; + } + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect link info to be sane and match prog and netns details */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id == 0); + CHECK_FAIL(info.prog_id != query_prog_id(prog1)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) { + perror("bpf_link_update(prog2)"); + goto out_unlink; + } + + link_id = info.id; + info_len = sizeof(info); + err = 
bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect no info change after update except in prog id */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + /* Leave netns link is attached to and close last FD to it */ + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(NEWNET)"); + goto out_unlink; + } + close(netns); + old_net = -1; + netns = -1; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect netns_ino to change to 0 */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != 0); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + +out_unlink: + close(link); +out_resetns: + if (old_net != -1) + setns(old_net, CLONE_NEWNET); + if (netns != -1) + close(netns); +} + +static void run_tests(int netns) +{ + struct test { + const char *test_name; + void (*test_func)(int netns, int prog1, int prog2); + } tests[] = { + { "prog attach, prog attach", + test_prog_attach_prog_attach }, + { "link create, link create", + test_link_create_link_create }, + { "prog attach, link create", + test_prog_attach_link_create }, + { "link create, prog attach", + test_link_create_prog_attach }, + { "link create, prog detach", + test_link_create_prog_detach }, + { "prog attach, detach, query", + test_prog_attach_detach_query }, + { "link create, close, query", + test_link_create_close_query }, + { "link update no old prog", + test_link_update_no_old_prog 
}, + { "link update with replace old prog", + test_link_update_replace_old_prog }, + { "link update invalid opts", + test_link_update_invalid_opts }, + { "link update invalid prog", + test_link_update_invalid_prog }, + { "link update netns gone", + test_link_update_netns_gone }, + { "link get info", + test_link_get_info }, + }; + int i, progs[2] = { -1, -1 }; + char test_name[80]; + + for (i = 0; i < ARRAY_SIZE(progs); i++) { + progs[i] = load_prog(BPF_PROG_TYPE_FLOW_DISSECTOR); + if (progs[i] < 0) + goto out_close; + } + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + snprintf(test_name, sizeof(test_name), + "flow dissector %s%s", + tests[i].test_name, + netns == init_net ? " (init_net)" : ""); + if (test__start_subtest(test_name)) + tests[i].test_func(netns, progs[0], progs[1]); + } out_close: - close(prog_fd[1]); - close(prog_fd[0]); + for (i = 0; i < ARRAY_SIZE(progs); i++) { + if (progs[i] != -1) + CHECK_FAIL(close(progs[i])); + } } void test_flow_dissector_reattach(void) { - int init_net, self_net, err; + int err, new_net, saved_net; - self_net = open("/proc/self/ns/net", O_RDONLY); - if (CHECK_FAIL(self_net < 0)) { + saved_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(saved_net < 0)) { perror("open(/proc/self/ns/net"); return; } @@ -111,30 +626,29 @@ void test_flow_dissector_reattach(void) goto out_close; } - if (is_attached(init_net)) { + if (prog_is_attached(init_net)) { test__skip(); printf("Can't test with flow dissector attached to init_net\n"); goto out_setns; } /* First run tests in root network namespace */ - do_flow_dissector_reattach(); + run_tests(init_net); /* Then repeat tests in a non-root namespace */ - err = unshare(CLONE_NEWNET); - if (CHECK_FAIL(err)) { - perror("unshare(CLONE_NEWNET)"); + new_net = unshare_net(init_net); + if (new_net < 0) goto out_setns; - } - do_flow_dissector_reattach(); + run_tests(new_net); + close(new_net); out_setns: /* Move back to netns we started in. 
*/ - err = setns(self_net, CLONE_NEWNET); + err = setns(saved_net, CLONE_NEWNET); if (CHECK_FAIL(err)) perror("setns(/proc/self/ns/net)"); out_close: close(init_net); - close(self_net); + close(saved_net); } -- cgit v1.2.3 From b8215dce7dfd817ca38807f55165bf502146cd68 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:44 +0200 Subject: selftests/bpf, flow_dissector: Close TAP device FD after the test test_flow_dissector leaves a TAP device after it's finished, potentially interfering with other tests that will run after it. Fix it by closing the TAP descriptor on cleanup. Fixes: 0905beec9f52 ("selftests/bpf: run flow dissector tests in skb-less mode") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-11-jakub@cloudflare.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 2301c4d3ecec..ef5aab2f60b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -524,6 +524,7 @@ void test_flow_dissector(void) CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); } + close(tap_fd); bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); bpf_object__close(obj); } -- cgit v1.2.3 From b4b8a3bf9ef0fbbf343b624d68ea328dd4edd5c4 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:45 +0200 Subject: selftests/bpf: Convert test_flow_dissector to use BPF skeleton Switch flow dissector test setup from custom BPF object loader to BPF skeleton to save boilerplate and prepare for testing higher-level API for attaching flow dissector with bpf_link. 
To avoid depending on program order in the BPF object when populating the flow dissector PROG_ARRAY map, change the program section names to contain the program index into the map. This follows the example set by tailcall tests. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-12-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/flow_dissector.c | 50 +++++++++++++++++++--- tools/testing/selftests/bpf/progs/bpf_flow.c | 20 ++++----- 2 files changed, 55 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index ef5aab2f60b5..b6370c0b3b7a 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -6,6 +6,8 @@ #include #include +#include "bpf_flow.skel.h" + #ifndef IP_MF #define IP_MF 0x2000 #endif @@ -444,17 +446,54 @@ static int ifup(const char *ifname) return 0; } +static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array) +{ + int i, err, map_fd, prog_fd; + struct bpf_program *prog; + char prog_name[32]; + + map_fd = bpf_map__fd(prog_array); + if (map_fd < 0) + return -1; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "flow_dissector/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) + return -1; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) + return -1; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (err) + return -1; + } + return 0; +} + void test_flow_dissector(void) { int i, err, prog_fd, keys_fd = -1, tap_fd; - struct bpf_object *obj; + struct bpf_flow *skel; __u32 duration = 0; - err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector", - "jmp_table", "last_dissection", &prog_fd, &keys_fd); - if (CHECK_FAIL(err)) + skel = 
bpf_flow__open_and_load(); + if (CHECK(!skel, "skel", "failed to open/load skeleton\n")) return; + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + goto out_destroy_skel; + keys_fd = bpf_map__fd(skel->maps.last_dissection); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + goto out_destroy_skel; + err = init_prog_array(skel->obj, skel->maps.jmp_table); + if (CHECK(err, "init_prog_array", "err %d\n", err)) + goto out_destroy_skel; + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_flow_keys flow_keys; struct bpf_prog_test_run_attr tattr = { @@ -526,5 +565,6 @@ void test_flow_dissector(void) close(tap_fd); bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); - bpf_object__close(obj); +out_destroy_skel: + bpf_flow__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 9941f0ba471e..de6de9221518 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -20,20 +20,20 @@ #include int _version SEC("version") = 1; -#define PROG(F) SEC(#F) int bpf_func_##F +#define PROG(F) PROG_(F, _##F) +#define PROG_(NUM, NAME) SEC("flow_dissector/"#NUM) int bpf_func##NAME /* These are the identifiers of the BPF programs that will be used in tail * calls. Name is limited to 16 characters, with the terminating character and * bpf_func_ above, we have only 6 to work with, anything after will be cropped. */ -enum { - IP, - IPV6, - IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */ - IPV6FR, /* Fragmentation IPv6 Extension Header */ - MPLS, - VLAN, -}; +#define IP 0 +#define IPV6 1 +#define IPV6OP 2 /* Destination/Hop-by-Hop Options IPv6 Ext. 
Header */ +#define IPV6FR 3 /* Fragmentation IPv6 Extension Header */ +#define MPLS 4 +#define VLAN 5 +#define MAX_PROG 6 #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF @@ -59,7 +59,7 @@ struct frag_hdr { struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); - __uint(max_entries, 8); + __uint(max_entries, MAX_PROG); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); -- cgit v1.2.3 From 06716e04a043aa5e010f952a823ad038054b0e5c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:46 +0200 Subject: selftests/bpf: Extend test_flow_dissector to cover link creation Extend the existing flow_dissector test case to run tests once using direct prog attachments, and then for the second time using indirect attachment via link. The intention is to exercise the newly added high-level API for attaching programs to network namespace with links (bpf_program__attach_netns). Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-13-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/flow_dissector.c | 115 +++++++++++++++------ 1 file changed, 82 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index b6370c0b3b7a..ea14e3ece812 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -103,6 +103,7 @@ struct test { #define VLAN_HLEN 4 +static __u32 duration; struct test tests[] = { { .name = "ipv4", @@ -474,11 +475,87 @@ static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array) return 0; } +static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) +{ + int i, err, keys_fd; + + keys_fd = bpf_map__fd(keys); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + /* Keep
in sync with 'flags' from eth_get_headlen. */ + __u32 eth_get_headlen_flags = + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; + struct bpf_prog_test_run_attr tattr = {}; + struct bpf_flow_keys flow_keys = {}; + __u32 key = (__u32)(tests[i].keys.sport) << 16 | + tests[i].keys.dport; + + /* For skb-less case we can't pass input flags; run + * only the tests that have a matching set of flags. + */ + + if (tests[i].flags != eth_get_headlen_flags) + continue; + + err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); + CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); + CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); + + CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); + CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); + + err = bpf_map_delete_elem(keys_fd, &key); + CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); + } +} + +static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd) +{ + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + return; + + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno)) + return; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); + CHECK(err, "bpf_prog_detach", "err %d errno %d\n", err, errno); +} + +static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) +{ + struct bpf_link *link; + int err, net_fd; + + net_fd = open("/proc/self/ns/net", O_RDONLY); + if (CHECK(net_fd < 0, "open(/proc/self/ns/net)", "err %d\n", errno)) + return; + + link = bpf_program__attach_netns(skel->progs._dissect, net_fd); + if (CHECK(IS_ERR(link), "attach_netns", "err %ld\n", PTR_ERR(link))) + goto out_close; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = 
bpf_link__destroy(link); + CHECK(err, "bpf_link__destroy", "err %d\n", err); +out_close: + close(net_fd); +} + void test_flow_dissector(void) { int i, err, prog_fd, keys_fd = -1, tap_fd; struct bpf_flow *skel; - __u32 duration = 0; skel = bpf_flow__open_and_load(); if (CHECK(!skel, "skel", "failed to open/load skeleton\n")) @@ -526,45 +603,17 @@ void test_flow_dissector(void) * via BPF map in this case. */ - err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); - CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno); - tap_fd = create_tap("tap0"); CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d\n", tap_fd, errno); err = ifup("tap0"); CHECK(err, "ifup", "err %d errno %d\n", err, errno); - for (i = 0; i < ARRAY_SIZE(tests); i++) { - /* Keep in sync with 'flags' from eth_get_headlen. */ - __u32 eth_get_headlen_flags = - BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; - struct bpf_prog_test_run_attr tattr = {}; - struct bpf_flow_keys flow_keys = {}; - __u32 key = (__u32)(tests[i].keys.sport) << 16 | - tests[i].keys.dport; - - /* For skb-less case we can't pass input flags; run - * only the tests that have a matching set of flags. 
- */ - - if (tests[i].flags != eth_get_headlen_flags) - continue; - - err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); - CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); - - err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); - CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); - - CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); - CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); - - err = bpf_map_delete_elem(keys_fd, &key); - CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); - } + /* Test direct prog attachment */ + test_skb_less_prog_attach(skel, tap_fd); + /* Test indirect prog attachment via link */ + test_skb_less_link_create(skel, tap_fd); close(tap_fd); - bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); out_destroy_skel: bpf_flow__destroy(skel); } -- cgit v1.2.3 From 836e66c218f355ec01ba57671c85abf32961dcea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:32 +0200 Subject: bpf: Fix up bpf_skb_adjust_room helper's skb csum setting Lorenz recently reported: In our TC classifier cls_redirect [0], we use the following sequence of helper calls to decapsulate a GUE (basically IP + UDP + custom header) encapsulated packet: bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO) bpf_redirect(skb->ifindex, BPF_F_INGRESS) It seems like some checksums of the inner headers are not validated in this case. For example, a TCP SYN packet with invalid TCP checksum is still accepted by the network stack and elicits a SYN ACK. [...] That is, we receive the following packet from the driver: | ETH | IP | UDP | GUE | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY ip_summed is CHECKSUM_UNNECESSARY because our NICs do rx checksum offloading. On this packet we run skb_adjust_room_mac(-encap_len), and get the following: | ETH | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY Note that ip_summed is still CHECKSUM_UNNECESSARY. 
After bpf_redirect()'ing into the ingress, we end up in tcp_v4_rcv(). There, skb_checksum_init() is turned into a no-op due to CHECKSUM_UNNECESSARY. The bpf_skb_adjust_room() helper is not aware of protocol specifics. Internally, it handles the CHECKSUM_COMPLETE case via skb_postpull_rcsum(), but that does not cover CHECKSUM_UNNECESSARY. In this case skb->csum_level of the original skb prior to bpf_skb_adjust_room() call was 0, that is, covering UDP. Right now there is no way to adjust the skb->csum_level. NICs that have checksum offload disabled (CHECKSUM_NONE) or that support CHECKSUM_COMPLETE are not affected. Use a safe default for CHECKSUM_UNNECESSARY by resetting to CHECKSUM_NONE and add a flag to the helper called BPF_F_ADJ_ROOM_NO_CSUM_RESET that allows users to opt out. Opting out is useful for the case where we don't remove/add full protocol headers, or for the case where a user wants to adjust the csum level manually e.g. through the bpf_csum_level() helper that is added in a subsequent patch. The bpf_skb_proto_{4_to_6,6_to_4}() for NAT64/46 translation from the BPF bpf_skb_change_proto() helper uses the bpf_skb_net_hdr_{push,pop}() pair internally as well but doesn't change layers, only transitions from v4 to v6 and vice versa, therefore no adaptation is required there.
[0] https://lore.kernel.org/bpf/20200424185556.7358-1-lmb@cloudflare.com/ Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Reported-by: Lorenz Bauer Reported-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/CACAyw9-uU_52esMd1JjuA80fRPHJv5vsSg8GnfW3t_qDU4aVKQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/11a90472e7cce83e76ddbfce81fdfce7bfc68808.1591108731.git.daniel@iogearbox.net --- include/linux/skbuff.h | 8 ++++++++ include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a0d5c2760103..0c0377fc00c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3919,6 +3919,14 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->ip_summed = CHECKSUM_NONE; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. 
+ * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ae82bcb03124..278dcc0af961 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3113,7 +3113,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3163,7 +3164,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3191,6 +3193,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. 
This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { -- cgit v1.2.3 From 7cdec54f9713256bb170873a1fc5c75c9127c9d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:33 +0200 Subject: bpf: Add csum_level helper for fixing up csum levels Add a bpf_csum_level() helper which BPF programs can use in combination with bpf_skb_adjust_room() when they pass in BPF_F_ADJ_ROOM_NO_CSUM_RESET flag to the latter to avoid falling back to CHECKSUM_NONE. The bpf_csum_level() allows to adjust CHECKSUM_UNNECESSARY skb->csum_levels via BPF_CSUM_LEVEL_{INC,DEC} which calls __skb_{incr,decr}_checksum_unnecessary() on the skb. The helper also allows a BPF_CSUM_LEVEL_RESET which sets the skb's csum to CHECKSUM_NONE as well as a BPF_CSUM_LEVEL_QUERY to just return the current level. Without this helper, there is no way to otherwise adjust the skb->csum_level. I did not add an extra dummy flags as there is plenty of free bitspace in level argument itself iff ever needed in future. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/279ae3717cb3d03c0ffeb511493c93c450a01e1a.1591108731.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 38 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. 
+ * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), diff --git a/net/core/filter.c b/net/core/filter.c index 278dcc0af961..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2015,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? 
+ skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -6280,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6613,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. 
For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. 
*/ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), -- cgit v1.2.3 From c4ba153b6501fa7ccfdc7e57946fb1d6011e36e8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:34 +0200 Subject: bpf, selftests: Adapt cls_redirect to call csum_level helper Adapt bpf_skb_adjust_room() to pass in BPF_F_ADJ_ROOM_NO_CSUM_RESET flag and use the new bpf_csum_level() helper to inc/dec the checksum level by one after the encap/decap. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/e7458f10e3f3d795307cbc5ad870112671d9c6f7.1591108731.git.daniel@iogearbox.net --- tools/testing/selftests/bpf/progs/test_cls_redirect.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index 1668b993eb86..f0b72e86bee5 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -380,9 +380,10 @@ static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) } if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, - BPF_F_ADJ_ROOM_FIXED_GSO)) { + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) return TC_ACT_SHOT; - } return bpf_redirect(skb->ifindex, BPF_F_INGRESS); } @@ -472,7 +473,9 @@ static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, } if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, - BPF_F_ADJ_ROOM_FIXED_GSO)) { + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { metrics->errors_total_encap_adjust_failed++; return TC_ACT_SHOT; } -- cgit v1.2.3 From 9a5f25ad30e5bb40a2e0c61c991594d3e6529c0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Jun 2020 22:03:49 -0700 Subject: selftests/bpf: Fix sample_cnt shared between 
two threads Make sample_cnt volatile to fix a possible selftest failure due to a compiler optimization preventing the latest sample_cnt value from being visible to the main thread. sample_cnt is incremented in a background thread, which is then joined into the main thread. So in terms of visibility the sample_cnt update is ok. But because it's not volatile, the compiler might make optimizations that would prevent the main thread from seeing the latest updated value. Fix this by marking the global variable volatile. Fixes: cb1c9ddd5525 ("selftests/bpf: Add BPF ringbuf selftests") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200602050349.215037-1-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index bb8541f240e2..2bba908dfa63 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -25,7 +25,7 @@ struct sample { char comm[16]; }; -static int sample_cnt; +static volatile int sample_cnt; static int process_sample(void *ctx, void *data, size_t len) { -- cgit v1.2.3 From 7cec0b927142f510a1fac88033017616cce44c26 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Jun 2020 11:57:43 -0700 Subject: selftests/bpf: Fix verifier test Adjust verifier test due to addition of new field.
Fixes: c3c16f2ea6d2 ("bpf: Add rx_queue_mapping to bpf_sock") Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index 0bc51ad9e0fb..b1aac2641498 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -222,7 +222,7 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, state)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, rx_queue_mapping)), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, -- cgit v1.2.3 From 9bc499befeef07a4d79f4924bfca05634ad8fc97 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:44:48 +0200 Subject: bpf, selftests: Use bpf_probe_read_kernel Since commit 0ebeea8ca8a4 ("bpf: Restrict bpf_probe_read{, str}() only to archs where they work") 44 verifier tests fail on s390 due to not having bpf_probe_read anymore. Fix by using bpf_probe_read_kernel. 
Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602174448.2501214-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/verifier/const_or.c | 8 ++-- .../selftests/bpf/verifier/helper_access_var_len.c | 44 +++++++++++----------- .../selftests/bpf/verifier/helper_value_access.c | 36 +++++++++--------- tools/testing/selftests/bpf/verifier/precise.c | 8 ++-- 4 files changed, 48 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/const_or.c b/tools/testing/selftests/bpf/verifier/const_or.c index 84446dfc7c1d..6c214c58e8d4 100644 --- a/tools/testing/selftests/bpf/verifier/const_or.c +++ b/tools/testing/selftests/bpf/verifier/const_or.c @@ -6,7 +6,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -20,7 +20,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", @@ -36,7 +36,7 @@ BPF_MOV64_IMM(BPF_REG_4, 13), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -51,7 +51,7 @@ BPF_MOV64_IMM(BPF_REG_4, 24), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 5a605ae131a9..87c4e7900083 100644 --- 
a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -36,7 +36,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid indirect read from stack off -64+0 size 64", @@ -55,7 +55,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -84,7 +84,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -112,7 +112,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -132,7 +132,7 @@ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 3), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -152,7 +152,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -171,7 +171,7 @@ 
BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -190,7 +190,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -208,7 +208,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -233,7 +233,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -259,7 +259,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -286,7 +286,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -313,7 +313,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -468,7 +468,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), 
BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -481,7 +481,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -495,7 +495,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -513,7 +513,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -534,7 +534,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -554,7 +554,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -580,7 +580,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, @@ -607,7 +607,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 32), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 32), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c 
index 961f28139b96..1c7882ddfa63 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -10,7 +10,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -29,7 +29,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -67,7 +67,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -87,7 +87,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -109,7 +109,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -129,7 +129,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -170,7 +170,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, 
.fixup_map_hash_48b = { 3 }, @@ -191,7 +191,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -212,7 +212,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -235,7 +235,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -256,7 +256,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -300,7 +300,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -322,7 +322,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -344,7 +344,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -368,7 +368,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - 
BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -390,7 +390,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -433,7 +433,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -458,7 +458,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 02151f8c940f..6dc8003ffc70 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -31,14 +31,14 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, .fixup_map_array_48b = { 1 }, .result = VERBOSE_ACCEPT, .errstr = - "26: (85) call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 20\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ @@ -91,7 +91,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -99,7 +99,7 @@ .result = VERBOSE_ACCEPT, .flags = BPF_F_TEST_STATE_FREQ, .errstr = - "26: (85) 
call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 22\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ -- cgit v1.2.3 From d70a6be1e2ab98f13688e4a529b326e8e11230d0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:56:48 +0200 Subject: tools/bpf: Don't use $(COMPILE.c) When using make kselftest TARGETS=bpf, tools/bpf is built with MAKEFLAGS=rR, which causes $(COMPILE.c) to be undefined, which in turn causes the build to fail with CC kselftest/bpf/tools/build/bpftool/map_perf_ring.o /bin/sh: 1: -MMD: not found Fix by using $(CC) $(CFLAGS) -c instead of $(COMPILE.c). Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602175649.2501580-2-iii@linux.ibm.com --- tools/bpf/Makefile | 6 +++--- tools/bpf/bpftool/Makefile | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index f897eeeb0b4f..77472e28c8fd 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -64,12 +64,12 @@ $(OUTPUT)%.lex.c: $(srctree)/tools/bpf/%.l $(QUIET_FLEX)$(LEX) -o $@ $< $(OUTPUT)%.o: $(srctree)/tools/bpf/%.c - $(QUIET_CC)$(COMPILE.c) -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $< $(OUTPUT)%.yacc.o: $(OUTPUT)%.yacc.c - $(QUIET_CC)$(COMPILE.c) -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $< $(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c - $(QUIET_CC)$(COMPILE.c) -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $< PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 2759f9cc3289..9e85f101be85 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -126,7 +126,7 @@ else endif $(OUTPUT)_prog.o: prog.c - $(QUIET_CC)$(COMPILE.c) -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -DBPFTOOL_WITHOUT_SKELETONS -o $@ $< $(OUTPUT)_bpftool: $(_OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) 
$(LDFLAGS) -o $@ $(_OBJS) $(LIBS) @@ -141,10 +141,10 @@ profiler.skel.h: $(OUTPUT)_bpftool skeleton/profiler.bpf.o $(QUIET_GEN)$(OUTPUT)./_bpftool gen skeleton skeleton/profiler.bpf.o > $@ $(OUTPUT)prog.o: prog.c profiler.skel.h - $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c - $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< $(OUTPUT)feature.o: | zdep @@ -152,7 +152,7 @@ $(OUTPUT)bpftool: $(__OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(__OBJS) $(LIBS) $(OUTPUT)%.o: %.c - $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< clean: $(LIBBPF)-clean $(call QUIET_CLEAN, bpftool) -- cgit v1.2.3 From e7ad28e6fdbffa2b9b1bd376431fb81a5403bcfd Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 2 Jun 2020 19:56:49 +0200 Subject: selftests/bpf: Add a default $(CXX) value When using make kselftest TARGETS=bpf, tools/bpf is built with MAKEFLAGS=rR, which causes $(CXX) to be undefined, which in turn causes the build to fail with CXX test_cpp /bin/sh: 2: g: not found Fix by adding a default $(CXX) value, like tools/build/feature/Makefile already does. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200602175649.2501580-3-iii@linux.ibm.com --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3ce548eff8a8..22aaec74ea0a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -2,6 +2,8 @@ include ../../../../scripts/Kbuild.include include ../../../scripts/Makefile.arch +CXX ?= $(CROSS_COMPILE)g++ + CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) 
LIBDIR := $(TOOLSDIR)/lib -- cgit v1.2.3 From 065fcfd49763ec71ae345bb5c5a74f961031e70e Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 2 Jun 2020 15:38:37 -0300 Subject: selftests: net: ip_defrag: ignore EPERM When running with conntrack rules, the dropped overlap fragments may cause EPERM to be returned to sendto. Instead of completely failing, just ignore those errors and continue. If this causes packets with overlap fragments to be dropped as expected, that is okay. And if it causes packets that are expected to be received to be dropped, which should not happen, it will be detected as failure. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: David S. Miller --- tools/testing/selftests/net/ip_defrag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c index c0c9ecb891e1..f9ed749fd8c7 100644 --- a/tools/testing/selftests/net/ip_defrag.c +++ b/tools/testing/selftests/net/ip_defrag.c @@ -192,9 +192,9 @@ static void send_fragment(int fd_raw, struct sockaddr *addr, socklen_t alen, } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "send_fragment"); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "send_fragment: %d vs %d", res, frag_len); frag_counter++; @@ -313,9 +313,9 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, iphdr->ip_len = htons(frag_len); } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "sendto overlap: %d", frag_len); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len); frag_counter++; } -- cgit v1.2.3