summaryrefslogtreecommitdiff
path: root/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-10-04 23:38:03 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2022-10-04 23:38:03 +0300
commit0326074ff4652329f2a1a9c8685104576bd8d131 (patch)
tree9a7574c7ccb05bf4c7cb34fc5a65457bb8f495cb /tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh
parent522667b24f08009591c90e75bfe2ffb67f555498 (diff)
parent681bf011b9b5989c6e9db6beb64494918aab9a43 (diff)
downloadlinux-0326074ff4652329f2a1a9c8685104576bd8d131.tar.xz
Merge tag 'net-next-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - Introduce and use a single page frag cache for allocating small skb heads, clawing back the 10-20% performance regression in UDP flood test from previous fixes. - Run packets which already went thru HW coalescing thru SW GRO. This significantly improves TCP segment coalescing and simplifies deployments as different workloads benefit from HW or SW GRO. - Shrink the size of the base zero-copy send structure. - Move TCP init under a new slow / sleepable version of DO_ONCE(). BPF: - Add BPF-specific, any-context-safe memory allocator. - Add helpers/kfuncs for PKCS#7 signature verification from BPF programs. - Define a new map type and related helpers for user space -> kernel communication over a ring buffer (BPF_MAP_TYPE_USER_RINGBUF). - Allow targeting BPF iterators to loop through resources of one task/thread. - Add ability to call selected destructive functions. Expose crash_kexec() to allow BPF to trigger a kernel dump. Use CAP_SYS_BOOT check on the loading process to judge permissions. - Enable BPF to collect custom hierarchical cgroup stats efficiently by integrating with the rstat framework. - Support struct arguments for trampoline based programs. Only structs with size <= 16B and x86 are supported. - Invoke cgroup/connect{4,6} programs for unprivileged ICMP ping sockets (instead of just TCP and UDP sockets). - Add a helper for accessing CLOCK_TAI for time sensitive network related programs. - Support accessing network tunnel metadata's flags. - Make TCP SYN ACK RTO tunable by BPF programs with TCP Fast Open. - Add support for writing to Netfilter's nf_conn:mark. Protocols: - WiFi: more Extremely High Throughput (EHT) and Multi-Link Operation (MLO) work (802.11be, WiFi 7). - vsock: improve support for SO_RCVLOWAT. - SMC: support SO_REUSEPORT. - Netlink: define and document how to use netlink in a "modern" way. Support reporting missing attributes via extended ACK. - IPSec: support collect metadata mode for xfrm interfaces. - TCPv6: send consistent autoflowlabel in SYN_RECV state and RST packets. - TCP: introduce optional per-netns connection hash table to allow better isolation between namespaces (opt-in, at the cost of memory and cache pressure). - MPTCP: support TCP_FASTOPEN_CONNECT. - Add NEXT-C-SID support in Segment Routing (SRv6) End behavior. - Adjust IP_UNICAST_IF sockopt behavior for connected UDP sockets. - Open vSwitch: - Allow specifying ifindex of new interfaces. - Allow conntrack and metering in non-initial user namespace. - TLS: support the Korean ARIA-GCM crypto algorithm. - Remove DECnet support. Driver API: - Allow selecting the conduit interface used by each port in DSA switches, at runtime. - Ethernet Power Sourcing Equipment and Power Device support. - Add tc-taprio support for queueMaxSDU parameter, i.e. setting per traffic class max frame size for time-based packet schedules. - Support PHY rate matching - adapting between differing host-side and link-side speeds. - Introduce QUSGMII PHY mode and 1000BASE-KX interface mode. - Validate OF (device tree) nodes for DSA shared ports; make phylink-related properties mandatory on DSA and CPU ports. Enforcing more uniformity should allow transitioning to phylink. - Require that flash component name used during update matches one of the components for which version is reported by info_get(). - Remove "weight" argument from driver-facing NAPI API as much as possible. It's one of those magic knobs which seemed like a good idea at the time but is too indirect to use in practice. - Support offload of TLS connections with 256 bit keys. New hardware / drivers: - Ethernet: - Microchip KSZ9896 6-port Gigabit Ethernet Switch - Renesas Ethernet AVB (EtherAVB-IF) Gen4 SoCs - Analog Devices ADIN1110 and ADIN2111 industrial single pair Ethernet (10BASE-T1L) MAC+PHY. - Rockchip RV1126 Gigabit Ethernet (a version of stmmac IP). - Ethernet SFPs / modules: - RollBall / Hilink / Turris 10G copper SFPs - HALNy GPON module - WiFi: - CYW43439 SDIO chipset (brcmfmac) - CYW89459 PCIe chipset (brcmfmac) - BCM4378 on Apple platforms (brcmfmac) Drivers: - CAN: - gs_usb: HW timestamp support - Ethernet PHYs: - lan8814: cable diagnostics - Ethernet NICs: - Intel (100G): - implement control of FCS/CRC stripping - port splitting via devlink - L2TPv3 filtering offload - nVidia/Mellanox: - tunnel offload for sub-functions - MACSec offload, w/ Extended packet number and replay window offload - significantly restructure, and optimize the AF_XDP support, align the behavior with other vendors - Huawei: - configuring DSCP map for traffic class selection - querying standard FEC statistics - querying SerDes lane number via ethtool - Marvell/Cavium: - egress priority flow control - MACSec offload - AMD/SolarFlare: - PTP over IPv6 and raw Ethernet - small / embedded: - ax88772: convert to phylink (to support SFP cages) - altera: tse: convert to phylink - ftgmac100: support fixed link - enetc: standard Ethtool counters - macb: ZynqMP SGMII dynamic configuration support - tsnep: support multi-queue and use page pool - lan743x: Rx IP & TCP checksum offload - igc: add xdp frags support to ndo_xdp_xmit - Ethernet high-speed switches: - Marvell (prestera): - support SPAN port features (traffic mirroring) - nexthop object offloading - Microchip (sparx5): - multicast forwarding offload - QoS queuing offload (tc-mqprio, tc-tbf, tc-ets) - Ethernet embedded switches: - Marvell (mv88e6xxx): - support RGMII cmode - NXP (felix): - standardized ethtool counters - Microchip (lan966x): - QoS queuing offload (tc-mqprio, tc-tbf, tc-cbs, tc-ets) - traffic policing and mirroring - link aggregation / bonding offload - QUSGMII PHY mode support - Qualcomm 802.11ax WiFi (ath11k): - cold boot calibration support on WCN6750 - support to connect to a non-transmit MBSSID AP profile - enable remain-on-channel support on WCN6750 - Wake-on-WLAN support for WCN6750 - support to provide transmit power from firmware via nl80211 - support to get power save duration for each client - spectral scan support for 160 MHz - MediaTek WiFi (mt76): - WiFi-to-Ethernet bridging offload for MT7986 chips - RealTek WiFi (rtw89): - P2P support" * tag 'net-next-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1864 commits) eth: pse: add missing static inlines once: rename _SLOW to _SLEEPABLE net: pse-pd: add regulator based PSE driver dt-bindings: net: pse-dt: add bindings for regulator based PoDL PSE controller ethtool: add interface to interact with Ethernet Power Equipment net: mdiobus: search for PSE nodes by parsing PHY nodes. net: mdiobus: fwnode_mdiobus_register_phy() rework error handling net: add framework to support Ethernet PSE and PDs devices dt-bindings: net: phy: add PoDL PSE property net: marvell: prestera: Propagate nh state from hw to kernel net: marvell: prestera: Add neighbour cache accounting net: marvell: prestera: add stub handler neighbour events net: marvell: prestera: Add heplers to interact with fib_notifier_info net: marvell: prestera: Add length macros for prestera_ip_addr net: marvell: prestera: add delayed wq and flush wq on deinit net: marvell: prestera: Add strict cleanup of fib arbiter net: marvell: prestera: Add cleanup of allocated fib_nodes net: marvell: prestera: Add router nexthops ABI eth: octeon: fix build after netif_napi_add() changes net/mlx5: E-Switch, Return EBUSY if can't get mode lock ...
Diffstat (limited to 'tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh')
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh282
1 files changed, 282 insertions, 0 deletions
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh
new file mode 100755
index 000000000000..5ac4f795e333
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_max_descriptors.sh
@@ -0,0 +1,282 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test sends many small packets (size is less than cell size) through the
+# switch. A shaper is used in $swp2, so the traffic is limited there. Packets
+# are queued till they will be sent.
+#
+# The idea is to verify that the switch can handle at least 85% of maximum
+# supported descrpitors by hardware. Then, we verify that the driver configures
+# firmware to allow infinite size of egress descriptor pool, and does not use a
+# lower limitation. Increase the size of the relevant pools such that the pool's
+# size does not limit the traffic.
+
+# +-----------------------+
+# | H1 |
+# | + $h1.111 |
+# | | 192.0.2.33/28 |
+# | | |
+# | + $h1 |
+# +---|-------------------+
+# |
+# +---|-----------------------------+
+# | + $swp1 |
+# | | iPOOL1 |
+# | | |
+# | +-|------------------------+ |
+# | | + $swp1.111 | |
+# | | | |
+# | | BR1 | |
+# | | | |
+# | | + $swp2.111 | |
+# | +-|------------------------+ |
+# | | |
+# | + $swp2 |
+# | | ePOOL6 |
+# | | 1mbit |
+# +---+-----------------------------+
+# |
+# +---|-------------------+
+# | + $h2 H2 |
+# | | |
+# | + $h2.111 |
+# | 192.0.2.34/28 |
+# +-----------------------+
+#
+
+ALL_TESTS="
+ ping_ipv4
+ max_descriptors
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source mlxsw_lib.sh
+
+MAX_POOL_SIZE=$(devlink_pool_size_get)
+SHAPER_RATE=1mbit
+
+# The current TBF qdisc interface does not allow us to configure the shaper to
+# flat zero. The ASIC shaper is guaranteed to work with a granularity of
+# 200Mbps. On Spectrum-2, writing a value close to zero instead of zero works
+# well, but the performance on Spectrum-1 is unpredictable. Thus, do not run the
+# test on Spectrum-1.
+mlxsw_only_on_spectrum 2+ || exit
+
+h1_create()
+{
+ simple_if_init $h1
+
+ vlan_create $h1 111 v$h1 192.0.2.33/28
+ ip link set dev $h1.111 type vlan egress-qos-map 0:1
+}
+
+h1_destroy()
+{
+ vlan_destroy $h1 111
+
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+
+ vlan_create $h2 111 v$h2 192.0.2.34/28
+}
+
+h2_destroy()
+{
+ vlan_destroy $h2 111
+
+ simple_if_fini $h2
+}
+
+switch_create()
+{
+ # pools
+ # -----
+
+ devlink_pool_size_thtype_save 1
+ devlink_pool_size_thtype_save 6
+
+ devlink_port_pool_th_save $swp1 1
+ devlink_port_pool_th_save $swp2 6
+
+ devlink_tc_bind_pool_th_save $swp1 1 ingress
+ devlink_tc_bind_pool_th_save $swp2 1 egress
+
+ devlink_pool_size_thtype_set 1 dynamic $MAX_POOL_SIZE
+ devlink_pool_size_thtype_set 6 static $MAX_POOL_SIZE
+
+ # $swp1
+ # -----
+
+ ip link set dev $swp1 up
+ vlan_create $swp1 111
+ ip link set dev $swp1.111 type vlan ingress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp1 1 16
+ devlink_tc_bind_pool_th_set $swp1 1 ingress 1 16
+
+ tc qdisc replace dev $swp1 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+ dcb buffer set dev $swp1 prio-buffer all:0 1:1
+
+ # $swp2
+ # -----
+
+ ip link set dev $swp2 up
+ vlan_create $swp2 111
+ ip link set dev $swp2.111 type vlan egress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp2 6 $MAX_POOL_SIZE
+ devlink_tc_bind_pool_th_set $swp2 1 egress 6 $MAX_POOL_SIZE
+
+ tc qdisc replace dev $swp2 root handle 1: tbf rate $SHAPER_RATE \
+ burst 128K limit 500M
+ tc qdisc replace dev $swp2 parent 1:1 handle 11: \
+ ets bands 8 strict 8 priomap 7 6
+
+ # bridge
+ # ------
+
+ ip link add name br1 type bridge vlan_filtering 0
+ ip link set dev $swp1.111 master br1
+ ip link set dev br1 up
+
+ ip link set dev $swp2.111 master br1
+}
+
+switch_destroy()
+{
+ # Do this first so that we can reset the limits to values that are only
+ # valid for the original static / dynamic setting.
+ devlink_pool_size_thtype_restore 6
+ devlink_pool_size_thtype_restore 1
+
+ # bridge
+ # ------
+
+ ip link set dev $swp2.111 nomaster
+
+ ip link set dev br1 down
+ ip link set dev $swp1.111 nomaster
+ ip link del dev br1
+
+ # $swp2
+ # -----
+
+ tc qdisc del dev $swp2 parent 1:1 handle 11:
+ tc qdisc del dev $swp2 root
+
+ devlink_tc_bind_pool_th_restore $swp2 1 egress
+ devlink_port_pool_th_restore $swp2 6
+
+ vlan_destroy $swp2 111
+ ip link set dev $swp2 down
+
+ # $swp1
+ # -----
+
+ dcb buffer set dev $swp1 prio-buffer all:0
+ tc qdisc del dev $swp1 root
+
+ devlink_tc_bind_pool_th_restore $swp1 1 ingress
+ devlink_port_pool_th_restore $swp1 1
+
+ vlan_destroy $swp1 111
+ ip link set dev $swp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.34 " h1->h2"
+}
+
+percentage_used()
+{
+ local num_packets=$1; shift
+ local max_packets=$1; shift
+
+ bc <<< "
+ scale=2
+ 100 * $num_packets / $max_packets
+ "
+}
+
+max_descriptors()
+{
+ local cell_size=$(devlink_cell_size_get)
+ local exp_perc_used=85
+ local max_descriptors
+ local pktsize=30
+
+ RET=0
+
+ max_descriptors=$(mlxsw_max_descriptors_get) || exit 1
+
+ local d0=$(ethtool_stats_get $swp2 tc_no_buffer_discard_uc_tc_1)
+
+ log_info "Send many small packets, packet size = $pktsize bytes"
+ start_traffic_pktsize $pktsize $h1.111 192.0.2.33 192.0.2.34 $h2mac
+
+ # Sleep to wait for congestion.
+ sleep 5
+
+ local d1=$(ethtool_stats_get $swp2 tc_no_buffer_discard_uc_tc_1)
+ ((d1 == d0))
+ check_err $? "Drops seen on egress port: $d0 -> $d1 ($((d1 - d0)))"
+
+ # Check how many packets the switch can handle, the limitation is
+ # maximum descriptors.
+ local pkts_bytes=$(ethtool_stats_get $swp2 tc_transmit_queue_tc_1)
+ local pkts_num=$((pkts_bytes / cell_size))
+ local perc_used=$(percentage_used $pkts_num $max_descriptors)
+
+ check_err $(bc <<< "$perc_used < $exp_perc_used") \
+ "Expected > $exp_perc_used% of descriptors, handle $perc_used%"
+
+ stop_traffic
+ sleep 1
+
+ log_test "Maximum descriptors usage. The percentage used is $perc_used%"
+}
+
+trap cleanup EXIT
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS